# In [None]:
# This script processes a dataset, stored in an S3 bucket, containing
# information about broadband availability across the United States.
# Specifically, the data shows the available broadband speeds, the broadband
# infrastructure technologies, and the provider names for broadband systems
# across the more than 11 million census blocks in the United States.
# The data source, file structure (including variable names), and other
# documentation for the broadband dataset are available here:
# https://opendata.fcc.gov/Wireline/Fixed-Broadband-Deployment-Data-June-2017-Status-V/9r8r-g7ut

# findspark.init() locates the local Spark installation and adds PySpark to
# sys.path, so the pyspark imports below work even when PYTHONPATH is not set.
import findspark
findspark.init()

import re
import sys
import spark
from pyspark import SparkContext
from pyspark.sql.types import *
import os
import sys
from pyspark.sql.functions import size

from pyspark.sql.functions import substring, length, col, expr


# schemaString = 'something'

# fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this job, running Spark locally
# under the application name "BroadbandScout".
spark = (
    SparkSession.builder
    .master('local')
    .appName("BroadbandScout")
    .getOrCreate()
)

# Handle to the active SparkContext. getOrCreate() returns the context that is
# already running instead of starting a second one, so one call is enough.
sc = SparkContext.getOrCreate()

# S3 (s3a) location of the broadband deployment CSV.
pathname_input = 's3a://sparkforinsightproject/Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'

# S3 (s3a) location for the processed output.
pathname_output = 's3a://sparkforinsightproject/database_data/sparkdf_broadband_output_2'

def etl_broadband(input_data_txt):
    """Read the broadband deployment CSV and return a trimmed DataFrame.

    The file is parsed with an explicit 17-column schema. Eight of those
    columns are renamed to short snake_case names and kept; the rest are
    discarded. Any row holding a null in one of the kept columns is dropped.

    Uses the module-level ``spark`` session.

    Args:
        input_data_txt: Path or URI of the CSV file, passed straight to
            ``spark.read.csv``.

    Returns:
        A DataFrame with the columns ``dba_name``, ``census_block``,
        ``state``, ``technology``, ``ma_downspeed``, ``ma_upspeed``,
        ``mc_downspeed`` and ``mc_upspeed``, in that order.
    """
    # (column name in the CSV, Spark type) in file order; all are nullable.
    # NOTE(review): the four speed columns are read as integers; a
    # non-integer value would presumably parse as null and the row would then
    # be removed by the null filter at the end -- confirm against the data.
    source_columns = [
        ("Logical Record Number", StringType),
        ("Provider ID", StringType),
        ("FRN", StringType),
        ("Provider Name", StringType),
        ("DBA Name", StringType),
        ("Holding Company Name", StringType),
        ("Holding Company Number", StringType),
        ("Holding Company Final", StringType),
        ("State", StringType),
        ("Census Block FIPS Code", StringType),
        ("Technology Code", StringType),
        ("Consumer", StringType),
        ("Max Advertised Downstream Speed (mbps)", IntegerType),
        ("Max Advertised Upstream Speed (mbps)", IntegerType),
        ("Business", StringType),
        ("Max CIR Downstream Speed (mbps)", IntegerType),
        ("Max CIR Upstream Speed (mbps)", IntegerType),
    ]
    broadband_schema = StructType(
        [StructField(title, kind(), True) for title, kind in source_columns]
    )

    raw_frame = spark.read.csv(
        input_data_txt,
        schema=broadband_schema,
        header=True,
        sep=',',
        quote='"',
        nullValue='NA',
    )

    # (original name, short name) for the columns worth keeping, listed in
    # the order they appear in the returned DataFrame.
    kept_columns = [
        ("DBA Name", "dba_name"),
        ("Census Block FIPS Code", "census_block"),
        ("State", "state"),
        ("Technology Code", "technology"),
        ("Max Advertised Downstream Speed (mbps)", "ma_downspeed"),
        ("Max Advertised Upstream Speed (mbps)", "ma_upspeed"),
        ("Max CIR Downstream Speed (mbps)", "mc_downspeed"),
        ("Max CIR Upstream Speed (mbps)", "mc_upspeed"),
    ]

    renamed_frame = raw_frame
    for source_name, short_name in kept_columns:
        renamed_frame = renamed_frame.withColumnRenamed(source_name, short_name)

    trimmed_frame = renamed_frame.select(
        *[short_name for _, short_name in kept_columns]
    )

    # Drop every row that has a null in any of the kept columns.
    return trimmed_frame.na.drop()

def main():
    """Run the broadband ETL from the command line.

    Reads the input location from ``sys.argv[1]`` and the output location
    from ``sys.argv[2]``, runs ``etl_broadband`` on the input, and writes the
    resulting DataFrame to the output location.

    Raises:
        SystemExit: If fewer than two command-line arguments are supplied.
    """
    if len(sys.argv) < 3:
        # Exit with a usage message instead of an opaque IndexError.
        sys.exit("usage: <script> <input_csv_path> <output_path>")

    input_data_txt = sys.argv[1]
    output_data_txt = sys.argv[2]

    # etl_broadband only transforms the data; persisting it happens here.
    output_df_broadband = etl_broadband(input_data_txt)

    # NOTE(review): the output format is an assumption -- CSV with a header
    # row is used to mirror the input file; confirm what the downstream
    # database load expects.
    output_df_broadband.write.csv(output_data_txt, header=True)


# The guard must sit at module level: nested inside main() it never runs when
# the script is executed, and would recurse if main() were called.
if __name__ == '__main__':
    main()



# output_df_broadband = etl_broadband(pathname_input)


# output_df_broadband.show()