In [None]:
# This script extracts, transforms, and loads the USPS zip code database, keeping nine of its
# columns. The primary value of this dataset for this project is that it links all US zip codes
# to GPS coordinates -- the provided GPS coordinates are the latitude and longitude for the
# center of the given zip code. The other columns kept for each zip code are: the primary city;
# "acceptable cities," a variable which acknowledges that some zip codes match with two or more
# cities; the county; the state; the time zone; and the estimated population in 2015.

# Data was found here: https://www.unitedstateszipcodes.org/zip-code-database/

# If PYTHONPATH is not set up for Spark, findspark.init() locates the Spark installation on
# your machine and makes pyspark importable.
import findspark
findspark.init()

import re
import sys
import spark
from pyspark import SparkContext
from pyspark.sql.types import *
import os
import sys
from pyspark.sql.functions import size

# schemaString = 'something'

# fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

from pyspark.sql import SparkSession
# Spark session used by the ETL code in this file: local master, app name "BroadbandScout".
# getOrCreate() reuses an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master('local')
    .appName("BroadbandScout")
    .getOrCreate()
)

# import pandas as pd

# Handle to the SparkContext (an existing context is reused if there is one).
sc = SparkContext.getOrCreate()


# Source CSV holding the raw zip code database.
input_pathname_ZIP_GPS = 's3a://sparkforinsightproject/usps_zip_code_database.csv'

# Destination prefix for the transformed zip-code-to-GPS data.
output_pathname_ZIP_GPS = "s3a://sparkforinsightproject/database_data/transformed_ZIP_CODES_to_GPS"

# Same location as input_pathname_ZIP_GPS; this is the name main() passes to the ETL.
input_pathname = 's3a://sparkforinsightproject/usps_zip_code_database.csv'


from pyspark.sql.functions import trim

def etl_ZIP_to_GPS(input_pathname):
    """Read the zip code CSV at ``input_pathname`` and return the cleaned DataFrame.

    The file is read through the module-level ``spark`` session as a
    comma-separated CSV with a header row, using an explicit 15-column
    schema. The result keeps nine of those columns, trims whitespace from
    ``county`` and ``acceptable_cities``, and renames three columns:

    * ``zip`` -> ``zip_code``
    * ``irs_estimated_population_2015`` -> ``population_2015``
    * ``acceptable_cities`` -> ``accepted_cities``

    Returns the transformed Spark DataFrame; nothing is written out here.
    """
    # (column name, Spark type) pairs in schema order. Every field is nullable.
    # "zip" is read as a string (presumably to keep leading zeros -- confirm).
    raw_columns = [
        ("zip", StringType),
        ("type", StringType),
        ("decommissioned", StringType),
        ("primary_city", StringType),
        ("acceptable_cities", StringType),
        ("unacceptable_cities", StringType),
        ("state", StringType),
        ("county", StringType),
        ("timezone", StringType),
        ("area_code", StringType),
        ("world_region", StringType),
        ("country", StringType),
        ("latitude", FloatType),
        ("longitude", FloatType),
        ("irs_estimated_population_2015", IntegerType),
    ]
    zip_to_gps_schema = StructType(
        [StructField(name, spark_type(), True) for name, spark_type in raw_columns]
    )

    raw = spark.read.csv(
        input_pathname, sep=',', header=True, schema=zip_to_gps_schema
    )

    # Keep only the nine columns the project needs.
    kept = raw.select(
        'zip',
        'primary_city',
        'acceptable_cities',
        'state',
        'county',
        'timezone',
        'latitude',
        'longitude',
        'irs_estimated_population_2015',
    )

    # Strip surrounding whitespace from the two free-text columns.
    cleaned = kept.withColumn("county", trim(raw["county"]))
    cleaned = cleaned.withColumn(
        "acceptable_cities", trim(raw["acceptable_cities"])
    )

    # Apply the renames in the same order as before.
    renames = (
        ("zip", "zip_code"),
        ("irs_estimated_population_2015", "population_2015"),
        ("acceptable_cities", "accepted_cities"),
    )
    for old_name, new_name in renames:
        cleaned = cleaned.withColumnRenamed(old_name, new_name)

    return cleaned

def main():
    """Run the zip-code-to-GPS ETL and return the resulting DataFrame.

    The input CSV path is taken from the first command-line argument when
    one is supplied; otherwise the module-level ``input_pathname`` default
    is used, so the script also runs with no arguments.

    Returns:
        The DataFrame produced by ``etl_ZIP_to_GPS``.
    """
    # An option-style first argument (e.g. the ``-f`` a Jupyter kernel is
    # launched with) is not a data path, so fall back to the default then.
    if len(sys.argv) > 1 and not sys.argv[1].startswith('-'):
        source_path = sys.argv[1]
    else:
        source_path = input_pathname

    return etl_ZIP_to_GPS(source_path)


# The entry-point guard must sit at module level: nested inside main() it
# never fired when the script was run, and would recurse if main() was called.
if __name__ == '__main__':
    main()

    
# output_df_HPI_to_CENSUS_TRACTS = etl_ZIP_to_GPS(input_pathname_ZIP_GPS)

# output_df_HPI_to_CENSUS_TRACTS.show(3)