In [2]:
# This script processes a dataset available in an S3 bucket conataining information about broadband availability across the United States.            # Specifically, the data shows available broadband speeds,the broadband         # infrastrucure technologies, and the provider names for broadband systems      # across the more than 11 million US census blocks in the United States.
# The data source, file structure including variable names, and other documentation for the broadband dataset is available here: 
# https://opendata.fcc.gov/Wireline/Fixed-Broadband-Deployment-Data-June-2017-Status-V/9r8r-g7ut

# If PYTHONPATH is not set, findspark and findspark.init() will find it on your machine 
import findspark
findspark.init()

import re
import sys
import spark
from pyspark import SparkContext
from pyspark.sql.types import *
import os
import sys
from pyspark.sql.functions import size

from pyspark.sql.functions import substring, length, col, expr


# schemaString = 'something'

# fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .master('local') \
        .appName("BroadbandScout")\
        .getOrCreate()

sc = SparkContext.getOrCreate()


pathname_input = 's3a://sparkforinsightproject/Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'

pathname_output = 's3a://sparkforinsightproject/database_data/sparkdf_broadband_output_2'

sc = SparkContext.getOrCreate()

def etl_broadband(input_data_txt):

    broadband_schema = StructType([
        StructField("Logical Record Number", StringType(), True),
        StructField("Provider ID", StringType(), True),
        StructField("FRN", StringType(), True),
        StructField("Provider Name", StringType(), True),
        StructField("DBA Name", StringType(), True),
        StructField("Holding Company Name", StringType(), True),
        StructField("Holding Company Number", StringType(), True),
        StructField("Holding Company Final", StringType(), True),
        StructField("State", StringType(), True),
        StructField("Census Block FIPS Code", StringType(), True),
        StructField("Technology Code", StringType(), True),
        StructField("Consumer", StringType(), True),
        StructField("Max Advertised Downstream Speed (mbps)", IntegerType(), True),
        StructField("Max Advertised Upstream Speed (mbps)", IntegerType(), True),
        StructField("Business", StringType(), True),
        StructField("Max CIR Downstream Speed (mbps)", IntegerType(), True),
        StructField("Max CIR Upstream Speed (mbps)", IntegerType(), True)
        ])

    df_BROADBAND = spark.read.csv(input_data_txt, quote='"', header=True, sep=',', nullValue='NA', schema=broadband_schema)

    df_BROADBAND = df_BROADBAND\
                               .withColumnRenamed("DBA Name", "dba_name")\
                               .withColumnRenamed("State", "state")\
                               .withColumnRenamed("Census Block FIPS Code", "census_block")\
                               .withColumnRenamed("Technology Code", "technology")\
                               .withColumnRenamed("Max Advertised Downstream Speed (mbps)", "ma_downspeed")\
                               .withColumnRenamed("Max Advertised Upstream Speed (mbps)", "ma_upspeed")\
                               .withColumnRenamed("Max CIR Downstream Speed (mbps)",  "mc_downspeed")\
                               .withColumnRenamed("Max CIR Upstream Speed (mbps)",  "mc_upspeed")


    # This code selects and saves just seven of the 16 columns from the 
    # original file that have some clear potential value for the database.

    df_BROADBAND = df_BROADBAND\
                                       .select(
                                       "dba_name",\
                                       "census_block",\
                                       "state",\
                                       "technology",\
                                       "ma_downspeed",\
                                       "ma_upspeed",\
                                       "mc_downspeed",\
                                       "mc_upspeed")

    return df_BROADBAND

# def main():
#     input_data_txt = sys.argv[1]
#     output_data_txt = sys.argv[2]
#     extract_transform_load_broadband(input_data_txt, output_data_txt)

# if __name__ == '__main__':
#     main()



output_df_broadband = etl_broadband(pathname_input)


output_df_broadband.show()

+--------------------+---------------+-----+----------+------------+----------+------------+----------+
|            dba_name|   census_block|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+--------------------+---------------+-----+----------+------------+----------+------------+----------+
|   unWired Broadband|060770053034016|   CA|        70|          21|         5|         200|       200|
|                  t6|550590029041059|   WI|        70|           5|         1|        1000|      1000|
|   Veracity Networks|490351031002010|   UT|        30|          12|        12|          12|        12|
|Monmouth Telephon...|340155016082005|   NJ|        50|           0|         0|         100|       100|
|Morris Broadband,...|371139706003009|   NC|        42|         100|         5|         100|         5|
|                null|           null| null|      null|        null|      null|        null|      null|
|Great Basin Inter...|320310012012053|   NV|        70|         

In [72]:
# count after dropping null rows

df_na_dropped = output_df_broadband.na.drop()
df_na_dropped.show()

+---------------+-----+----------+------------+----------+------------+----------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+---------------+-----+----------+------------+----------+------------+----------+
|060770053034016|   CA|        70|          21|         5|         200|       200|
|550590029041059|   WI|        70|           5|         1|        1000|      1000|
|490351031002010|   UT|        30|          12|        12|          12|        12|
|340155016082005|   NJ|        50|           0|         0|         100|       100|
|371139706003009|   NC|        42|         100|         5|         100|         5|
|320310012012053|   NV|        70|          12|         3|           0|         0|
|170759506001009|   IL|        70|           5|         1|        1000|      1000|
|490451307011026|   UT|        70|          15|         3|        1000|      1000|
|530770018004053|   WA|        70|          25|        25|        1000|      1000|
|551

In [None]:

output_df_broadband_WA_ONLY.count()

In [14]:
output_df_broadband_WA_ONLY.count()

1378559

## Create Washington only dataframe

In [96]:

df_BROADBAND_WA_ONLY = output_df_broadband.filter(output_df_broadband["state"] == "WA")
    

df_BROADBAND_WA_ONLY.show()

+---------------+-----+----------+------------+----------+------------+----------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+---------------+-----+----------+------------+----------+------------+----------+
|530770018004053|   WA|        70|          25|        25|        1000|      1000|
|530330304013022|   WA|        70|           0|         0|        1000|      1000|
|530439604003069|   WA|        70|          25|        25|        1000|      1000|
|530610402002017|   WA|        70|          25|        25|        1000|      1000|
|530730102002089|   WA|        70|          25|        25|        1000|      1000|
|530750009003056|   WA|        70|          25|        25|        1000|      1000|
|530330029003005|   WA|        70|           0|         0|        1000|      1000|
|530330320104003|   WA|        70|           0|         0|        1000|      1000|
|530250109011002|   WA|        70|          25|        25|        1000|      1000|
|530

In [21]:
df_BROADBAND_WA_ONLY.count()

1378559

In [110]:
# Change technology column to 1 or 0

# from pyspark.sql.functions import UserDefinedFunction
# from pyspark.sql.types import StringType

# changing_column = 'technology'
# udf_first = UserDefinedFunction(lambda x: 1, IntegerType())
# udf_second = UserDefinedFunction(lambda x: 0, IntegerType())
# first_df = df_BROADBAND_WA_ONLY.select(*[udf_first(changing_column) if column == 50 else column for column in df_BROADBAND_WA_ONLY])
# second_df = first_df.select(*[udf_second(changing_column) if column != 50 else column for column in df_BROADBAND_WA_ONLY])
# second_df.show()

new_df = 

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

## Washington Only with null values removed


In [52]:
from pyspark.sql.functions import isnan

df_BROADBAND_WA_ONLY_NONULL = df_BROADBAND_WA_ONLY.filter(isnan(df_BROADBAND_WA_ONLY["ma_downspeed"]))
df_BROADBAND_WA_ONLY_NONULL.count()

0

In [55]:
# # Count null values

# from pyspark.sql.functions import isnan, when, count, col

# df_test = df_BROADBAND_WA_ONLY - df_BROADBAND_WA_ONLY.select([count(when(isnan(c), c)).alias(c) for c in df_BROADBAND_WA_ONLY.columns]).show()

# df_test.show()

+------------+-----+----------+------------+----------+------------+----------+
|census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+------------+-----+----------+------------+----------+------------+----------+
|           0|    0|         0|           0|         0|           0|         0|
+------------+-----+----------+------------+----------+------------+----------+



TypeError: unsupported operand type(s) for -: 'DataFrame' and 'NoneType'

In [38]:
from pyspark.sql.functions import col, isnan, when, trim

def to_null(c):
    return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))


df_BROADBAND_WA_ONLY.select([to_null(c).alias(c) for c in df_BROADBAND_WA_ONLY.columns]).na.drop().show()


+---------------+-----+----------+------------+----------+------------+----------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+---------------+-----+----------+------------+----------+------------+----------+
|530770018004053|   WA|        70|          25|        25|        1000|      1000|
|530330304013022|   WA|        70|           0|         0|        1000|      1000|
|530439604003069|   WA|        70|          25|        25|        1000|      1000|
|530610402002017|   WA|        70|          25|        25|        1000|      1000|
|530730102002089|   WA|        70|          25|        25|        1000|      1000|
|530750009003056|   WA|        70|          25|        25|        1000|      1000|
|530330029003005|   WA|        70|           0|         0|        1000|      1000|
|530330320104003|   WA|        70|           0|         0|        1000|      1000|
|530250109011002|   WA|        70|          25|        25|        1000|      1000|
|530

# Additional code for filtering null values
| df_BROADBAND_WA_ONLY["ma_downspeed"].isNull() | isnan(df_BROADBAND_WA_ONLY["ma_downspeed"])


# Remove some number of characters from column

In [78]:
df_na_dropped.printSchema.show()

<bound method DataFrame.printSchema of DataFrame[census_tract: string, state: string, technology: string, ma_downspeed: int, ma_upspeed: int, mc_downspeed: int, mc_upspeed: int]>

In [76]:

# Removes last 4 characters of census_tract_column:

from pyspark.sql.functions import substring, length, col, expr

# # def remove_if_fifteen(input_df):

# # df_BROADBAND_WA_ONLY = df_BROADBAND_WA_ONLY.withColumn("census_tract",expr("substring(census_tract, 1, length(census_tract)-4)"))when(len(df_BROADBAND_WA_ONLY["session"]) == 15)

# df = df_BROADBAND_WA_ONLY.withColumn("census_tract",expr("substring(census_tract, 1, length(census_tract)-4)"))

df_eleven_tract_WA = df_BROADBAND_WA_ONLY.withColumn("census_tract_11",expr("substring(census_tract, 1, 11)"))

# # df_BROADBAND_WA_ONLY.show()

# df = output_df_broadband.withColumn('census_tract', output_df_broadband.substring(expr.col('census_tract'), 1, 11))
# df = output_df_broadband.withColumn('census_tract', df['census_tract'].substr(1, 11))

# from pyspark.sql.functions import substring, length

# df = df_na_dropped.withColumn("census_tract", when(length(df_na_dropped.census_tract) > 11, substring(df_na_dropped.census_tract, 1, length(df_na_dropped.census_tract) - 4)).otherwise(df_na_dropped.census_tract))

# df = df_na_dropped.substring(df_na_dropped.col('census_tract'), 1, 11) in place of df_na_dropped['census_tract'].substr(1, 11)
df_eleven_tract_WA.show()

+---------------+-----+----------+------------+----------+------------+----------+---------------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|census_tract_11|
+---------------+-----+----------+------------+----------+------------+----------+---------------+
|530770018004053|   WA|        70|          25|        25|        1000|      1000|    53077001800|
|530330304013022|   WA|        70|           0|         0|        1000|      1000|    53033030401|
|530439604003069|   WA|        70|          25|        25|        1000|      1000|    53043960400|
|530610402002017|   WA|        70|          25|        25|        1000|      1000|    53061040200|
|530730102002089|   WA|        70|          25|        25|        1000|      1000|    53073010200|
|530750009003056|   WA|        70|          25|        25|        1000|      1000|    53075000900|
|530330029003005|   WA|        70|           0|         0|        1000|      1000|    53033002900|
|530330320

In [124]:
# sqlContext.registerDataFrameAsTable(df_eleven_tract_WA, "broadband_table")

# df_broadband_WA = sqlContext.sql("SELECT census_tract AS census_block, technology, mc_downspeed, ma_downspeed, census_tract_11 as census_tract from broadband_table")
# df_broadband_WA.show(5)

from pyspark.sql.functions import when

df_eleven_tract_WA\
.select("*", when(df_broadband_WA.technology == 50, 1)\
        .otherwise(0)\
        .alias("tech_binary"))\
.show()


+---------------+-----+----------+------------+----------+------------+----------+---------------+-----------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|census_tract_11|tech_binary|
+---------------+-----+----------+------------+----------+------------+----------+---------------+-----------+
|530770018004053|   WA|        70|          25|        25|        1000|      1000|    53077001800|          0|
|530330304013022|   WA|        70|           0|         0|        1000|      1000|    53033030401|          0|
|530439604003069|   WA|        70|          25|        25|        1000|      1000|    53043960400|          0|
|530610402002017|   WA|        70|          25|        25|        1000|      1000|    53061040200|          0|
|530730102002089|   WA|        70|          25|        25|        1000|      1000|    53073010200|          0|
|530750009003056|   WA|        70|          25|        25|        1000|      1000|    53075000900|          0|
|

In [118]:
df_broadband_WA = sqlContext.sql(
                "SELECT census_block, technology, mc_downspeed, ma_downspeed, census_tract_11 as census_tract CASE WHEN technology = 50 THEN 'FIBER' ELSE Quantity = 'NOT FIBER' FROM broadband_table")
df_broadband_WA.show(5)

ParseException: u"\nmismatched input 'CASE' expecting <EOF>(line 1, pos 93)\n\n== SQL ==\nSELECT census_block, technology, mc_downspeed, ma_downspeed, census_tract_11 as census_tract CASE WHEN technology = 50 THEN 'FIBER' ELSE Quantity = 'NOT FIBER' FROM broadband_table\n---------------------------------------------------------------------------------------------^^^\n"

In [90]:


df_broadband_WA.select("census_tract").distinct().show()



+------------+
|census_tract|
+------------+
| 53073000502|
| 53053061001|
| 53063003200|
| 53077003002|
| 53033032007|
| 53011041331|
| 53015001300|
| 53067012211|
| 53077002200|
| 53021020100|
| 53063001100|
| 53065951300|
| 53061052607|
| 53011040703|
| 53035091204|
| 53033003000|
| 53025010100|
| 53011040710|
| 53047970600|
| 53061053700|
+------------+
only showing top 20 rows



In [87]:
df_broadband_WA.count()

1378559

In [125]:
mode = "overwrite"
database_name = 'broadband_scoutdb'
hostname = 'ec2-54-186-79-112.us-west-2.compute.amazonaws.com'
url = "jdbc:postgresql://{hostname}:5432/{db}".format(hostname=hostname, db=database_name)
properties = {"user": "postgres","password": "postgres","driver": "org.postgresql.Driver"}
df_eleven_tract_WA.write.jdbc(url=url, table="broadband_table_WA", properties=properties)

## COUNT the distinct values in Washington Only for ma_downspeed with null values removed


In [35]:

from pyspark.sql.functions import col, countDistinct

count_distinct_vals = df_BROADBAND_WA_ONLY.agg(*(countDistinct(col(ma_upspeed)).alias(ma_upspeed) for ma_upspeed in df_BROADBAND_WA_ONLY.columns))

count_distinct_vals.show()

+------------+-----+----------+------------+----------+------------+----------+
|census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+------------+-----+----------+------------+----------+------------+----------+
|      195574|    1|        12|          35|        25|          49|        44|
+------------+-----+----------+------------+----------+------------+----------+



In [56]:
#DOWNSPEED
count_distinct_vals = df_BROADBAND_WA_ONLY.agg(*(countDistinct(col(mc_downspeed)).alias(mc_downspeed) for ma_upspeed in df_BROADBAND_WA_ONLY.columns))

count_distinct_vals.show()

NameError: global name 'countDistinct' is not defined

In [52]:
# Count length of census tract column to check if all values are 15 or if some are 11 already, creates new column tract_length
# all lengths of the census tract column

# df_count_tract_col_lengths = output_df_broadband.select('*',size(output_df_broadband['census_tract']).alias('tract_length'))


In [None]:
df_BROADBAND_WA_ONLY['technology'] = df_BROADBAND_WA_ONLY['technology'].replace("50", "1")
df_BROADBAND_WA_ONLY['technology'] = df_BROADBAND_WA_ONLY['technology'].replace(!="50", "1")


In [59]:
df_BROADBAND_WA_ONLY.show(10)

+---------------+-----+----------+------------+----------+------------+----------+
|   census_tract|state|technology|ma_downspeed|ma_upspeed|mc_downspeed|mc_upspeed|
+---------------+-----+----------+------------+----------+------------+----------+
|530770018004053|   WA|        70|          25|        25|        1000|      1000|
|530330304013022|   WA|        70|           0|         0|        1000|      1000|
|530439604003069|   WA|        70|          25|        25|        1000|      1000|
|530610402002017|   WA|        70|          25|        25|        1000|      1000|
|530730102002089|   WA|        70|          25|        25|        1000|      1000|
|530750009003056|   WA|        70|          25|        25|        1000|      1000|
|530330029003005|   WA|        70|           0|         0|        1000|      1000|
|530330320104003|   WA|        70|           0|         0|        1000|      1000|
|530250109011002|   WA|        70|          25|        25|        1000|      1000|
|530

# Group broadband table by shortened 11 character census tract, and get median value of mc_downspeed


In [63]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df_eleven_tract_WA.registerTempTable("broadband_table_wa")
df_BROADBAND_WA_grouped = sqlContext.sql("select census_tract, technology, percentile_approx(mc_downspeed, 0.5) as med_mc_downspeed from broadband_table_wa group by census_tract, technology")

df_BROADBAND_WA_grouped.show(10)

+------------+----------+----------------+
|census_tract|technology|med_mc_downspeed|
+------------+----------+----------------+
| 53005010201|        70|             100|
| 53033021700|        70|            1000|
| 53033025302|        70|            1000|
| 53053062400|        70|            1000|
| 53053063400|        42|               0|
| 53053072112|        42|               0|
| 53063000500|        70|               5|
| 53063010202|        70|              30|
| 53073000400|        70|              25|
| 53073010301|        70|            1000|
+------------+----------+----------------+
only showing top 10 rows



In [64]:
df_BROADBAND_WA_grouped.count()

11228

In [None]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df_BROADBAND_WA_grouped.registerTempTable("broadband_table_wa")
df_BROADBAND_WA_grouped = sqlContext.sql("select census_tract, percentile_approx(mc_downspeed, 0.5) as med_mc_downspeed from broadband_table_wa group by census_tract")

df_BROADBAND_WA_grouped.show(10)

In [60]:
from pyspark.sql.functions import regexp_replace, col

pathname_input_TRACT_ZIP = 's3a://sparkforinsightproject/TRACT_ZIP_122017.csv'


def etl_tract_to_zip(input_TRACT_ZIP_csv):
    
    census_zip_schema = StructType([
    StructField('tract', StringType(), True),
    StructField("zip", StringType(), True),
    StructField("bus_ratio", FloatType(), True)
        ])

    df_TRACT_ZIP = spark.read.csv(input_TRACT_ZIP_csv, header=True, schema=census_zip_schema, sep=',', nullValue='NA')\
                                .withColumnRenamed("tract", "census_tract")

    return df_TRACT_ZIP


df_TRACT_ZIP_output = etl_tract_to_zip(pathname_input_TRACT_ZIP)  

In [75]:
# Join broaband_table_WA with ct to zip code data

# left_join_BROADBAND_ZIP_ADDED = df_eleven_tract_WA.join(df_TRACT_ZIP_output, df_eleven_tract_WA.census_tract == df_TRACT_ZIP_output.census_tract,how='left') # Could also use 'left_outer'
# left_join_BROADBAND_ZIP_ADDED.show()


# left_join_BROADBAND_ZIP_ADDED = df_BROADBAND_WA_grouped.join(df_TRACT_ZIP_output,[df_BROADBAND_WA_grouped.census_tract==df_TRACT_ZIP_output.census_tract, df_BROADBAND_WA_grouped.census_tract==df_TRACT_ZIP_output.census_tract],'left')

left_join_BROADBAND_ZIP_ADDED = df_broadband_WA.join(df_TRACT_ZIP_output, df_BROADBAND_WA_grouped.census_tract == df_TRACT_ZIP_output.census_tract)

left_join_BROADBAND_ZIP_ADDED.show()

+------------+----------+----------------+------------+-----+-----------+
|census_tract|technology|med_mc_downspeed|census_tract|  zip|  bus_ratio|
+------------+----------+----------------+------------+-----+-----------+
| 53011041331|        11|               0| 53011041331|98682|        1.0|
| 53011041331|        60|               0| 53011041331|98682|        1.0|
| 53011041331|        30|              10| 53011041331|98682|        1.0|
| 53011041331|        70|            1000| 53011041331|98682|        1.0|
| 53011041331|        43|               0| 53011041331|98682|        1.0|
| 53011041331|        50|               0| 53011041331|98682|        1.0|
| 53011041331|        12|               0| 53011041331|98682|        1.0|
| 53015001300|        43|               0| 53015001300|98626|        1.0|
| 53015001300|        50|            1000| 53015001300|98626|        1.0|
| 53015001300|        60|               0| 53015001300|98626|        1.0|
| 53015001300|        11|             

In [74]:
http://southpark.cc.com/full-episodes/s22e09-unfulfilled#source=2b6c5ab4-d717-4e84-9143-918793a3b636:63a32034-1ea6-492d-b95b-9433e3f62f8d&position=2&sort=airdate.show(5)

+------------+----------+----------------+------------+-----+---------+
|census_tract|technology|med_mc_downspeed|census_tract|  zip|bus_ratio|
+------------+----------+----------------+------------+-----+---------+
| 53011041331|        11|               0| 53011041331|98682|      1.0|
| 53011041331|        60|               0| 53011041331|98682|      1.0|
| 53011041331|        30|              10| 53011041331|98682|      1.0|
| 53011041331|        70|            1000| 53011041331|98682|      1.0|
| 53011041331|        43|               0| 53011041331|98682|      1.0|
+------------+----------+----------------+------------+-----+---------+
only showing top 5 rows



In [71]:

    
    
    # If PYTHONPATH is not set, findspark and findspark.init() will find it on your machine 
import findspark
findspark.init()

import re
import sys
import spark
from pyspark import SparkContext
from pyspark.sql.types import *
import os
import sys
# schemaString = 'something'

# fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .master('local') \
        .appName("BroadbandScout")\
        .getOrCreate()

from pyspark.sql.functions import regexp_replace, col

# import pandas as pd

sc = SparkContext.getOrCreate()


pathname_input = 's3a://sparkforinsightproject/HPI_AT_BDL_tract.csv'

pathname_output = 's3a://sparkforinsightproject/database_data/sparkdf_broadband_output_2'

sc = SparkContext.getOrCreate()

def etl_hpi_to_tract(input_data_txt):
    

    rdd_HPI_to_CENSUS = sc.textFile(input_data_txt)

    # This file is a comma separated file and has some commas embedded within string objects within some of the columns. The file is first read in to       # an RDD and this function is used in a map transformation (below) to           # remove those commas.

#     def remove_commas(row):

#         row = re.sub('(?!(([^"]*"){2})*[^"]*$),', '', row)

#         return row

#     # The first 11 numbers of a census block number match the 11 numbers in a census tract number. While the census block numbers in this file are 15       # digits long, the other datasets to be joined with this file for the           # database (containing housing data and zip codes) only have the 11 digit       # census tract data. Therefore, this function is used in a map                  # transformation (below) to keep only the first 11 numbers of the census        # block numbers. This column will be used to join this file by other files      # containing census tract numbers.


#     def reduce_census_block_code(row):

#         row[9] = row[9][:11]

#         return row

#     # This block removes the header, which is necessary for implementing the code below.

# #    header = rdd_BROADBAND.first()

# #    rdd_BROADBAND = rdd_BROADBAND.filter(lambda line: line != header)


#     # This block of code removes ascii unicode information, removes commas embedded in within strings in some of the columns, splits the RDD object      # on the commas, and reduces the length of the census blocks from 15 to 11      # digits.

    rdd_HPI_to_TRACT = rdd_HPI_to_CENSUS\
                                   .map(lambda x: x.encode('ascii', 'ignore'))\
                                   .map(lambda x: x.split(','))\
                                   .map(lambda l: (l[0], l[1], l[2], l[4]))

    

    hpi_schema = StructType([
        StructField("tract", StringType(), True),
        StructField("state_abbr", StringType(), True),
        StructField("year", StringType(), True),
        StructField("annual_change", StringType(), True),
        StructField("hpi", FloatType(), True),
        StructField("hpi1990", FloatType(), True),
        StructField("hpi2000", FloatType(), True)
        ])


    df_HPI_to_TRACT = spark.read.csv(input_data_txt, header=True, schema=hpi_schema)#\
#                         .withColumn("('tract'",regexp_replace(col("('tract'"), "\(", ""))\
#                         .withColumn("'hpi')",regexp_replace(col("'hpi')"), "\)", "")) 
                                        
    
    #, quote='"', header=True, mode="DROPMALFORMED", sep=',', nullValue='NA', schema=broadband_schema

#     df_transformed_BROADBAND = spark.read.csv(rdd_transformed_BROADBAND, header=False,
#                                               mode="DROPMALFORMED", sep=',')

    # This code renames the columns I want to keep.

    df_HPI_to_TRACT = df_HPI_to_TRACT\
                               .withColumnRenamed("tract", "census_tract")\
                               .withColumnRenamed("state_abbr", "state")


    # This code selects and saves just seven of the 16 columns from the 
    # original file that have some clear potential value for the database.

    df_HPI_to_TRACT = df_HPI_to_TRACT\
                                       .select(
                                       "census_tract",
                                       "state",
                                       "year",
                                       "hpi")
#                                        "ma_downspeed",\
#                                        "ma_upspeed",\
#                                        "mc_downspeed",\
#                                        "mc_upspeed")

    return df_HPI_to_TRACT

# def main():
#     input_data_txt = sys.argv[1]
#     output_data_txt = sys.argv[2]
#     extract_transform_load_broadband(input_data_txt, output_data_txt)

# if __name__ == '__main__':
#     main()




output_hpi_to_tract = etl_hpi_to_tract(pathname_input)


df_hpi_tract_2017_wa = output_hpi_to_tract.where(col('year').isin({'2017'})).where(col('state').isin({'WA'}))

df_hpi_tract_2017_wa.show(5)

+------------+-----+----+------+
|census_tract|state|year|   hpi|
+------------+-----+----+------+
| 53001950100|   WA|2017|121.51|
| 53001950300|   WA|2017|250.55|
| 53001950400|   WA|2017|204.59|
| 53001950500|   WA|2017|218.37|
| 53003960100|   WA|2017|258.93|
+------------+-----+----+------+
only showing top 5 rows



In [73]:

inner_join_broadband_hpi = left_join_BROADBAND_ZIP_ADDED.join(df_hpi_tract_2017_wa,[left_join_BROADBAND_ZIP_ADDED.\
            census_tract==df_hpi_tract_2017_wa.census_tract, left_join_BROADBAND_ZIP_ADDED.census_tract==df_hpi_tract_2017_wa.census_tract],'inner')


AnalysisException: u"Reference 'census_tract' is ambiguous, could be: broadband_table_wa.census_tract, census_tract.;"

In [None]:
left_join_BROADBAND_ZIP_ADDED

In [67]:
mode = "overwrite"
database_name = 'broadband_scoutdb'
hostname = 'ec2-54-186-79-112.us-west-2.compute.amazonaws.com'
url = "jdbc:postgresql://{hostname}:5432/{db}".format(hostname=hostname, db=database_name)
properties = {"user": "postgres","password": "postgres","driver": "org.postgresql.Driver"}
left_join_BROADBAND_ZIP_ADDED.write.jdbc(url=url, table="broadband_table_zip", properties=properties)

In [None]:










# mode = "overwrite"
# database_name = 'broadband_scoutdb'
# hostname = 'ec2-54-186-79-112.us-west-2.compute.amazonaws.com'
# url = "jdbc:postgresql://{hostname}:5432/{db}".format(hostname=hostname, db=database_name)
# properties = {"user": "postgres","password": "postgres","driver": "org.postgresql.Driver"}
# broadband_output.write.jdbc(url=url, table="broadband_table", mode=mode, properties=properties)

Py4JJavaError: An error occurred while calling o78.jdbc.
: java.lang.ClassNotFoundException: org.postgresql.Driver
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:45)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:35)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:60)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:499)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
mode = "overwrite"
url = "jdbc:postgresql://198.123.43.24:5432/kockpit"


In [11]:
broadband_output.write.jdbc(url=url, table="broadband_table", mode=mode, properties=properties)

Py4JJavaError: An error occurred while calling o302.jdbc.
: java.lang.ClassNotFoundException: org.postgresql.Driver
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:45)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:35)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:60)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:499)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [12]:
type(broadband_output)

pyspark.sql.dataframe.DataFrame

In [11]:

from pyspark.sql import (SparkSession, 
                         functions as F,
                        )
import postgres
import argparse

DEBUG = False

pathname_input = 's3a://sparkforinsightproject/Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'



class transform_BROADBAND_data(object):
    def __init__(self, file_name):
        self.spark = SparkSession \
            .builder \
            .master('local') \
            .appName("BroadbandScout") \
            .getOrCreate()
        
        broadband_schema = StructType([
            StructField("Logical Record Number", StringType(), True),
            StructField("Provider ID", StringType(), True),
            StructField("FRN", StringType(), True),
            StructField("Provider Name", StringType(), True),
            StructField("DBA Name", StringType(), True),
            StructField("Holding Company Name", StringType(), True),
            StructField("Holding Company Number", StringType(), True),
            StructField("Holding Company Final", StringType(), True),
            StructField("State", StringType(), True),
            StructField("Census Block FIPS Code", StringType(), True),
            StructField("Technology Code", StringType(), True),
            StructField("Consumer", StringType(), True),
            StructField("Max Advertised Downstream Speed (mbps)", IntegerType(), True),
            StructField("Max Advertised Upstream Speed (mbps)", IntegerType(), True),
            StructField("Business", StringType(), True),
            StructField("Max CIR Downstream Speed (mbps)", IntegerType(), True),
            StructField("Max CIR Upstream Speed (mbps)", IntegerType(), True)
            ])


    input_file_directory = 's3a://sparkforinsightproject/'
    input_filename = 'Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'
   
    def read_BROADBAND_csv(self):
        file_name = "{0}/{file_name}".format(file_directory, input_filename)
        return self.spark.read.csv(input_data_txt, quote='"', header=True,
                                     mode="DROPMALFORMED", sep=',', nullValue='NA', schema=broadband_schema)


    def df_transformer(self):
        df_transformed_BROADBAND = df_transformed_BROADBAND\
           .withColumnRenamed("State", "state")\
           .withColumnRenamed("Census Block FIPS Code", "census_tract")\
           .withColumnRenamed("Technology Code", "technology")\
           .withColumnRenamed("Max Advertised Downstream Speed (mbps)", "ma_downspeed")\
           .withColumnRenamed("Max Advertised Upstream Speed (mbps)", "ma_upspeed")\
           .withColumnRenamed("Max CIR Downstream Speed (mbps)",  "mc_downspeed")\
           .withColumnRenamed("Max CIR Upstream Speed (mbps)",  "mc_upspeed")


        # This code selects and saves just seven of the 16 columns from the original 
        # file that have some clear potential value for the database.

        df_transformed_BROADBAND = df_transformed_BROADBAND\
           .select(
           "census_tract",\
           "state",\
           "technology",\
           "ma_downspeed",\
           "ma_upspeed",\
           "mc_downspeed",\
           "mc_upspeed")
        
        return df_transformed_BROADBAND
    
    def write_to_postgresql(self, out_df):
        table = "broadband_table"
        mode = "append"
        
        connector = postgres.PostgresConnector()
        connector.write(out_df, table, mode)

    def run(self):
        csv_df = self.read_BROADBAND_csv()
        csv_df.printSchema()
        
        out_df = self.df_transformer(csv_df)
        out_df.printSchema()
        
        self.write_to_postgresql(out_df)
        

def run():
    parser = argparse.ArgumentParser()
#         parser.add_argument("--debug", help="debug mode, loads small test file.", action="store_true")
#         parser.add_argument("--file", help="file name to process")
    args = parser.parse_args()

    file_name = args.file if args.file else input_filename
    proc = transform_BROADBAND_data(file_name)
    proc.run()

run()

usage: ipykernel_launcher.py [-h]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-f8b3444c-69c1-494c-9b27-6f89a0a0e471.json


SystemExit: 2

In [5]:
%tb

SystemExit: 2

In [3]:
# This cell creates "BroadbandScout" as my broadband_table for the BroadbandScout project.

broadband_output.createOrReplaceTempView("broadband_table")


In [3]:
print('g')

g


In [4]:

#Pandas select all columns from the broadband dataframe and creates a pyspark.sql.dataframe.Dataframe

broadband_full_table = spark.sql('SELECT * \
FROM broadband_table')

# broadband_full_table.show(5)

In [5]:
import pandas as pd

pd_df_broadband = pd.read_csv(broadband_full_table)

ValueError: Invalid file path or buffer object type: <class 'pyspark.sql.dataframe.DataFrame'>

In [6]:
# Converts pyspark.sql.dataframe to a Pandas dataframe

pd_df_broadband = broadband_full_table.toPandas()



Py4JJavaError: An error occurred while calling o120.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 47 in stage 1.0 failed 1 times, most recent failure: Lost task 47.0 in stage 1.0 (TID 48, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1853)
	at java.io.ObjectOutputStream.write(ObjectOutputStream.java:709)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:247)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply$mcV$sp(TaskResult.scala:50)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply(TaskResult.scala:48)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply(TaskResult.scala:48)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1346)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:48)
	at java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1459)
	at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1430)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:454)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3195)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1853)
	at java.io.ObjectOutputStream.write(ObjectOutputStream.java:709)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:247)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply$mcV$sp(TaskResult.scala:50)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply(TaskResult.scala:48)
	at org.apache.spark.scheduler.DirectTaskResult$$anonfun$writeExternal$1.apply(TaskResult.scala:48)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1346)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:48)
	at java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1459)
	at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1430)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:454)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
pd_df_broadband.head(5)

In [50]:
# This code only works with a pandas dataframe, not a spark dataframe

from sqlalchemy import create_engine
engine = create_engine('postgresql://{0}:{1}@{2}:5432/{3}'\
        .format('postgres', 'postgres', 'ec2-54-186-79-112.us-west-2.compute.amazonaws.com', 'broadband_scoutDB'))
nto.to_sql('broadband_table', engine)

# from sqlalchemy import create_engine
# engine = create_engine('postgresql://username:password@localhost:5432/name_of_db')
# df.to_sql('table_name', engine)

# import pymysql
# from sqlalchemy import create_engine
# cnx = create_engine('mysql+pymysql://[user]:[pass]@[host]:[port]/[schema]', echo=False)

# data = pd.read_sql('SELECT * FROM sample_table', cnx)
# data.to_sql(name='sample_table2', con=cnx, if_exists = 'append', index=False)


# import mysql.connector
# from sqlalchemy import create_engine

# engine = create_engine('mysql+mysqlconnector://[user]:[pass]@[host]:[port]/[schema]', echo=False)
# data.to_sql(name='sample_table2', con=engine, if_exists = 'append', index=False)


AttributeError: 'NoneType' object has no attribute 'to_sql'

In [None]:
import postgres

pathname_input = 's3a://sparkforinsightproject/Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'

table = "broadband_table"
mode = "append"



input_directory = 's3a://sparkforinsightproject/'

input_filename = 'Fixed_Broadband_Deployment_Data__June__2017_Status_V1.csv'


connector = postgres.PostgresConnector()
connector.write(out_df, table, mode)

parser = argparse.ArgumentParser()
args = parser.parse_args()

file_name = args.file if args.file else input_filename



In [None]:
import pymysql
from sqlalchemy import create_engine
cnx = create_engine('mysql+pymysql://[user]:[pass]@[host]:[port]/[schema]', echo=False)

data = pd.read_sql('SELECT * FROM sample_table', cnx)
data.to_sql(name='sample_table2', con=cnx, if_exists = 'append', index=False)

In [50]:
broadband_full_table.head(10)

[Row(census_tract=u'060770053034016', state=u'CA', technology_code=u'70', max_adver_downstr_speed=u'21', max_adver_upstr_speed=u'5', max_cir_downstr_speed=u'200', max_cir_upstr_speed=u'200'),
 Row(census_tract=u'550590029041059', state=u'WI', technology_code=u'70', max_adver_downstr_speed=u'5', max_adver_upstr_speed=u'1', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'490351031002010', state=u'UT', technology_code=u'30', max_adver_downstr_speed=u'12', max_adver_upstr_speed=u'12', max_cir_downstr_speed=u'12', max_cir_upstr_speed=u'12'),
 Row(census_tract=u'340155016082005', state=u'NJ', technology_code=u'50', max_adver_downstr_speed=u'0', max_adver_upstr_speed=u'0', max_cir_downstr_speed=u'100', max_cir_upstr_speed=u'100'),
 Row(census_tract=u'371139706003009', state=u'NC', technology_code=u'42', max_adver_downstr_speed=u'100', max_adver_upstr_speed=u'5', max_cir_downstr_speed=u'100', max_cir_upstr_speed=u'5'),
 Row(census_tract=u'360210004011070', stat

In [51]:
broadband_full_table_ct11 = spark.sql(
    'UPDATE BroadbandScout \
    SET max_adver_upstream_speed=LEFT(max_adver_upstream_speed, LEN(max_adver_upstream_speed)-4)')



ParseException: u"\nmismatched input 'UPDATE' expecting {'(', 'SELECT', 'FROM', 'ADD', 'DESC', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'MAP', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'REDUCE', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'DFS', 'TRUNCATE', 'ANALYZE', 'LIST', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'EXPORT', 'IMPORT', 'LOAD'}(line 1, pos 0)\n\n== SQL ==\nUPDATE BroadbandScout     SET max_adver_upstream_speed=LEFT(max_adver_upstream_speed, LEN(max_adver_upstream_speed)-4)\n^^^\n"

In [49]:
broadband_full_table_ct11.head(10)

NameError: name 'broadband_full_table_ct11' is not defined

In [37]:
#Pandas select all columns ONLY WHERE state = WA from the broadband dataframe

broadband_full_table = spark.sql('SELECT * \
FROM BroadbandScout \
WHERE state = "WA"')

In [38]:
broadband_full_table.head(30)

[Row(census_tract=u'530770018004053', state=u'WA', technology_code=u'70', max_adver_downstr_speed=u'25', max_adver_upstr_speed=u'25', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'530330304013022', state=u'WA', technology_code=u'70', max_adver_downstr_speed=u'0', max_adver_upstr_speed=u'0', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'530439604003069', state=u'WA', technology_code=u'70', max_adver_downstr_speed=u'25', max_adver_upstr_speed=u'25', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'530610402002017', state=u'WA', technology_code=u'70', max_adver_downstr_speed=u'25', max_adver_upstr_speed=u'25', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'530730102002089', state=u'WA', technology_code=u'70', max_adver_downstr_speed=u'25', max_adver_upstr_speed=u'25', max_cir_downstr_speed=u'1000', max_cir_upstr_speed=u'1000'),
 Row(census_tract=u'5307500

In [None]:
spark

In [1]:
print('hello world')

hello world


In [35]:
# Get average value of max_adver_downstr_speed

                                                                                             
broadband_results = spark.sql('SELECT avg(max_adver_downstr_speed) \
FROM BroadbandScout \
WHERE state = "WA"')

pd_df_broadband = broadband_results.toPandas()




In [None]:
# Get max_adver_downstr_speed and housing values joined by census tract number

broadband_output.createOrReplaceTempView("BroadbandScout")
                                                                                             
broadband_results = spark.sql('SELECT avg(max_adver_downstr_speed) \
FROM BroadbandScout \
WHERE state = "WA"')

pd_df_broadband = broadband_results.toPandas()


In [2]:
# pd_df_broadband = pd_df_broadband.sort_values('max_adver_downstr_speed')

pd_df_broadband.head(20)

NameError: name 'pd_df_broadband' is not defined

In [1]:



import psycopg2


class PostgresConnector(object):
    def __init__(self):
        self.database_name = 'broadband_scoutdb'
        self.hostname = 'ec2-54-186-79-112.us-west-2.compute.amazonaws.com'
        self.url_connect = "jdbc:postgresql://{hostname}:5432/{db}".format(hostname=self.hostname, db=self.database_name)
        self.properties = {"user":"thomas_ernste",
                      "driver": "org.postgresql.Driver"
                     }
    def get_writer(self, df):
        return DataFrameWriter(df)
        
    def write(self, df, table, mode):
        my_writer = self.get_writer(df)
        my_writer.jdbc(self.url_connect, table, mode, self.properties)



In [None]:
print(my_database)

go


In [None]:
 
# "password":os.environ['POSTGRES_PASS'],





conn = psycopg2.connect(host = "ec2-54-186-79-112.us-west-2.compute.amazonaws.com",\
database="broadband_scoutdb",/
user='thomas_ernste', 
password=')

#cursor
cur = con.cursor()

#example 1 execute a query
cur.execute("insert into employees ")

#example 1 execute a query
$execute query
cur.execute('SELECT * FROM broadband_table')


rows = cur.fetchall()

for r in rows:
    print(f"id {r[0]} name {r[1]}")

#close the cursor
cur = con.cursor()

#close the connection
conn.close()

In [None]:

#!/usr/bin/env python
# coding: utf-8

from pyspark.sql import DataFrameWriter
import os

class PostgresConnector(object):
    def __init__(self):
        self.database_name = 'occupancy'
        self.hostname = 'ec2-52-39-242-144.us-west-2.compute.amazonaws.com'
        self.url_connect = "jdbc:postgresql://{hostname}:5432/{db}".format(hostname=self.hostname, db=self.database_name)
        self.properties = {"user":"spark_user", 
                      "password":os.environ['POSTGRES_PASS'],
                      "driver": "org.postgresql.Driver"
                     }
    def get_writer(self, df):
        return DataFrameWriter(df)
        
    def write(self, df, table, mode):
        my_writer = self.get_writer(df)
        my_writer.jdbc(self.url_connect, table, mode, self.properties)

In [None]:
import psycopg2

my_ec2 = 'ec2-52-39-242-144.us-west-2.compute.amazonaws.com'

conn = psycopg2.connect(host=my_ec2, database='broadband_scoutDB',
                       user='postgres', password='postgres')



In [None]:
#!/usr/bin/python

# Script created from a template found here:
# http://www.postgresqltutorial.com/postgresql-python/connect/

import psycopg2
from config import config
 
def connect():
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # read connection parameters
        params = config()
 
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params)
 
        # create a cursor
        cur = conn.cursor()
        
 # execute a statement
        print('PostgreSQL database version:')
        cur.execute('SELECT version()')
 
        # display the PostgreSQL database server version
        db_version = cur.fetchone()
        print(db_version)
       
     # close the communication with the PostgreSQL
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
            print('Database connection closed.')
 
 
if __name__ == '__main__':
    connect()

In [None]:
#!/usr/bin/python

# Script created from template here: 
# http://www.postgresqltutorial.com/postgresql-python/connect/

from configparser import ConfigParser


def config(filename='database.ini', section='postgresql'):
    # create a parser
    parser = ConfigParser()
    # read config file
    parser.read(filename)
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db