In [1]:
import pandas as pd
from pyspark.sql import SparkSession
import os
import configparser
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, \
                            date_format, dayofweek, monotonically_increasing_id
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, \
                                IntegerType, DateType, TimestampType
import pyspark.sql.functions as F

In [2]:
config = configparser.ConfigParser()
config.read('aws_credentials.cfg')

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
spark = SparkSession \
        .builder \
        .appName('capstone') \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.1.0") \
        .getOrCreate()
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

DataFrame[key: string, value: string]

In [4]:
input_data = "s3a://tung99-bucket/"

zone_data = os.path.join(input_data, "taxi+_zone_lookup.csv")
temp_data = os.path.join(input_data, "Hyperlocal_Temperature_Monitoring.csv")
nta_data = os.path.join(input_data, "nta_codes.json")
trips_data = os.path.join(input_data, "yellow_tripdata_2018-07.csv")

In [5]:
zoneSchema = StructType([
    StructField("location_id", IntegerType(), nullable=False),
    StructField("boro_name", StringType(), nullable=False),
    StructField("nta_name", StringType(), nullable=False),
    StructField("service_zone", StringType(), nullable=False)
])

zone_df = spark.read.csv(zone_data, header=True, schema=zoneSchema)
zone_df.createOrReplaceTempView("zones")
zone_df.printSchema()
zone_df.show(5)

root
 |-- location_id: integer (nullable = true)
 |-- boro_name: string (nullable = true)
 |-- nta_name: string (nullable = true)
 |-- service_zone: string (nullable = true)

+-----------+-------------+--------------------+------------+
|location_id|    boro_name|            nta_name|service_zone|
+-----------+-------------+--------------------+------------+
|          1|          EWR|      Newark Airport|         EWR|
|          2|       Queens|         Jamaica Bay|   Boro Zone|
|          3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|          4|    Manhattan|       Alphabet City| Yellow Zone|
|          5|Staten Island|       Arden Heights|   Boro Zone|
+-----------+-------------+--------------------+------------+
only showing top 5 rows



In [6]:
tempSchema = StructType([
    StructField("sensor_id", StringType(), nullable=False),
    StructField("air_temp", DoubleType(), nullable=False),
    StructField("date", StringType(), nullable=False),
    StructField("hour", IntegerType(), nullable=False),
    StructField("latitude", DoubleType(), nullable=False),
    StructField("longitude", DoubleType(), nullable=False),
    StructField("year", IntegerType(), nullable=False),
    StructField("install_type", StringType(), nullable=False)
])

temp_df = spark.read.csv(temp_data, header=True, schema=tempSchema)
temp_df = temp_df.withColumn("date", F.to_timestamp("date", "M/dd/yyyy"))
temp_df = temp_df.withColumn("month", F.month("date"))
temp_df = temp_df.withColumn("day", F.dayofmonth("date"))

temp_df.createOrReplaceTempView("temperatures")
temp_df.printSchema()
temp_df.show(5)

root
 |-- sensor_id: string (nullable = true)
 |-- air_temp: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- install_type: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+---------+-----------+-------------------+----+-----------+------------+----+------------+-----+---+
|sensor_id|   air_temp|               date|hour|   latitude|   longitude|year|install_type|month|day|
+---------+-----------+-------------------+----+-----------+------------+----+------------+-----+---+
| Bk-BR_01|     71.189|2018-06-15 00:00:00|   1|40.66620508|-73.91691035|2018| Street Tree|    6| 15|
| Bk-BR_01|70.24333333|2018-06-15 00:00:00|   2|40.66620508|-73.91691035|2018| Street Tree|    6| 15|
| Bk-BR_01|69.39266667|2018-06-15 00:00:00|   3|40.66620508|-73.91691035|2018| Street Tree|

In [7]:
nta_df = spark.read.json(nta_data, multiLine=True)
nta_df.show(5)

+--------------------+--------------------+
|                data|                meta|
+--------------------+--------------------+
|[[row-jnwt.3pd3_8...|{{[{cpf4-rkhq, 15...|
+--------------------+--------------------+



In [8]:
tripsSchema = StructType([
    StructField("vendor_id", IntegerType(), nullable=True),
    StructField("PU_date", StringType(), nullable=True),
    StructField("DO_date", StringType(), nullable=True),
    StructField("passenger_count", IntegerType(), nullable=True),
    StructField("trip_distance", DoubleType(), nullable=True),
    StructField("ratecode_id", IntegerType(), nullable=True),
    StructField("store_and_fwd_flag", StringType(), nullable=True),
    StructField("PU_location_id", IntegerType(), nullable=True),
    StructField("DO_location_id", IntegerType(), nullable=True),
    StructField("payment_type", IntegerType(), nullable=True),
    StructField("fare_amount", DoubleType(), nullable=True),
    StructField("extra", DoubleType(), nullable=True),
    StructField("mta_tax", DoubleType(), nullable=True),
    StructField("tip_amount", DoubleType(), nullable=True),
    StructField("tolls_amount", DoubleType(), nullable=True),
    StructField("improvement_surcharge", DoubleType(), nullable=True),
    StructField("total_amount", DoubleType(), nullable=True)
])

trips_df = spark.read.csv(trips_data, header=True, schema=tripsSchema)
trips_df = trips_df.withColumn("PU_date", F.to_timestamp("PU_date", "M/d/yyyy H:mm"))
trips_df = trips_df.withColumn("DO_date", F.to_timestamp("DO_date", "M/d/yyyy H:mm"))
trips_df = trips_df.withColumn("month", F.month("PU_date"))
trips_df = trips_df.withColumn("PU_day", F.dayofmonth("PU_date"))
trips_df = trips_df.withColumn("DO_day", F.dayofmonth("DO_date"))
trips_df = trips_df.withColumn("PU_hour", F.hour("PU_date"))
trips_df = trips_df.withColumn("DO_hour", F.hour("DO_date"))

trips_df.createOrReplaceTempView("trips")
trips_df.printSchema()
trips_df.show(5)

root
 |-- vendor_id: integer (nullable = true)
 |-- PU_date: timestamp (nullable = true)
 |-- DO_date: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecode_id: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PU_location_id: integer (nullable = true)
 |-- DO_location_id: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- PU_day: integer (nullable = true)
 |-- DO_day: integer (nullable = true)
 |-- PU_hour: integer (nullable = true)
 |-- DO_hour: integer (nullable = true)

+---------+-------------------+-------------

In [9]:
time_table = spark.sql('''
    SELECT month(date) AS month, dayofmonth(date) AS day, dayofweek(date) AS weekday, hour
    FROM temperatures
    WHERE year(date)=2018 AND month(date)=7
''').distinct().withColumn('time_id', monotonically_increasing_id())

time_table.createOrReplaceTempView("times")
time_table.show(5)

+-----+---+-------+----+----------+
|month|day|weekday|hour|   time_id|
+-----+---+-------+----+----------+
|    7| 14|      7|   0|         0|
|    7| 25|      4|   8|         1|
|    7| 29|      1|   5|         2|
|    7| 17|      3|  10|8589934592|
|    7| 22|      1|   0|8589934593|
+-----+---+-------+----+----------+
only showing top 5 rows



In [10]:
temps_table = spark.sql('''
    SELECT times.time_id, t.air_temp, t.install_type
    FROM temperatures t
    JOIN times ON times.month=t.month AND times.day=t.day AND times.hour=t.hour
''')

temps_table.show(5)

+------------+-----------+------------+
|     time_id|   air_temp|install_type|
+------------+-----------+------------+
|901943132162|72.53066667| Street Tree|
|901943132162|     73.948| Street Tree|
|901943132162|71.49733333|  Light Pole|
|901943132162|73.32866667| Street Tree|
|901943132162|72.09316667|  Light Pole|
+------------+-----------+------------+
only showing top 5 rows



In [11]:
trips_table = spark.sql('''
    SELECT vendor_id, times.time_id AS PU_date_id, DO_day, t.month, 
            DO_hour, passenger_count, trip_distance, PU_location_id, 
            DO_location_id, payment_type, fare_amount, extra, mta_tax,
            tip_amount, tolls_amount, improvement_surcharge, total_amount
    FROM trips t
    JOIN times ON t.PU_day=times.day AND t.month=times.month AND t.PU_hour=times.hour
''')

trips_table.createOrReplaceTempView("trips")
trips_table = spark.sql('''
    SELECT vendor_id, PU_date_id, times.time_id AS DO_date_id, passenger_count, 
            trip_distance, PU_location_id, DO_location_id, payment_type, 
            fare_amount, extra, mta_tax, tip_amount, tolls_amount, 
            improvement_surcharge, total_amount
    FROM trips t
    JOIN times ON t.DO_day=times.day AND t.month=times.month AND t.DO_hour=times.hour
''')

trips_table.show(5)

+---------+-------------+-------------+---------------+-------------+--------------+--------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|vendor_id|   PU_date_id|   DO_date_id|passenger_count|trip_distance|PU_location_id|DO_location_id|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+---------+-------------+-------------+---------------+-------------+--------------+--------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|        2|1322849927169|1039382085634|              3|         1.28|           246|           234|           1|        7.5|  0.5|    0.5|      1.76|         0.0|                  0.3|       10.56|
|        2|1262720385024|1039382085634|              1|         2.01|           164|            79|           2|        8.0|  0.5|    0.5|       0.0|         0.0|                  0.3|         9.3|
|        2

In [14]:
loc_table = spark.sql('''
    SELECT location_id, boro_name, nta_name, service_zone
    FROM zones
''')

loc_table.show(5)

+-----------+-------------+--------------------+------------+
|location_id|    boro_name|            nta_name|service_zone|
+-----------+-------------+--------------------+------------+
|          1|          EWR|      Newark Airport|         EWR|
|          2|       Queens|         Jamaica Bay|   Boro Zone|
|          3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|          4|    Manhattan|       Alphabet City| Yellow Zone|
|          5|Staten Island|       Arden Heights|   Boro Zone|
+-----------+-------------+--------------------+------------+
only showing top 5 rows

