In [9]:
"""
Create a Spark Session

"""

from pyspark.sql import SparkSession
import pyarrow.parquet as pq
# Session-level options passed to the builder before the session is created.
# Values are kept exactly as given (a bool for eagerEval, strings for the rest).
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("MAST30034 Assignment")
for option, value in spark_settings.items():
    session_builder = session_builder.config(option, value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [10]:
"""
Open the parquet file we just saved

"""

from pyspark.sql.functions import col, month, hour, to_date, dayofweek

# Load the curated yellow taxi data from parquet and preview the first rows.
# (No cleansing happens in this cell; it only reads and displays.)
curated_yellow_taxi_path = "data/curated/curated_yellow_taxi.parquet"
data = spark.read.parquet(curated_yellow_taxi_path)
data.show()

+----------+------------+----+-----------+----------------+------+------+---------+-----+
|      date|total_amount|hour|day of week|temp_categorized|rained|snowed|  Borough|event|
+----------+------------+----+-----------+----------------+------+------+---------+-----+
|2019-05-08|        14.0|   0|          4|        Moderate|    No|    No|Manhattan|false|
|2019-05-08|         8.3|   0|          4|        Moderate|    No|    No|Manhattan|false|
|2019-05-08|       14.76|   0|          4|        Moderate|    No|    No|Manhattan|false|
|2019-05-27|       16.56|  14|          2|        Moderate|    No|    No|Manhattan|false|
|2019-02-23|         9.8|   0|          7|            Cold|   Yes|    No|Manhattan|false|
|2019-02-23|       23.14|  18|          7|            Cold|   Yes|    No|Manhattan|false|
|2019-02-23|        15.3|   0|          7|            Cold|   Yes|    No|Manhattan|false|
|2019-02-23|       15.38|   0|          7|            Cold|   Yes|    No|Manhattan|false|
|2019-02-2

In [16]:
"""
Perform all anova analysis

"""
from scipy.stats import f_oneway
import numpy as np

# temperature
# Number of records in each (date, hour, temp_categorized) group; only the
# category label and the count are kept before converting to pandas.
hourly_temp_data = (
    data.groupBy('date', 'hour', 'temp_categorized')
    .count()
    .drop('date', 'hour')
    .toPandas()
)

# One sample of counts per temperature category, in the order Cold, Moderate, Hot.
temp_samples = [
    hourly_temp_data.loc[hourly_temp_data['temp_categorized'] == label, 'count']
    for label in ('Cold', 'Moderate', 'Hot')
]
temp_anova_event = f_oneway(*temp_samples)

print(temp_anova_event)
# Recorded result: F_onewayResult(statistic=23.320703203270472, pvalue=8.433015701890361e-11)






In [15]:
# day of week

# Number of records in each (date, hour, day of week) group; only the weekday
# code and the count are kept before converting to pandas.
hourly_day_of_week_data = data.groupBy('date', 'hour', 'day of week').count().drop('date', 'hour').toPandas()

# One sample of counts per weekday code 1..7, built with a single comprehension
# instead of seven copy-pasted filters (same groups, same order).
day_of_week_samples = [
    hourly_day_of_week_data.loc[hourly_day_of_week_data['day of week'] == day, 'count']
    for day in range(1, 8)
]
hourly_day_of_week_anova_event = f_oneway(*day_of_week_samples)

print(hourly_day_of_week_anova_event)
# Recorded from a previous run:
# F_onewayResult(statistic=20.88765002215742, pvalue=2.8470674250821966e-24)

# Number of (date, hour, day of week) groups that fed the test.
print(len(hourly_day_of_week_data))



F_onewayResult(statistic=20.88765002215742, pvalue=2.8470674250821966e-24)
4343


                                                                                

In [13]:

# time of day
# Number of records in each (date, hour) group; 'date' is dropped so each row
# carries only the hour label and the count.
hourly_time_of_day_counts = data.groupBy('date', 'hour').count().drop('date').toPandas()
times = list(range(24))

# List of 24 count Series, one per hour of the day (0..23), in order.
hourly_time_of_day_data = [
    hourly_time_of_day_counts.loc[hourly_time_of_day_counts['hour'] == h, 'count']
    for h in times
]
hourly_time_of_day_anova_event = f_oneway(*hourly_time_of_day_data)

print(hourly_time_of_day_anova_event)
# The p-value is small enough to be reported as 0.0; with an F statistic this
# large that is expected rather than an error.
# Recorded from a previous run (kept as a comment: left as bare code this line
# raises NameError, because F_onewayResult is never imported):
# F_onewayResult(statistic=398.1111490509572, pvalue=0.0)

# Number of hour groups passed to the test.
print(len(hourly_time_of_day_data))



F_onewayResult(statistic=398.1111490509572, pvalue=0.0)
24


                                                                                

In [14]:
# Borough
from pyspark.sql.functions import col

# Distinct borough values in the data; uncomment the show() to compare them
# against the hard-coded list below.
distinct_boroughs = data.select('borough').distinct()
# distinct_boroughs.show()
boroughs = ["Queens", "EWR", "Brooklyn", "Staten Island", "Manhattan", "Bronx"]

# Number of records in each (date, hour, borough) group; only the borough label
# and the count are kept before converting to pandas.
hourly_borough_counts = (
    data.groupBy('date', 'hour', 'borough')
    .count()
    .drop('date', 'hour')
    .toPandas()
)

# One sample of counts per listed borough, in list order.
hourly_borough_data = [
    hourly_borough_counts.loc[hourly_borough_counts['borough'] == b, 'count']
    for b in boroughs
]
hourly_borough_anova_event = f_oneway(*hourly_borough_data)

print(hourly_borough_anova_event)
# The p-value is small enough to be reported as 0.0; with an F statistic this
# large that is expected rather than an error.
# Recorded result: F_onewayResult(statistic=9506.069945822808, pvalue=0.0)



                                                                                

F_onewayResult(statistic=9506.069945822808, pvalue=0.0)
