# MAST30034 Applied Data Science Project 1

## Part 2: Analysis

### Import Libraries and Create Spark Session

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import folium
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import lines
from statsmodels.formula.api import ols
import statsmodels.api as sm
from pyspark.sql import SparkSession, functions as F

In [None]:
# Start (or reuse) the Spark session for this notebook.
# Settings applied below: eager evaluation of DataFrames in the REPL,
# parquet metadata caching, 10g of executor and driver memory, and a UTC
# session time zone.
spark = SparkSession.builder \
    .appName("MAST30034 Project 1-2") \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.parquet.cacheMetadata", "true") \
    .config("spark.executor.memory", "10g") \
    .config("spark.driver.memory", "10g") \
    .config("spark.sql.session.timeZone", "Etc/UTC") \
    .getOrCreate()

### Read In Data

In [None]:
# Curated analysis tables (CSV)
total_pu = pd.read_csv("../data/curated/analysis/total_pu.csv")
zones_rent = pd.read_csv("../data/curated/analysis/zones_rent.csv")
pu_rent = pd.read_csv("../data/curated/analysis/pu_rent.csv")

# Sampled trip records (parquet)
pickup_sample = pd.read_parquet('../data/curated/sampled_data')

# Taxi zone shapefile
sf = gpd.read_file("../data/taxi_zones/taxi_zones.shp")

### Relationship between Pickup and Rent

In [None]:
# Names of the monthly rent columns to analyse: "2021-01" through "2021-12"
RENT_COLS = [f"2021-{month:02d}" for month in range(1, 13)]

Top Zones by Rent & Pickup

In [None]:
# Inspect neighbourhoods with highest pickups: sum the pickups per zone and
# show the five largest totals.
# The aggregation is named by string ("sum") instead of passing the Python
# builtin `sum`; recent pandas versions deprecate builtin callables in `agg`
# and emit a FutureWarning for them.
(
    pu_rent
    .groupby("Zone")
    .agg({"total_pickups": "sum"})
    .sort_values(by="total_pickups", ascending=False)
    .head(5)
)

In [None]:
# Inspect neighbourhoods with highest rent: keep the identifying columns,
# order by average median rent (highest first) and show the top five rows.
(
    zones_rent
    .loc[:, ['LocationID', 'Zone', 'rental_zone', 'Average Median Rent']]
    .sort_values(by="Average Median Rent", ascending=False)
    .head(5)
)

Line Plot of Monthly Pickup and Rent

In [None]:
# Configure the plot theme in a single call. Each `sns.set` call re-applies
# every theme default, so setting the figure size in a separate second call
# would reset the style back to seaborn's default ("darkgrid") and silently
# discard "whitegrid".
sns.set(style="whitegrid", rc={'figure.figsize': (9, 8)})

# Month labels used on the x-axis of both series
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Get the monthly rent for each neighbourhood; identical rows are dropped
# before averaging (presumably several taxi zones share one rental zone and
# would otherwise be counted repeatedly — TODO confirm).
rent = zones_rent[RENT_COLS].drop_duplicates()

# Plot rent on one axis (left): the mean of each monthly rent column
rent_plot = sns.pointplot(x=MONTHS, y=rent.mean().values)
rent_plot.set_xticklabels(MONTHS)
rent_plot.set_xlabel("Month")
rent_plot.set_ylabel("Average of Median Asking Rent ($)")
rent_plot.set(title='Rent and Pickups in Manhattan in 2021')

# Plot pickups on one axis (right), sharing the x-axis with the rent plot
ax2 = rent_plot.twinx()
pu_plot = sns.pointplot(x=MONTHS, y=total_pu["total_pickups"], ax=ax2,
                        color='r')
# NOTE(review): the label states millions, but the column is plotted without
# any rescaling here — confirm the units of total_pu["total_pickups"].
pu_plot.set_ylabel("Total Pickups (in Millions)")

# Build the legend by hand, since the two series live on different axes
pu_plot.legend(
    handles=[
        lines.Line2D([], [], marker='_', color="r", label='Pickups'),
        lines.Line2D([], [], marker='_', color="b", label='Rent'),
    ]
)

plt.savefig("../plots/rent_pickups.png")

ANOVA of Hourly Pickup with Rent

In [None]:
# Fit an OLS model of total pickups on rent, with the pickup location
# included as a categorical factor.
fit_rent = ols("total_pickups ~ rent + C(PULocationID)", data=pu_rent).fit()

# Print the ANOVA table for the fitted model (typ=2)
print(sm.stats.anova_lm(fit_rent, typ=2))

### Geospatial Visualisation of Pickup and Rent

Create GeoDataFrame and GeoJson

In [None]:
# Code from MAST30034 Applied Data Science Tutorial 2
# Re-project the zone geometries to longitude/latitude (WGS84)
sf['geometry'] = sf['geometry'].to_crs(
    "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)

# Join the descriptive zone columns to the zone shapes on LocationID
zones = zones_rent[["LocationID", "Borough", "Zone", "service_zone"]]
gdf = gpd.GeoDataFrame(zones.merge(sf, on='LocationID', how='inner'))

# One GeoJSON feature per LocationID, used as the map layer geometry
geoJSON = (
    gdf[['LocationID', 'geometry']]
    .drop_duplicates('LocationID')
    .to_json()
)

The sample is aggregated here with pandas, since it is relatively small and quick to process.

In [None]:
# (Sampled) total pickups for visualisation: number of sampled trips per
# pickup location, indexed by PULocationID with one column, total_trips.
pu_freq = (
    pickup_sample
    .groupby('PULocationID')['PULocationID']
    .count()
    .to_frame('total_trips')
)

Map for Average Pickup in 2021

In [None]:
# Base map for the pickup choropleth
m_pu = folium.Map(
    location=[40.73, -73.74],
    tiles="cartodbpositron",
    zoom_start=11,
)

# Colour each zone by its sampled trip count, matching the data to the
# GeoJSON features on LocationID; zones with no value are filled black.
c_pu = folium.Choropleth(
    geo_data=geoJSON,
    data=pu_freq.reset_index(),
    columns=['PULocationID', 'total_trips'],
    key_on='properties.LocationID',
    fill_color='YlOrRd',
    nan_fill_color='black',
    name='Pickup Frequency By Zone',
    legend_name='Pickup Frequency By Zone',
).add_to(m_pu)

# Display the map as the cell output
m_pu

Map for Average Median Rent in 2021

In [None]:
# Base map for the rent choropleth
m_rent = folium.Map(
    location=[40.73, -73.74],
    tiles="cartodbpositron",
    zoom_start=11,
)

# Colour each zone by its scaled average median rent, matching the data to
# the GeoJSON features on LocationID; zones with no value are filled black.
# NOTE(review): the legend says "Log" while the column is named "(Scaled)" —
# confirm which transformation was applied upstream.
c_rent = folium.Choropleth(
    geo_data=geoJSON,
    data=zones_rent,
    columns=['LocationID', "Average Median Rent (Scaled)"],
    key_on='properties.LocationID',
    fill_color='YlOrRd',
    nan_fill_color='black',
    name='Log of Median Rent of Manhattan Neighbourhoods',
    legend_name='Log of Median Rent of Manhattan Neighbourhoods',
).add_to(m_rent)

# Display the map as the cell output
m_rent