# MAST30034 Applied Data Science Project 1

## Part 1: Preprocessing (II)

### Import Libraries and Create Spark Session

In [None]:
import pandas as pd
import numpy as np
import os
from pyspark.sql import SparkSession, functions as F

In [None]:
# Session options, applied to the builder in the order listed.
_SPARK_OPTIONS = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    # 10g each for the executor and the driver
    "spark.executor.memory": "10g",
    "spark.driver.memory": "10g",
    # Session time zone is pinned to UTC
    "spark.sql.session.timeZone": "Etc/UTC",
}

_builder = SparkSession.builder.appName("MAST30034 Project 1-1-2")
for _option, _value in _SPARK_OPTIONS.items():
    _builder = _builder.config(_option, _value)

# Reuse an already-running session if there is one, otherwise start one.
spark = _builder.getOrCreate()

### Read In Data

In [None]:
# Trip records (Spark): the cleaned data set and the test data set.
sdf, sdf_test = (
    spark.read.parquet(parquet_dir)
    for parquet_dir in (
        "../data/curated/full_data/manhattan_data_cleaned",
        "../data/curated/full_data/manhattan_data_test",
    )
)

# Taxi zone / rent lookup table (pandas).
zones_rent = pd.read_csv("../data/curated/rent_taxi_zone_lookup.csv")

### Aggregate Data For Analysis

In [None]:
# Output directory for the aggregated analysis tables.
ANALYSIS_PATH = "../data/curated/analysis/"

# exist_ok=True makes the call idempotent without a separate existence check
# (no check-then-create race), and it raises if the path exists but is not a
# directory instead of silently continuing.
os.makedirs(ANALYSIS_PATH, exist_ok=True)

In [None]:
# Taxi zone lookup (identifying columns only) and per-zone rent summary.
zones = zones_rent[["LocationID", "Borough", "Zone", "service_zone"]]

# One rent column per month of 2021: "2021-01" ... "2021-12".
RENT_COLS = [f"2021-{i:02}" for i in range(1, 13)]

# Row-wise mean over the twelve monthly rent columns, then its natural log
# stored as the "scaled" variant.
zones_rent["Average Median Rent"] = (
    zones_rent.loc[:, RENT_COLS].mean(axis="columns")
)
zones_rent["Average Median Rent (Scaled)"] = (
    zones_rent["Average Median Rent"].pipe(np.log)
)

zones_rent.to_csv("../data/curated/analysis/zones_rent.csv")

In [None]:
# Total pickups per calendar month (grouped by month only, not by zone)
total_pu = (
    sdf.groupBy(F.month("pickup_datetime").alias("month"))
    .agg(F.count("PULocationID").alias("total_pickups"))
    .sort("month")
    .toPandas()
)
total_pu.to_csv("../data/curated/analysis/total_pu.csv")

In [None]:
# Pickup counts for every (date, hour, pickup zone) combination,
# collected into pandas in date / hour / zone order.
pu_by_hour = (
    sdf.groupBy(
        F.to_date("pickup_datetime").alias("date"),
        F.hour("pickup_datetime").alias("hour"),
        "PULocationID",
    )
    .agg(F.count("PULocationID").alias("total_pickups"))
    .sort("date", "hour", "PULocationID")
    .toPandas()
)

In [None]:
# Hourly pickups per zone, joined with that zone's rent for the same month.
# merge() defaults to an inner join, so pickup zones with no matching
# LocationID in zones_rent are dropped here.
pu_rent = (
    pu_by_hour
    .merge(zones_rent, left_on="PULocationID", right_on="LocationID")
    .drop(columns=["LocationID", "Unnamed: 0", "Borough", "service_zone",
                   "rental_zone"])
)

# Pick each row's rent from the column of its own month.
# RENT_COLS is ordered "2021-01" .. "2021-12", so month m sits at position
# m - 1. Indexing the rent matrix with (row position, month - 1) replaces a
# row-by-row DataFrame.apply, which runs a Python lambda per row and is very
# slow on a frame of this size; the selected values are the same.
pu_rent["month"] = pd.DatetimeIndex(pu_rent["date"]).month
pu_rent["rent"] = pu_rent[RENT_COLS].to_numpy()[
    np.arange(len(pu_rent)),
    pu_rent["month"].to_numpy() - 1,
]

# The monthly rent columns are no longer needed once "rent" is extracted.
pu_rent = (
    pu_rent
    .sort_values(by=["date", "hour", "PULocationID"])
    .drop(columns=RENT_COLS)
)
pu_rent.to_csv("../data/curated/analysis/pu_rent.csv")

### Aggregate Data For Modelling

In [None]:
def _hourly_pickups_by_license(trips):
    """Count pickups per date, hour, pickup zone and rideshare licence.

    The aggregation runs in Spark on ``trips``; the result is ordered by
    date, hour, zone and licence and returned as a pandas DataFrame with a
    ``total_pickups`` column.
    """
    return (
        trips.groupBy(
            F.to_date("pickup_datetime").alias("date"),
            F.hour("pickup_datetime").alias("hour"),
            "PULocationID",
            "hvfhs_license_num",
        )
        .agg(F.count("PULocationID").alias("total_pickups"))
        .orderBy("date", "hour", "PULocationID", "hvfhs_license_num")
        .toPandas()
    )


# Hourly total pickups, each zone, for each rideshare company
pu_by_hour_license = _hourly_pickups_by_license(sdf)

# Same aggregation on the test data
pu_by_hour_license_test = _hourly_pickups_by_license(sdf_test)

#### Extract Datetime and Rent Data For Modelling

In [None]:
# Columns used as the sort key for the modelling tables.
CATEGORICAL_LIST = ["month", "day", "day_of_week", "hour",
                    'PULocationID', "hvfhs_license_num"]

# For training data: attach the zone rent table to the hourly pickup counts.
# merge() defaults to an inner join, so pickup zones with no matching
# LocationID in zones_rent are dropped here.
pu_rent_train = (
    pu_by_hour_license
    .merge(zones_rent, left_on="PULocationID", right_on="LocationID")
    .drop(columns=["LocationID", "Unnamed: 0", "Borough", "service_zone",
                   "rental_zone", "Average Median Rent",
                   "Average Median Rent (Scaled)"])
)

# Build the DatetimeIndex once and derive all calendar features from it.
train_dates = pd.DatetimeIndex(pu_rent_train["date"])
pu_rent_train["month"] = train_dates.month
pu_rent_train["day"] = train_dates.day
pu_rent_train["day_of_week"] = train_dates.dayofweek

# Pick each row's rent from the column of its own month.
# RENT_COLS is ordered "2021-01" .. "2021-12", so month m sits at position
# m - 1. Indexing the rent matrix with (row position, month - 1) replaces a
# row-by-row DataFrame.apply, which runs a Python lambda per row and is very
# slow on a frame of this size; the selected values are the same.
pu_rent_train["rent"] = pu_rent_train[RENT_COLS].to_numpy()[
    np.arange(len(pu_rent_train)),
    pu_rent_train["month"].to_numpy() - 1,
]

# Keep only the model inputs: drop every monthly rent column (including the
# "2022-04" column), the raw date and the zone name.
pu_rent_train = (
    pu_rent_train
    .sort_values(by=CATEGORICAL_LIST)
    .drop(columns=RENT_COLS + ["2022-04", "date", "Zone"])
)
model_data = spark.createDataFrame(pu_rent_train)

(
    model_data
    .write
    .mode("overwrite")
    .parquet("../data/curated/modelling/model_data")
)

In [None]:
# For test data: same preparation as the training table, except that the
# rent is read from the single "2022-04" column.
pu_rent_test = (
    pu_by_hour_license_test
    .merge(zones_rent, left_on="PULocationID", right_on="LocationID")
    .drop(columns=["LocationID", "Unnamed: 0", "Borough", "service_zone",
                   "rental_zone", "Average Median Rent",
                   "Average Median Rent (Scaled)"])
)

# Calendar features derived from the pickup date.
test_dates = pd.DatetimeIndex(pu_rent_test["date"])
pu_rent_test["month"] = test_dates.month
pu_rent_test["day"] = test_dates.day
pu_rent_test["day_of_week"] = test_dates.dayofweek
pu_rent_test["rent"] = pu_rent_test["2022-04"]

# Sort on the categorical key and drop the columns the model does not use.
pu_rent_test = (
    pu_rent_test
    .sort_values(by=CATEGORICAL_LIST)
    .drop(columns=RENT_COLS + ["2022-04", "date", "Zone"])
)
test_data = spark.createDataFrame(pu_rent_test)

(
    test_data
    .write
    .mode("overwrite")
    .parquet("../data/curated/modelling/model_test_data")
)