### Part 1: Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import math
import geopandas as gpd
import bs4
import re
import requests
import os
import pyarrow.parquet as pq

In [2]:
result = set()
for filename in os.listdir("./yellow_taxi"):
    if filename.endswith(".parquet"):
        existing_columns = set(pq.ParquetFile("./yellow_taxi/"+filename).schema.names)
        result = result.union(existing_columns)

result

{'DOLocationID',
 'End_Lat',
 'End_Lon',
 'Fare_Amt',
 'PULocationID',
 'Passenger_Count',
 'Payment_Type',
 'Rate_Code',
 'RatecodeID',
 'Start_Lat',
 'Start_Lon',
 'Tip_Amt',
 'Tolls_Amt',
 'Total_Amt',
 'Trip_Distance',
 'Trip_Dropoff_DateTime',
 'Trip_Pickup_DateTime',
 'VendorID',
 '__index_level_0__',
 'airport_fee',
 'congestion_surcharge',
 'dropoff_datetime',
 'dropoff_latitude',
 'dropoff_longitude',
 'extra',
 'fare_amount',
 'improvement_surcharge',
 'mta_tax',
 'passenger_count',
 'payment_type',
 'pickup_datetime',
 'pickup_latitude',
 'pickup_longitude',
 'rate_code',
 'store_and_forward',
 'store_and_fwd_flag',
 'surcharge',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'tpep_dropoff_datetime',
 'tpep_pickup_datetime',
 'trip_distance',
 'vendor_id',
 'vendor_name'}

In [3]:
# get yellow taxi data
def download_yellow_taxi_parquet_files():
    response = requests.get("https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page")
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    yellow_records = soup.find_all("a", attrs={"title": "Yellow Taxi Trip Records"})
    if not os.path.exists("./yellow_taxi"):
        os.makedirs("./yellow_taxi")
    
    for record in yellow_records:
        pattern = r'yellow_tripdata_(2009|201[0-5])-\d{2}\.'
        link = record["href"]
        if re.search(pattern, link):
            filename = os.path.join("yellow_taxi", link.split("/")[-1])
            response = requests.get(link)
            
            with open(filename, "wb") as f:
                f.write(response.content)

In [4]:
# download_yellow_taxi_parquet_files()

In [5]:
def generate_yellow_taxi_df():
    columns_to_select = ['DOLocationID',
                         'End_Lat',
                         'End_Lon',
                         'PULocationID',
                         'Passenger_Count',
                         'Start_Lat',
                         'Start_Lon',
                         'Total_Amt',
                         'Trip_Distance',
                         'Trip_Dropoff_DateTime',
                         'Trip_Pickup_DateTime',
                         'dropoff_datetime',
                         'dropoff_latitude',
                         'dropoff_longitude',
                         'passenger_count',
                         'pickup_datetime',
                         'pickup_latitude',
                         'pickup_longitude',
                         'total_amount',
                         'tpep_dropoff_datetime',
                         'tpep_pickup_datetime',
                         'trip_distance']

    directory = "./yellow_taxi"
    yellow_taix_df = pd.DataFrame()
    for filename in os.listdir(directory):
        if filename.endswith(".parquet"):
            existing_columns = set(pq.ParquetFile("./yellow_taxi/"+filename).schema.names)
            columns_to_read = list(set(columns_to_select) & existing_columns)
            table = pq.read_table("./yellow_taxi/"+filename, columns=columns_to_read)
            df = table.to_pandas()
            random_subset = df.sample(n=3000)
            yellow_taix_df = pd.concat([yellow_taix_df, random_subset], ignore_index=True)
    yellow_taix_df.to_csv("yellow_taxi_ride_sample.csv", index = False)

In [6]:
# generate_yellow_taxi_df()

In [7]:
yellow_taxi_ride_sample = pd.read_csv("yellow_taxi_ride_sample.csv")

In [8]:
def convert_to_datetime(df):
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["Trip_Pickup_DateTime"] = pd.to_datetime(df["Trip_Pickup_DateTime"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    df["Trip_Dropoff_DateTime"] = pd.to_datetime(df["Trip_Dropoff_DateTime"])
    df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"])
    return df

In [9]:
yellow_taxi_ride_sample = convert_to_datetime(yellow_taxi_ride_sample)

In [10]:
def impute_pickup_datetime(row):
    if pd.notna(row['tpep_pickup_datetime']):
        return row['tpep_pickup_datetime']
    elif pd.notna(row['pickup_datetime']):
        return row['pickup_datetime']
    else:
        return row['Trip_Pickup_DateTime']

In [11]:
yellow_taxi_ride_sample['pickup_datetime'] = yellow_taxi_ride_sample.apply(impute_pickup_datetime, axis=1)

In [12]:
def impute_dropoff_datetime(row):
    if pd.notna(row['tpep_dropoff_datetime']):
        return row['tpep_dropoff_datetime']
    elif pd.notna(row['dropoff_datetime']):
        return row['dropoff_datetime']
    else:
        return row['Trip_Dropoff_DateTime']

In [13]:
yellow_taxi_ride_sample['dropoff_datetime'] = yellow_taxi_ride_sample.apply(impute_dropoff_datetime, axis=1)

In [14]:
def impute_trip_distance(row):
    if pd.notna(row['trip_distance']):
        return row['trip_distance']
    else:
        return row['Trip_Distance']

In [15]:
yellow_taxi_ride_sample['trip_distance'] = yellow_taxi_ride_sample.apply(impute_trip_distance, axis=1)

In [16]:
def impute_total_amount(row):
    if pd.notna(row['Total_Amt']):
        return row['Total_Amt']
    else:
        return row['total_amount']

In [17]:
yellow_taxi_ride_sample['total_amount'] = yellow_taxi_ride_sample.apply(impute_total_amount, axis=1)

In [18]:
def drop_columns1(df):
    df = df.drop('Trip_Distance', axis=1)
    df = df.drop('tpep_pickup_datetime', axis=1)
    df = df.drop('Trip_Pickup_DateTime', axis=1)
    df = df.drop('tpep_dropoff_datetime', axis=1)
    df = df.drop('Trip_Dropoff_DateTime', axis=1)
    df = df.drop('Total_Amt', axis=1)
    return df

In [19]:
yellow_taxi_ride_sample = drop_columns1(yellow_taxi_ride_sample)

In [20]:
gdf_polygons = gpd.read_file('./taxi_zones/taxi_zones.shp')

In [21]:
def gdf_get_location(df):
    df = gdf_polygons.to_crs(4326)
    df['pickup_lon'] = df['geometry'].centroid.x
    df['pickup_lat'] = df['geometry'].centroid.y
    df['dropoff_lon'] = df['geometry'].centroid.x
    df['dropoff_lat'] = df['geometry'].centroid.y
    return df

In [22]:
gdf_polygons = gdf_get_location(gdf_polygons)


  df['pickup_lon'] = df['geometry'].centroid.x

  df['pickup_lat'] = df['geometry'].centroid.y

  df['dropoff_lon'] = df['geometry'].centroid.x

  df['dropoff_lat'] = df['geometry'].centroid.y


In [23]:
gdf_polygons

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,pickup_lon,pickup_lat,dropoff_lon,dropoff_lat
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695...",-74.174000,40.691831,-74.174000,40.691831
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...",-73.831299,40.616745,-73.831299,40.616745
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870...",-73.847422,40.864474,-73.847422,40.864474
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",-73.976968,40.723752,-73.976968,40.723752
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562...",-74.188484,40.552659,-74.188484,40.552659
...,...,...,...,...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((-73.85107 40.91037, -73.85207 40.909...",-73.852215,40.897932,-73.852215,40.897932
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((-73.90175 40.76078, -73.90147 40.759...",-73.906306,40.744235,-73.906306,40.744235
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((-74.01333 40.70503, -74.01327 40.704...",-74.013023,40.709139,-74.013023,40.709139
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((-73.94383 40.78286, -73.94376 ...",-73.946510,40.775932,-73.946510,40.775932


In [24]:
def merge_geo_df(geo_df, df):
    df = pd.merge(df, geo_df[['LocationID', 'pickup_lon']],
                                   left_on='PULocationID', right_on='LocationID')
    df = pd.merge(df, geo_df[['LocationID', 'dropoff_lon']],
                                   left_on='DOLocationID', right_on='LocationID')
    df = pd.merge(df, geo_df[['LocationID', 'pickup_lat']],
                                   left_on='PULocationID', right_on='LocationID')
    df = pd.merge(df, geo_df[['LocationID', 'dropoff_lat']],
                                   left_on='PULocationID', right_on='LocationID')
    df = df.drop(['DOLocationID', 'PULocationID', 'LocationID_x', 'LocationID_y'], axis=1)
    return [geo_df, df]

In [25]:
results = merge_geo_df(gdf_polygons, yellow_taxi_ride_sample)
gdf_polygons = results[0]
yellow_taxi_ride_sample = results[1]

  df = pd.merge(df, geo_df[['LocationID', 'dropoff_lat']],


In [26]:
def impute_pickup_longitude(row):
    if pd.notna(row['Start_Lon']):
        return row['Start_Lon']
    elif pd.notna(row['pickup_longitude']):
        return row['pickup_longitude']
    else:
        return row['pickup_lon']

In [27]:
yellow_taxi_ride_sample['pickup_longitude'] = yellow_taxi_ride_sample.apply(impute_pickup_longitude, axis=1)

In [28]:
def impute_pickup_latitude(row):
    if pd.notna(row['Start_Lat']):
        return row['Start_Lat']
    elif pd.notna(row['pickup_latitude']):
        return row['pickup_latitude']
    else:
        return row['pickup_lat']

In [29]:
yellow_taxi_ride_sample['pickup_latitude'] = yellow_taxi_ride_sample.apply(impute_pickup_latitude, axis=1)

In [30]:
def impute_dropoff_longitude(row):
    if pd.notna(row['End_Lon']):
        return row['End_Lon']
    elif pd.notna(row['dropoff_longitude']):
        return row['dropoff_longitude']
    else:
        return row['dropoff_lon']

In [31]:
yellow_taxi_ride_sample['dropoff_longitude'] = yellow_taxi_ride_sample.apply(impute_dropoff_longitude, axis=1)

In [32]:
def impute_dropoff_latitude(row):
    if pd.notna(row['End_Lat']):
        return row['End_Lat']
    elif pd.notna(row['dropoff_latitude']):
        return row['dropoff_latitude']
    else:
        return row['dropoff_lat']

In [33]:
yellow_taxi_ride_sample['dropoff_latitude'] = yellow_taxi_ride_sample.apply(impute_dropoff_latitude, axis=1)

In [34]:
def impute_passenger_count(row):
    if pd.notna(row['passenger_count']):
        return row['passenger_count']
    else:
        return row['Passenger_Count']

In [35]:
yellow_taxi_ride_sample['passenger_count'] = yellow_taxi_ride_sample.apply(impute_passenger_count, axis=1)

In [36]:
def drop_columns2(df):
    df = df.drop(["Start_Lon", "pickup_lon"], axis=1)
    df = df.drop(["Start_Lat", "pickup_lat"], axis=1)
    df = df.drop(["End_Lon", "dropoff_lon"], axis=1)
    df = df.drop(["End_Lat", "dropoff_lat"], axis=1)
    df = df.drop(["Passenger_Count"], axis=1)
    return df

In [37]:
yellow_taxi_ride_sample = drop_columns2(yellow_taxi_ride_sample)
yellow_taxi_ride_sample

Unnamed: 0,trip_distance,total_amount,passenger_count,dropoff_latitude,dropoff_datetime,dropoff_longitude,pickup_latitude,pickup_datetime,pickup_longitude
0,2.25,10.60,1.0,40.759818,2011-07-20 08:41:00,-74.004015,40.759818,2011-07-20 08:24:00,-73.984196
1,0.70,7.20,1.0,40.759818,2011-07-15 10:03:47,-74.004015,40.759818,2011-07-15 09:56:50,-73.984196
2,1.80,8.60,1.0,40.759818,2011-07-16 16:59:14,-74.004015,40.759818,2011-07-16 16:48:56,-73.984196
3,0.70,5.80,1.0,40.759818,2011-07-19 12:42:30,-74.004015,40.759818,2011-07-19 12:36:29,-73.984196
4,1.42,9.25,6.0,40.759818,2014-06-15 09:56:00,-74.004015,40.759818,2014-06-15 09:49:00,-73.984196
...,...,...,...,...,...,...,...,...,...
175647,2.00,12.30,1.0,40.691201,2015-12-18 20:30:43,-73.763146,40.691201,2015-12-18 20:16:50,-73.763146
175648,0.02,7.30,1.0,40.657853,2015-08-31 11:26:56,-73.739473,40.657853,2015-08-31 11:26:49,-73.739473
175649,1.90,13.20,1.0,40.586787,2014-05-01 14:47:07,-74.085512,40.586787,2014-05-01 14:33:21,-74.085512
175650,2.80,11.60,1.0,40.630049,2011-02-04 19:56:39,-74.087839,40.630049,2011-02-04 19:44:21,-74.102860


In [38]:
yellow_taxi_ride_sample.isna().sum()

trip_distance        0
total_amount         0
passenger_count      0
dropoff_latitude     0
dropoff_datetime     0
dropoff_longitude    0
pickup_latitude      0
pickup_datetime      0
pickup_longitude     0
dtype: int64

In [39]:
yellow_taxi_ride_sample.shape

(175652, 9)

In [40]:
yellow_taxi_ride_sample.to_csv("yellow_taxi_ride_sample_cleaned.csv")

In [41]:
yellow_taxi_ride_sample.dtypes

trip_distance               float64
total_amount                float64
passenger_count             float64
dropoff_latitude            float64
dropoff_datetime     datetime64[ns]
dropoff_longitude           float64
pickup_latitude             float64
pickup_datetime      datetime64[ns]
pickup_longitude            float64
dtype: object

In [42]:
def remove_out_region(df):
    df = df[df['pickup_latitude'] >= 40.560445]
    df = df[df['pickup_latitude'] <= 40.908524]
    df = df[df['dropoff_latitude'] >= 40.560445]
    df = df[df['dropoff_latitude'] <= 40.908524]
    
    df = df[df['pickup_longitude'] >= -74.242330]
    df = df[df['pickup_longitude'] <= -73.71704]
    df = df[df['dropoff_longitude'] >= -74.242330]
    df = df[df['dropoff_longitude'] <= -73.71704]
    return df

In [43]:
yellow_taxi_ride_sample = remove_out_region(yellow_taxi_ride_sample)

In [44]:
uber_rides_sample = pd.read_csv("uber_rides_sample.csv")

In [45]:
uber_rides_sample = remove_out_region(remove_out_region(uber_rides_sample))

In [46]:
uber_rides_sample['pickup_datetime'] = pd.to_datetime(uber_rides_sample['pickup_datetime'])
uber_rides_sample = uber_rides_sample.drop('Unnamed: 0', axis=1)
uber_rides_sample = uber_rides_sample.drop('key', axis=1)
uber_rides_sample.dtypes

fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [47]:
yellow_taxi_ride_sample['passenger_count'] = yellow_taxi_ride_sample['passenger_count'].astype(int)

In [48]:
def calculate_trip_distance(row):
    pickup_latitude = row['pickup_latitude']
    pickup_longitude = row['pickup_longitude']
    dropoff_latitude = row['dropoff_latitude']
    dropoff_longitude = row['dropoff_longitude']
    
    lat1, lon1, lat2, lon2 = map(math.radians, [pickup_latitude, pickup_longitude,  dropoff_latitude, dropoff_longitude])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    dist = math.sqrt(dlat**2 + dlon**2)
    
    R = 6371
    distance = dist * R
    return distance
    

In [49]:
yellow_taxi_ride_sample['distance'] = yellow_taxi_ride_sample.apply(calculate_trip_distance, axis=1)

In [50]:
uber_rides_sample['trip_distance'] = uber_rides_sample.apply(calculate_trip_distance, axis=1)

In [51]:
def impute_distance(row):
    if pd.notna(row['distance']):
        return row['distance']
    else:
        return row['trip_distance']

In [52]:
yellow_taxi_ride_sample['trip_distance'] = yellow_taxi_ride_sample.apply(impute_distance, axis=1)

In [53]:
yellow_taxi_ride_sample = yellow_taxi_ride_sample.drop('distance', axis=1)

In [54]:
def remove_zero_distance(df):
    condition = df['trip_distance'] == 0
    df = df.drop(df[condition].index)
    return df

In [55]:
yellow_taxi_ride_sample = remove_zero_distance(yellow_taxi_ride_sample)
uber_rides_sample = remove_zero_distance(uber_rides_sample)

In [56]:
yellow_taxi_ride_sample

Unnamed: 0,trip_distance,total_amount,passenger_count,dropoff_latitude,dropoff_datetime,dropoff_longitude,pickup_latitude,pickup_datetime,pickup_longitude
0,2.203732,10.60,1,40.759818,2011-07-20 08:41:00,-74.004015,40.759818,2011-07-20 08:24:00,-73.984196
1,2.203732,7.20,1,40.759818,2011-07-15 10:03:47,-74.004015,40.759818,2011-07-15 09:56:50,-73.984196
2,2.203732,8.60,1,40.759818,2011-07-16 16:59:14,-74.004015,40.759818,2011-07-16 16:48:56,-73.984196
3,2.203732,5.80,1,40.759818,2011-07-19 12:42:30,-74.004015,40.759818,2011-07-19 12:36:29,-73.984196
4,2.203732,9.25,6,40.759818,2014-06-15 09:56:00,-74.004015,40.759818,2014-06-15 09:49:00,-73.984196
...,...,...,...,...,...,...,...,...,...
175626,4.251128,23.30,5,40.644288,2015-12-17 01:06:29,-73.899735,40.644288,2015-12-17 00:58:34,-73.937966
175635,10.486233,42.80,2,40.697001,2011-07-22 18:48:00,-73.777253,40.697001,2011-07-22 18:06:00,-73.871558
175643,0.965181,9.00,1,40.677098,2011-11-26 10:32:00,-73.735554,40.677098,2011-11-26 10:23:00,-73.744234
175644,3.181071,13.80,1,40.618769,2011-01-31 09:43:16,-74.102313,40.618769,2011-01-31 09:19:59,-74.073704


In [57]:
uber_rides_sample

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683468
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,2.457725
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,5.905407
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.837874
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,5.674431
...,...,...,...,...,...,...,...,...
199995,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,0.118316
199996,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,2.460268
199997,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2,15.814454
199998,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,1,3.680636


### Part 2: Storing Data

### Part 3: Understanding Data

### Part 4: Visualizing Data