https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

[Data dictionary](https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

In [32]:
import pandas as pd
import numpy as np
import datetime
import time
import os
import pyarrow
# import fastparquet

# url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2010-09.parquet"
# df2 = pd.read_parquet(url)

In [11]:
# Define helper function
# Define a function to convert the values to 'negative', 'positive', or 'zero'
def cash_or_credit(x):
    if x in [1, 'CREDIT', 'Credit', 'CRD', 'Crd', 'CRE', 'Cre']:
        return 1
    elif x in [2, 'CASH', 'Cash', 'CSH', 'Csh', 'CAS', 'Cas']:
        return 0
    else:
        return np.nan

In [12]:
# Define variables specific to this dataset
car_type = 'yellow_taxi'
month_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# month_list = ['01'] #, '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
year_list = [str(i) for i in range(2009, 2024)]
first_day = datetime.date(2009, 1, 1)
last_day = datetime.date(2023, 1, 31)

# More variables
last_month = last_day.strftime('%Y-%m')
data = pd.DataFrame()

# Loop through each year-month
for year in year_list:
    for month in month_list:
        print(f"Adding {year}-{month}...")

        # Load data
        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
        df = pd.read_parquet(url)

        # Rename columns
        new_names = {'Passanger_Count': 'passanger_count',
                     'Trip_Distance': 'trip_distance',
                     'Payment_Type': 'payment_type',
                     'Tip_Amt': 'tip_amount',
                     'Total_Amt': 'total_amount',
                     'Trip_Pickup_DateTime': 'tpep_pickup_datetime',
                     'Trip_Dropoff_DateTime': 'tpep_dropoff_datetime',
                     'pickup_datetime': 'tpep_pickup_datetime',
                     'dropoff_datetime': 'tpep_dropoff_datetime',
                     'old_col2': 'new_col2'}
        df = df.rename(columns=new_names)

        # Exclude dates not in range
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df = df[df['tpep_pickup_datetime'].dt.date >= first_day]
        df = df[df['tpep_pickup_datetime'].dt.date <= last_day]

        # Create a column for trip duration
        duration_list = []
        time_delta_series = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
        time_delta_series = time_delta_series.dt.total_seconds() / 60
        df.insert(3, 'duration', time_delta_series)

        # Define date from the pickup time
        df.insert(0, 'date', pd.to_datetime(df['tpep_pickup_datetime']).dt.date)

        # Define type of car
        df.insert(1, 'type', car_type)

        # Create a variable for the share of trip to an airport
        if 'RatecodeID' in df.columns:
            if 'airport_fee' in df.columns:
                # Fill None values with NaN
                df['airport_fee'] = df['airport_fee'].fillna(value=np.nan)

                # Indicate if the ride was to an airport (RatecodeID = 2 or 3 or airport_fee is non-zero)
                df['airport_share'] = df.apply(lambda x: 1 if x['RatecodeID'] in [2, 3] or x['airport_fee'] > 0 else 0, axis=1)
            else:
                df['airport_share'] = df.apply(lambda x: 1 if x['RatecodeID'] in [2, 3] else 0, axis=1)
        else:
            # Add blank column
            df['airport_share'] = np.nan

        # Indicate if the payment was credit (if column exists)
        if 'payment_type' in df.columns:
            # Use helper function to set cash=0, credit=1 and 0 otherwise
            df['credit_share'] = df['payment_type'].apply(cash_or_credit)
        else:
            df['credit_share'] = np.nan

        # Create empty column if it does not exist
        if 'passenger_count' not in df.columns:
            df['passenger_count'] = np.nan

        # Define columns to keep
        columns_to_keep =  ['date', 'type', 'duration', 'passenger_count', 'trip_distance', 'tip_amount', 'total_amount', 'airport_share', 'credit_share']
        df = df[columns_to_keep]

        # Replace rows in defined range with NaN (high likelihood of being outliers or errors)
        df.loc[(df['passenger_count'] < 1) | (df['passenger_count'] > 9), 'passenger_count'] = np.nan
        df.loc[(df['trip_distance'] <= 0) | (df['trip_distance'] > 100), 'trip_distance'] = np.nan
        df.loc[(df['tip_amount'] < 0) | (df['tip_amount'] > 1000), 'tip_amount'] = np.nan
        df.loc[(df['total_amount'] < 0) | (df['total_amount'] > 1000), 'total_amount'] = np.nan
        df.loc[(df['duration'] < 0) | (df['duration'] > 500), 'duration'] = np.nan

        # Group and combine data
        df_daily = df.groupby(['date', 'type']).mean()
        df_daily.insert(0, 'count', df.groupby(['date', 'type']).count()['duration'])
        data = pd.concat([data, df_daily], axis=0)
        data = data.sort_values('date')

        # # Speak status
        # month_name = datetime.date(2000, int(month), 1).strftime('%B')
        # os.system(f'say "{month_name} {year} done"')

        # Stop the loop on the last month
        if f"{year}-{month}" == last_month:
            break

    # Speak status
    os.system(f'say "{year} done"')

    # Stop the loop on the last month
    if f"{year}-{month}" == last_month:
        break

# Resent the index
data = data.reset_index()

# Announce completion with a song
time.sleep(2)
os.system('say "Daisy, Daisy, give me your answer true. I\'m. Half. Cray. Zee. All for the likes of you."')

Adding 2009-01...
Adding 2009-02...
Adding 2009-03...
Adding 2009-04...
Adding 2009-05...
Adding 2009-06...
Adding 2009-07...
Adding 2009-08...
Adding 2009-09...
Adding 2009-10...
Adding 2009-11...
Adding 2009-12...
Adding 2010-01...
Adding 2010-02...
Adding 2010-03...
Adding 2010-04...
Adding 2010-05...
Adding 2010-06...
Adding 2010-07...
Adding 2010-08...
Adding 2010-09...
Adding 2010-10...
Adding 2010-11...
Adding 2010-12...
Adding 2011-01...
Adding 2011-02...
Adding 2011-03...
Adding 2011-04...
Adding 2011-05...
Adding 2011-06...
Adding 2011-07...
Adding 2011-08...
Adding 2011-09...
Adding 2011-10...
Adding 2011-11...
Adding 2011-12...
Adding 2012-01...
Adding 2012-02...
Adding 2012-03...
Adding 2012-04...
Adding 2012-05...
Adding 2012-06...
Adding 2012-07...
Adding 2012-08...
Adding 2012-09...
Adding 2012-10...
Adding 2012-11...
Adding 2012-12...
Adding 2013-01...
Adding 2013-02...
Adding 2013-03...
Adding 2013-04...
Adding 2013-05...
Adding 2013-06...
Adding 2013-07...
Adding 201

0

In [21]:
# Define date variables
data['date'] = pd.to_datetime(data['date'])
data['day_name'] = data['date'].dt.day_name()
data['day_number_of_week'] = data['date'].dt.weekday
data['day_number_of_year'] = data['date'].dt.dayofyear
data['month_name'] = data['date'].dt.month_name()
data['month_number'] = data['date'].dt.month

In [26]:
da = data.copy()
da = da.sort_values(by='count', ascending=False)
da = da.drop_duplicates(subset='date', keep='first')
da = da.sort_values('date')
da = da.reset_index(drop=True)
da.to_csv('nyc_taxis_before_weather.csv')

In [23]:
# da.sort_values(by='count', ascending=False)
da.sort_values(by='date')

Unnamed: 0,date,type,count,duration,passenger_count,trip_distance,tip_amount,total_amount,airport_share,credit_share,day_name,day_number_of_week,day_number_of_year,month_name,month_number
29,2009-01-01,yellow_taxi,26,22.710897,1.358974,3.315897,0.870513,17.716154,0.000000,0.282051,Thursday,3,1,January,1
10,2009-01-01,yellow_taxi,3,10.888889,2.666667,2.286667,0.000000,12.133333,0.000000,0.000000,Thursday,3,1,January,1
37,2009-01-01,yellow_taxi,3,10.294444,1.000000,1.570000,0.000000,11.300000,0.000000,0.000000,Thursday,3,1,January,1
23,2009-01-01,yellow_taxi,17,24.827451,1.681818,6.499545,2.194545,31.459091,0.090909,0.272727,Thursday,3,1,January,1
40,2009-01-01,yellow_taxi,10,22.368333,1.545455,5.332000,2.161818,24.803636,0.090909,0.545455,Thursday,3,1,January,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6329,2023-01-27,yellow_taxi,111564,14.754598,1.379156,3.204753,3.352443,27.020168,0.082235,0.827366,Friday,4,27,January,1
6330,2023-01-28,yellow_taxi,111521,14.009502,1.463578,3.019139,3.120034,25.165459,0.059089,0.831443,Saturday,5,28,January,1
6331,2023-01-29,yellow_taxi,88042,13.571688,1.428349,3.753645,3.458473,27.867417,0.110177,0.831086,Sunday,6,29,January,1
6332,2023-01-30,yellow_taxi,83704,14.521546,1.314911,3.548006,3.423843,28.022183,0.111057,0.817129,Monday,0,30,January,1


In [24]:
df

Unnamed: 0,date,type,duration,passenger_count,trip_distance,tip_amount,total_amount,airport_share,credit_share
0,2023-01-01,yellow_taxi,8.433333,1.0,0.97,0.00,14.30,0,0.0
1,2023-01-01,yellow_taxi,6.316667,1.0,1.10,4.00,16.90,0,1.0
2,2023-01-01,yellow_taxi,12.750000,1.0,2.51,15.00,34.90,0,1.0
3,2023-01-01,yellow_taxi,9.616667,,1.90,0.00,20.85,1,1.0
4,2023-01-01,yellow_taxi,10.833333,1.0,1.43,3.28,19.68,0,1.0
...,...,...,...,...,...,...,...,...,...
3066761,2023-01-31,yellow_taxi,13.983333,,3.05,3.96,23.76,0,
3066762,2023-01-31,yellow_taxi,19.450000,,5.80,2.64,29.07,0,
3066763,2023-01-31,yellow_taxi,24.516667,,4.67,5.32,26.93,0,
3066764,2023-01-31,yellow_taxi,13.000000,,3.15,4.43,26.58,0,


# Add weather data
[Data source](https://www.ncei.noaa.gov)
[Documentation](https://www.ncei.noaa.gov/data/daily-summaries/doc/GHCND_documentation.pdf)

In [31]:
import pandas as pd
dfw = pd.read_csv('nyc_weather.csv')
df = pd.read_csv('nyc_taxis_before_weather.csv')

if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')

col_dict = {
    'DATE':'date',
    'AWND':'avg_wind_speed',
    'PRCP':'precipitation',
    'SNOW':'snow',
    'WT17':'freezing_rain'}

columns_to_keep = [key for key, value in col_dict.items()]
dfw = dfw[columns_to_keep]
dfw = dfw.rename(columns=col_dict)
dfw['freezing_rain'] = dfw['freezing_rain'].fillna(0)

df_full = pd.merge(df, dfw, on='date', how='left')
df_full.to_csv('nyc_taxis.csv', index=False)
df_full

Unnamed: 0,date,type,count,duration,passenger_count,trip_distance,tip_amount,total_amount,airport_share,credit_share,day_name,day_number_of_week,day_number_of_year,month_name,month_number,avg_wind_speed,precipitation,snow,freezing_rain
0,2009-01-01,yellow_taxi,324225,10.110423,,2.907790,0.329896,10.377966,,0.145649,Thursday,3,1,January,1,11.18,0.00,0.0,0.0
1,2009-01-02,yellow_taxi,372059,10.521323,,2.706211,0.339906,10.353366,,0.154869,Friday,4,2,January,1,6.26,0.00,0.0,0.0
2,2009-01-03,yellow_taxi,427312,10.676456,,2.748637,0.366799,10.278468,,0.168207,Saturday,5,3,January,1,10.07,0.00,0.0,0.0
3,2009-01-04,yellow_taxi,362132,10.665589,,3.101978,0.452462,11.064838,,0.190243,Sunday,6,4,January,1,7.61,0.00,0.0,0.0
4,2009-01-05,yellow_taxi,365983,10.476732,,2.816382,0.439258,10.605498,,0.190240,Monday,0,5,January,1,6.93,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5139,2023-01-27,yellow_taxi,111564,14.754598,1.379156,3.204753,3.352443,27.020168,0.082235,0.827366,Friday,4,27,January,1,5.82,0.00,0.0,0.0
5140,2023-01-28,yellow_taxi,111521,14.009502,1.463578,3.019139,3.120034,25.165459,0.059089,0.831443,Saturday,5,28,January,1,7.16,0.00,0.0,0.0
5141,2023-01-29,yellow_taxi,88042,13.571688,1.428349,3.753645,3.458473,27.867417,0.110177,0.831086,Sunday,6,29,January,1,4.70,0.00,0.0,0.0
5142,2023-01-30,yellow_taxi,83704,14.521546,1.314911,3.548006,3.423843,28.022183,0.111057,0.817129,Monday,0,30,January,1,2.91,0.00,0.0,0.0


In [30]:
df_full.columns

Index(['Unnamed: 0', 'date', 'type', 'count', 'duration', 'passenger_count',
       'trip_distance', 'tip_amount', 'total_amount', 'airport_share',
       'credit_share', 'day_name', 'day_number_of_week', 'day_number_of_year',
       'month_name', 'month_number', 'avg_wind_speed', 'precipitation', 'snow',
       'freezing_rain'],
      dtype='object')