https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

[Data dictionary](https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

In [8]:
import pandas as pd
import numpy as np
import datetime
import time
import os
import pyarrow
# import fastparquet

# url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2010-01.parquet"
# url3 = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-12.parquet"
# df2 = pd.read_parquet(url)
# df3 = pd.read_parquet(url)

In [9]:
# month = '05'
# year = '2020'
#
# url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
# df2 = pd.read_parquet(url)
# print('\n', month, year)
# print(df2.value_counts('payment_type'))

In [10]:
# month_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# year_list = [str(i) for i in range(2009, 2024)]
# for year in year_list:
#     for month in month_list:
#         try:
#             url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
#             df2 = pd.read_parquet(url)
#             print('\n', month, year)
#             print(df2.value_counts('payment_type'))
#         except:
#             print(f"No data for {year}-{month}")
#     os.system(f'say "{year} done"')
#
# time.sleep(2)
# os.system('say "Daisy, Daisy, give me your answer true. I\'m. Half. Cray. Zee. All for the likes of you."')

In [11]:
# Define helper function
# Convert a payment-type value to 1 (credit), 0 (cash) or NaN (anything else)
def cash_or_credit(x):
    """Classify a single payment-type value as credit or cash.

    Args:
        x: A payment-type value. Numeric code 1 or the labels
            CREDIT / CRD / CRE mean credit; numeric code 2 or the labels
            CASH / CSH / CAS mean cash. Text labels are matched without
            regard to case or surrounding whitespace.

    Returns:
        1 for credit, 0 for cash, and ``np.nan`` for every other value.
    """
    if isinstance(x, str):
        # Normalise once so any capitalisation or padding of a label matches
        label = x.strip().upper()
        if label in ('CREDIT', 'CRD', 'CRE'):
            return 1
        if label in ('CASH', 'CSH', 'CAS'):
            return 0
        return np.nan

    # Non-text values: only the numeric codes 1 and 2 are recognised
    if x in (1,):
        return 1
    if x in (2,):
        return 0
    return np.nan

In [None]:
# Build `data`: one row per (date, type) holding the daily trip count and the
# daily mean of each metric, from the monthly yellow-taxi parquet files.

# Define variables specific to this dataset
car_type = 'yellow_taxi'
month_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
year_list = [str(i) for i in range(2009, 2024)]
first_day = datetime.date(2009, 1, 1)
last_day = datetime.date(2023, 1, 31)

# More variables
last_month = last_day.strftime('%Y-%m')
data = pd.DataFrame()

# A pickup is in range when range_start <= pickup < range_end, i.e. when its
# calendar date lies between first_day and last_day (both inclusive).
range_start = pd.Timestamp(first_day)
range_end = pd.Timestamp(last_day) + pd.Timedelta(days=1)

# Alternative column spellings mapped onto the names used below
# (presumably the headers of older files; verify against the data dictionary).
new_names = {'Passenger_Count': 'passenger_count',
             'Passanger_Count': 'passenger_count',  # misspelling kept in case a file uses it
             'Trip_Distance': 'trip_distance',
             'Payment_Type': 'payment_type',
             'Tip_Amt': 'tip_amount',
             'Total_Amt': 'total_amount',
             'Trip_Pickup_DateTime': 'tpep_pickup_datetime',
             'Trip_Dropoff_DateTime': 'tpep_dropoff_datetime',
             'pickup_datetime': 'tpep_pickup_datetime',
             'dropoff_datetime': 'tpep_dropoff_datetime'}

# Define columns to keep
columns_to_keep = ['date', 'type', 'duration', 'passenger_count', 'trip_distance',
                   'tip_amount', 'total_amount', 'airport_share', 'credit_share']

# Daily summaries, one frame per processed month. They are combined once after
# the loop; if the loop is interrupted the partial results are still in here.
daily_frames = []
reached_last_month = False

# Loop through each year-month
for year in year_list:
    for month in month_list:
        print(f"Adding {year}-{month}...")

        # Load data
        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
        df = pd.read_parquet(url)

        # Rename columns
        df = df.rename(columns=new_names)

        # Exclude dates not in range (timestamp comparison, no per-row date objects)
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        pickup = df['tpep_pickup_datetime']
        # .copy() so the column assignments below act on an independent frame
        df = df[(pickup >= range_start) & (pickup < range_end)].copy()

        # Create a column for trip duration (minutes)
        trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
        df['duration'] = trip_length.dt.total_seconds() / 60

        # Define date from the pickup time
        df['date'] = df['tpep_pickup_datetime'].dt.date

        # Define type of car
        df['type'] = car_type

        # Create a variable for the share of trips to an airport
        if 'RatecodeID' in df.columns:
            # Airport ride: RatecodeID is 2 or 3, or airport_fee is positive
            to_airport = df['RatecodeID'].isin([2, 3])
            if 'airport_fee' in df.columns:
                # Coerce None / non-numeric fees to NaN so they compare as "no fee"
                to_airport = to_airport | (pd.to_numeric(df['airport_fee'], errors='coerce') > 0)
            df['airport_share'] = to_airport.astype(int)
        else:
            # Add blank column
            df['airport_share'] = np.nan

        # Indicate if the payment was credit (if column exists):
        # credit=1, cash=0, NaN otherwise
        if 'payment_type' in df.columns:
            df['credit_share'] = df['payment_type'].apply(cash_or_credit)
        else:
            df['credit_share'] = np.nan

        # Create empty column if it does not exist
        if 'passenger_count' not in df.columns:
            df['passenger_count'] = np.nan

        df = df[columns_to_keep].copy()

        # Replace values outside the defined range with NaN (high likelihood of being outliers or errors)
        df.loc[(df['passenger_count'] < 1) | (df['passenger_count'] > 9), 'passenger_count'] = np.nan
        df.loc[(df['trip_distance'] <= 0) | (df['trip_distance'] > 100), 'trip_distance'] = np.nan
        df.loc[(df['tip_amount'] < 0) | (df['tip_amount'] > 1000), 'tip_amount'] = np.nan
        df.loc[(df['total_amount'] < 0) | (df['total_amount'] > 1000), 'total_amount'] = np.nan
        df.loc[(df['duration'] < 0) | (df['duration'] > 500), 'duration'] = np.nan

        # Group: daily means, plus 'count' = number of rows with a valid duration
        per_day = df.groupby(['date', 'type'])
        df_daily = per_day.mean()
        df_daily.insert(0, 'count', per_day['duration'].count())
        daily_frames.append(df_daily)

        # Stop the loop on the last month
        if f"{year}-{month}" == last_month:
            reached_last_month = True
            break

    # Speak status
    os.system(f'say "{year} done"')

    # Stop the loop on the last month
    if reached_last_month:
        break

# Combine the monthly summaries once, ordered by date
if daily_frames:
    data = pd.concat(daily_frames, axis=0).sort_values('date')

# Announce completion with a song
time.sleep(2)
os.system('say "Daisy, Daisy, give me your answer true. I\'m. Half. Cray. Zee. All for the likes of you."')

Adding 2009-01...


In [None]:
# Define date variables
# `date` is an index level of `data` (it comes from a groupby on date/type) and
# holds plain date objects, so read it from the column or the index level,
# whichever exists, and convert to datetimes before deriving calendar fields.
if 'date' in data.columns:
    date_values = data['date']
else:
    date_values = data.index.get_level_values('date')
dates = pd.DatetimeIndex(pd.to_datetime(date_values))

# .to_numpy() assigns by position, so no index alignment is attempted
data['day_name'] = dates.day_name().to_numpy()
data['day_number_of_week'] = dates.weekday.to_numpy()
data['day_number_of_year'] = dates.dayofyear.to_numpy()
data['month_name'] = dates.month_name().to_numpy()
data['month_number'] = dates.month.to_numpy()

In [None]:
# Work on a copy, ranked so the row with the largest trip count comes first
# for every index entry.
ranked = data.copy().sort_values(by='count', ascending=False)

# Drop repeated index entries, keeping only the first (highest-count) row,
# then put the result back in date order and write it to disk.
is_repeat = ranked.index.duplicated(keep='first')
da = ranked.loc[~is_repeat]
da = da.sort_values('date')
da.to_csv('data4.csv')

In [None]:
# da.sort_values(by='count', ascending=False)
# Display the de-duplicated daily table ordered by date (cell output only;
# the sorted result is not assigned back to `da`).
da.sort_values(by='date')

In [None]:
# Display `df`, the per-month frame left over from the last iteration of the loading loop
df