# Part 1: Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import math
import geopandas as gpd
import bs4
import re
import requests
import os
import pyarrow.parquet as pq
import glob
import matplotlib.pyplot as plt
import folium
import folium.plugins
from matplotlib.animation import FuncAnimation
import pytest
import sqlite3
from sqlalchemy import create_engine

### Uber Data

In [2]:
uber_rides_sample = pd.read_csv("uber_rides_sample.csv", index_col = 0)
uber_rides_sample.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [3]:
uber_rides_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200000 entries, 24238194 to 11951496
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                200000 non-null  object 
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


### Download Yellow Taxi Data and get a sample dataset with its size close to Uber Dataset

In [4]:
# result = set()
# for filename in os.listdir("./yellow_taxi"):
#     if filename.endswith(".parquet"):
#         existing_columns = set(pq.ParquetFile("./yellow_taxi/"+filename).schema.names)
#         result = result.union(existing_columns)

# result

In [5]:
# Get Yellow Taxi Parquet files
def download_yellow_taxi_parquet_files():
    """
    Downloads Yellow Taxi trip record data files in Parquet format from the NYC Taxi and Limousine Commission website.
    
    This function sends a GET request to the webpage containing the links to the Yellow Taxi trip record data files,
    parses the HTML response using BeautifulSoup, and downloads any files matching the pattern "yellow_tripdata_YYYY-MM.parquet",
    where YYYY-MM is a year-month combination from 2009-01 to 2015-12.
    
    The downloaded files are saved in a directory called 'yellow_taxi' in the current working directory.
    
    Raises:
        OSError: If the directory 'yellow_taxi' already exists and is not writable, or if there is a problem writing to any of the downloaded files.
        requests.exceptions.RequestException: If there is a problem sending the GET request to the webpage containing the links to the data files.
    """
    
    response = requests.get("https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page")
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    yellow_records = soup.find_all("a", attrs={"title": "Yellow Taxi Trip Records"})
    if not os.path.exists("./yellow_taxi"):
        os.makedirs("./yellow_taxi")
    
    for record in yellow_records:
        pattern = r'yellow_tripdata_(2009|201[0-5])-\d{2}\.'
        link = record["href"]
        if re.search(pattern, link):
            filename = os.path.join("yellow_taxi", link.split("/")[-1])
            response = requests.get(link)
            
            with open(filename, "wb") as f:
                f.write(response.content)

In [6]:
"""
Comment the line below if no need to download the Yellow Taxi Parquet files.
"""

# download_yellow_taxi_parquet_files()

'\nComment the line below if no need to download the Yellow Taxi Parquet files.\n'

In [7]:
# Get Yellow Taxi Data sample
def generate_yellow_taxi_df():
    """
    Reads Parquet files containing Yellow Taxi trip data, selects a subset of columns, and outputs a CSV file with a random sample of 3000 rides.
    
    This function reads Parquet files in the './yellow_taxi' directory, selects a subset of columns from the schema
    (specified in the 'columns_to_select' list), and reads the data into a Pandas DataFrame. A random sample of 3000
    rows is taken from each file, and the resulting DataFrames are concatenated into a single DataFrame. Finally, this
    DataFrame is saved to a CSV file called 'yellow_taxi_ride_sample.csv' in the current working directory.
    
    Raises:
        pyarrow.lib.ArrowInvalid: If there is an error reading the Parquet file schema.
        pyarrow.lib.ArrowIOError: If there is an error reading the Parquet file.
    """   
    
    columns_to_select = ['DOLocationID',
                         'End_Lat',
                         'End_Lon',
                         'PULocationID',
                         'Passenger_Count',
                         'Start_Lat',
                         'Start_Lon',
                         'Total_Amt',
                         'Trip_Distance',
                         'Trip_Dropoff_DateTime',
                         'Trip_Pickup_DateTime',
                         'dropoff_datetime',
                         'dropoff_latitude',
                         'dropoff_longitude',
                         'passenger_count',
                         'pickup_datetime',
                         'pickup_latitude',
                         'pickup_longitude',
                         'total_amount',
                         'tpep_dropoff_datetime',
                         'tpep_pickup_datetime',
                         'trip_distance',
                         'Tip_Amt',
                         'tip_amount']

    directory = "./yellow_taxi"
    yellow_taix_df = pd.DataFrame()
    for filename in os.listdir(directory):
        if filename.endswith(".parquet"):
            existing_columns = set(pq.ParquetFile("./yellow_taxi/"+filename).schema.names)
            columns_to_read = list(set(columns_to_select) & existing_columns)
            table = pq.read_table("./yellow_taxi/"+filename, columns=columns_to_read)
            df = table.to_pandas()
            random_subset = df.sample(n=3000)
            yellow_taix_df = pd.concat([yellow_taix_df, random_subset], ignore_index=True)
    yellow_taix_df.to_csv("yellow_taxi_ride_sample.csv", index = False)

In [8]:
"""
Comment the line below if the Yellow Taxi Data sample already generated.
"""

# generate_yellow_taxi_df()

'\nComment the line below if the Yellow Taxi Data sample already generated.\n'

In [9]:
yellow_taxi_ride_sample = pd.read_csv("yellow_taxi_ride_sample.csv")
yellow_taxi_ride_sample.head()

Unnamed: 0,trip_distance,tip_amount,tpep_dropoff_datetime,PULocationID,total_amount,DOLocationID,passenger_count,tpep_pickup_datetime,Passenger_Count,Tip_Amt,...,Trip_Distance,End_Lat,Trip_Pickup_DateTime,End_Lon,dropoff_latitude,dropoff_datetime,dropoff_longitude,pickup_latitude,pickup_datetime,pickup_longitude
0,1.5,1.5,2011-07-20 06:58:52,170.0,8.1,230.0,1.0,2011-07-20 06:52:08,,,...,,,,,,,,,,
1,0.9,1.26,2011-07-09 00:53:28,48.0,7.56,48.0,2.0,2011-07-09 00:48:14,,,...,,,,,,,,,,
2,1.9,0.0,2011-07-14 10:36:55,140.0,9.0,142.0,2.0,2011-07-14 10:25:30,,,...,,,,,,,,,,
3,1.5,0.0,2011-07-02 14:06:27,43.0,6.6,161.0,1.0,2011-07-02 13:59:48,,,...,,,,,,,,,,
4,12.5,5.0,2011-07-16 21:26:28,138.0,36.1,231.0,3.0,2011-07-16 20:55:22,,,...,,,,,,,,,,


In [10]:
yellow_taxi_ride_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trip_distance          216000 non-null  float64
 1   tip_amount             216000 non-null  float64
 2   tpep_dropoff_datetime  180000 non-null  object 
 3   PULocationID           180000 non-null  float64
 4   total_amount           216000 non-null  float64
 5   DOLocationID           180000 non-null  float64
 6   passenger_count        216000 non-null  float64
 7   tpep_pickup_datetime   180000 non-null  object 
 8   Passenger_Count        36000 non-null   float64
 9   Tip_Amt                36000 non-null   float64
 10  Trip_Dropoff_DateTime  36000 non-null   object 
 11  Start_Lat              36000 non-null   float64
 12  Start_Lon              36000 non-null   float64
 13  Total_Amt              36000 non-null   float64
 14  Trip_Distance          36000 non-nul

### Data Preprocessing - Yellow Taxi Data
#### Combine columns with the same thing BUT have different column names

In [11]:
def convert_to_datetime(df):
    """
    Convert specified columns in a pandas DataFrame to datetime format.

    Args:
        df (pandas.DataFrame): A DataFrame with columns to be converted.

    Returns:
        pandas.DataFrame: A DataFrame with converted datetime columns.

    """
    
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["Trip_Pickup_DateTime"] = pd.to_datetime(df["Trip_Pickup_DateTime"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    df["Trip_Dropoff_DateTime"] = pd.to_datetime(df["Trip_Dropoff_DateTime"])
    df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"])
    return df

In [12]:
yellow_taxi_ride_sample = convert_to_datetime(yellow_taxi_ride_sample)
yellow_taxi_ride_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   trip_distance          216000 non-null  float64       
 1   tip_amount             216000 non-null  float64       
 2   tpep_dropoff_datetime  180000 non-null  datetime64[ns]
 3   PULocationID           180000 non-null  float64       
 4   total_amount           216000 non-null  float64       
 5   DOLocationID           180000 non-null  float64       
 6   passenger_count        216000 non-null  float64       
 7   tpep_pickup_datetime   180000 non-null  datetime64[ns]
 8   Passenger_Count        36000 non-null   float64       
 9   Tip_Amt                36000 non-null   float64       
 10  Trip_Dropoff_DateTime  36000 non-null   datetime64[ns]
 11  Start_Lat              36000 non-null   float64       
 12  Start_Lon              36000 non-null   floa

In [13]:
assert yellow_taxi_ride_sample.shape[0] == 252000
assert yellow_taxi_ride_sample.shape[1] == 24

In [15]:
def impute_pickup_datetime(row):
    """
    Impute missing pickup datetime.

    If `tpep_pickup_datetime` is not NaN, return it.
    Else if `pickup_datetime` is not NaN, return it.
    Else return `Trip_Pickup_DateTime`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A pandas Timestamp object representing the pickup datetime.
    """
    if pd.notna(row['tpep_pickup_datetime']):
        return row['tpep_pickup_datetime']
    elif pd.notna(row['pickup_datetime']):
        return row['pickup_datetime']
    else:
        return row['Trip_Pickup_DateTime']
    
def impute_dropoff_datetime(row):
    """
    Impute missing dropoff datetime.

    If `tpep_dropoff_datetime` is not NaN, return it.
    Else if `dropoff_datetime` is not NaN, return it.
    Else return `Trip_Dropoff_DateTime`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A pandas Timestamp object representing the dropoff datetime.
    """
    if pd.notna(row['tpep_dropoff_datetime']):
        return row['tpep_dropoff_datetime']
    elif pd.notna(row['dropoff_datetime']):
        return row['dropoff_datetime']
    else:
        return row['Trip_Dropoff_DateTime']
    
def impute_trip_distance(row):
    """
    Impute missing trip distance.

    If `trip_distance` is not NaN, return it.
    Else return `Trip_Distance`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the trip distance.
    """
    if pd.notna(row['trip_distance']):
        return row['trip_distance']
    else:
        return row['Trip_Distance']
    
def impute_total_amount(row):
    """
    Impute missing total amount.

    If `Total_Amt` is not NaN, return it.
    Else return `total_amount`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the total amount.
    """ 
    if pd.notna(row['Total_Amt']):
        return row['Total_Amt']
    else:
        return row['total_amount']
    
def impute_pickup_lon(row):
    """
    Impute missing pickup longitude.

    If `pickup_longitude` is not NaN, return it.
    Else return `Start_Lon`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the pickup longitude.
    """
    if pd.notna(row['pickup_longitude']):
        return row['pickup_longitude']
    else:
        return row['Start_Lon']
    
def impute_pickup_lat(row):
    """
    Impute missing pickup latitude.

    If `pickup_latitude` is not NaN, return it.
    Else return `Start_Lat`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the pickup latitude.
    """
    if pd.notna(row['pickup_latitude']):
        return row['pickup_latitude']
    else:
        return row['Start_Lat']

def impute_dropoff_lon(row):
    """Impute missing dropoff longitude.

    If `dropoff_longitude` is not NaN, return it.
    Else return `End_Lon`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the dropoff longitude.
    """
    if pd.notna(row['dropoff_longitude']):
        return row['dropoff_longitude']
    else:
        return row['End_Lon']

def impute_dropoff_lat(row):
    """Impute missing dropoff latitude.

    If `dropoff_latitude` is not NaN, return it.
    Else return `End_Lat`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the dropoff latitude.
    """
    if pd.notna(row['dropoff_latitude']):
        return row['dropoff_latitude']
    else:
        return row['End_Lat']
    
def impute_passenger_count(row):
     """Impute missing passenger count.

    If `passenger_count` is not NaN, return it.
    Else return `Passenger_Count`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        An integer representing the number of passengers.
    """
    if pd.notna(row['passenger_count']):
        return row['passenger_count']
    else:
        return row['Passenger_Count']
    
def impute_tip(row):
    """Impute missing tip.

    If `tip_amount` is not NaN, return it.
    Else return `Tip_Amt`.

    Args:
        row: a pandas Series containing information about a taxi ride.

    Returns:
        A float representing the tip amount.
    """
    if pd.notna(row['tip_amount']):
        return row['tip_amount']
    else:
        return row['Tip_Amt']

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 161)

In [None]:
yellow_taxi_ride_sample['pickup_datetime'] = yellow_taxi_ride_sample.apply(impute_pickup_datetime, axis=1)
yellow_taxi_ride_sample['dropoff_datetime'] = yellow_taxi_ride_sample.apply(impute_dropoff_datetime, axis=1)
yellow_taxi_ride_sample['trip_distance'] = yellow_taxi_ride_sample.apply(impute_trip_distance, axis=1)
yellow_taxi_ride_sample['total_amount'] = yellow_taxi_ride_sample.apply(impute_total_amount, axis=1)
yellow_taxi_ride_sample['pickup_longitude'] = yellow_taxi_ride_sample.apply(impute_pickup_lon, axis=1)
yellow_taxi_ride_sample['pickup_latitude'] = yellow_taxi_ride_sample.apply(impute_pickup_lat, axis=1)
yellow_taxi_ride_sample['dropoff_longitude'] = yellow_taxi_ride_sample.apply(impute_dropoff_lon, axis=1)
yellow_taxi_ride_sample['dropoff_latitude'] = yellow_taxi_ride_sample.apply(impute_dropoff_lat, axis=1)
yellow_taxi_ride_sample['passenger_count'] = yellow_taxi_ride_sample.apply(impute_passenger_count, axis=1)
yellow_taxi_ride_sample['passenger_count'] = yellow_taxi_ride_sample['passenger_count'].astype(int)
yellow_taxi_ride_sample['tip_amount'] = yellow_taxi_ride_sample.apply(impute_tip, axis=1)
yellow_taxi_ride_sample = yellow_taxi_ride_sample.drop(columns=['tpep_pickup_datetime', 'Trip_Pickup_DateTime',\
                                                                'tpep_dropoff_datetime', 'Trip_Dropoff_DateTime',\
                                                                'Trip_Distance', 'Total_Amt',\
                                                                'Start_Lon', 'Start_Lat',\
                                                                'End_Lon', 'End_Lat', 'Passenger_Count', 'Tip_Amt'])
yellow_taxi_ride_sample.info()

#### Deal with  `longitude` , `latitude`, and `Location ID` related problems

In [None]:
nan_rows_PUlon = yellow_taxi_ride_sample[yellow_taxi_ride_sample['pickup_longitude'].isnull() & yellow_taxi_ride_sample['PULocationID'].notnull()]
nan_rows_PUlat = yellow_taxi_ride_sample[yellow_taxi_ride_sample['pickup_latitude'].isnull() & yellow_taxi_ride_sample['PULocationID'].notnull()]
nan_rows_DOlon = yellow_taxi_ride_sample[yellow_taxi_ride_sample['dropoff_longitude'].isnull() & yellow_taxi_ride_sample['DOLocationID'].notnull()]
nan_rows_DOlat = yellow_taxi_ride_sample[yellow_taxi_ride_sample['dropoff_latitude'].isnull() & yellow_taxi_ride_sample['DOLocationID'].notnull()]
nan_rows_lon_lat = pd.concat([nan_rows_PUlon, nan_rows_PUlat, nan_rows_DOlon, nan_rows_DOlat], axis=1)
nan_rows_lon_lat = nan_rows_lon_lat.loc[:, ~nan_rows_lon_lat.columns.duplicated()]
nan_rows_lon_lat

In [None]:
rest_rows = yellow_taxi_ride_sample.loc[~yellow_taxi_ride_sample.index.isin(nan_rows_lon_lat.index)]
rows_lon_lat_exist = yellow_taxi_ride_sample[yellow_taxi_ride_sample['pickup_longitude'].notnull() & yellow_taxi_ride_sample['pickup_latitude'].notnull()]
print(f"Check 'rows_lon_lat_exist' and 'nan_rows_lon_lat' forms a partition of 'yellow_taxi_ride_sample': {(rest_rows.copy().drop(['PULocationID', 'DOLocationID'], axis=1)).equals(rows_lon_lat_exist.copy().drop(['PULocationID', 'DOLocationID'], axis=1))}")

In [None]:
gdf_polygons = gpd.read_file('./taxi_zones/taxi_zones.shp')
gdf_polygons.head()

In [None]:
gdf_polygons.info()

In [None]:
def gdf_get_location(df):
    """
    Returns a GeoDataFrame with the centroid coordinates (latitude and longitude) 
    for each pickup and dropoff location in the input GeoDataFrame.

    Args:
        df (GeoDataFrame): a GeoDataFrame with polygon geometries representing 
        pickup and dropoff zones.

    Returns:
        GeoDataFrame: a GeoDataFrame with the same columns as the input GeoDataFrame, 
        plus four additional columns: 'pickup_lon', 'pickup_lat', 'dropoff_lon', 
        and 'dropoff_lat', which represent the centroid coordinates for each pickup 
        and dropoff location in decimal degrees (WGS84).

    """
    df = gdf_polygons.to_crs(4326)
    df['pickup_lon'] = df['geometry'].centroid.x
    df['pickup_lat'] = df['geometry'].centroid.y
    df['dropoff_lon'] = df['geometry'].centroid.x
    df['dropoff_lat'] = df['geometry'].centroid.y
    return df

In [None]:
gdf_polygons = gdf_get_location(gdf_polygons)
gdf_polygons.head()

In [None]:
gdf_polygons.info()

In [None]:
def merge_geo_df(geo_df, df):  
    """
    Merge the missing longitude and latitude values in `df` with the corresponding values in `geo_df`
    using the location IDs in the 'PULocationID' and 'DOLocationID' columns.

    Args:
        geo_df (pandas.DataFrame): A dataframe containing the mapping between location IDs and their
                                   corresponding longitude and latitude values.
        df (pandas.DataFrame): A dataframe containing the trip data with missing longitude and latitude values.

    Returns:
        pandas.DataFrame: A merged dataframe containing the original trip data from `df` with the missing
                           longitude and latitude values filled in using the corresponding values from `geo_df`.
    """
    # Merge nan_rows with geo_df to get the missing longtitude/latitude values
    # We use 'inner' merge to filter the rows with invalid location ID
    merged_nan_rows_PU = pd.merge(df, geo_df[['LocationID', 'pickup_lon', 'pickup_lat']],
                           left_on='PULocationID', right_on='LocationID', how='inner')
    merged_nan_rows = pd.merge(merged_nan_rows_PU, geo_df[['LocationID', 'dropoff_lon', 'dropoff_lat']],
                           left_on='DOLocationID', right_on='LocationID', how='inner')  
    return merged_nan_rows

In [None]:
nan_rows_lon_lat = merge_geo_df(gdf_polygons, nan_rows_lon_lat)
nan_rows_lon_lat['pickup_longitude'] = nan_rows_lon_lat['pickup_lon']
nan_rows_lon_lat['pickup_latitude'] = nan_rows_lon_lat['pickup_lat']
nan_rows_lon_lat['dropoff_longitude'] = nan_rows_lon_lat['dropoff_lon']
nan_rows_lon_lat['dropoff_latitude'] = nan_rows_lon_lat['dropoff_lat']
nan_rows_lon_lat = nan_rows_lon_lat.drop(columns=['pickup_lon', 'pickup_lat', 'dropoff_lon', 'dropoff_lat',\
                                                  'LocationID_x', 'LocationID_y'])
nan_rows_lon_lat.info()

In [None]:
yellow_taxi_ride_sample = pd.concat([nan_rows_lon_lat, rest_rows])
yellow_taxi_ride_sample = yellow_taxi_ride_sample.drop(columns=['PULocationID', 'DOLocationID'])
yellow_taxi_ride_sample

In [None]:
yellow_taxi_ride_sample.info()

### Data Preprocessing - Uber Data

In [None]:
uber_rides_sample.head()

In [None]:
uber_rides_sample.info()

In [None]:
#Check missing values
uber_rides_sample.isnull().sum()

In [None]:
uber_rides_sample['pickup_datetime'] = pd.to_datetime(uber_rides_sample['pickup_datetime'])
uber_rides_sample = uber_rides_sample.drop('key', axis=1) #Since Column 'key' has same values as Column 'pickup_datetime'
uber_rides_sample = uber_rides_sample.dropna() #Since there are very less missing values, we simply drop them
uber_rides_sample.info()

### For Uber and Yellow Taxi data, 
#### remove out of region records

In [None]:
def remove_out_region(df):
    """
    Remove rows from `df` that have pickup or dropoff locations outside the specified region of New York City.

    The region is defined by the latitude and longitude boundaries:
    - Minimum latitude: 40.560445
    - Maximum latitude: 40.908524
    - Minimum longitude: -74.242330
    - Maximum longitude: -73.71704

    Args:
        df (pandas.DataFrame): A dataframe containing the trip data with latitude and longitude values.

    Returns:
        pandas.DataFrame: A filtered dataframe containing the trip data with only the rows that have pickup
                           and dropoff locations within the specified region of New York City.
    """
    df = df[df['pickup_latitude'] >= 40.560445]
    df = df[df['pickup_latitude'] <= 40.908524]
    df = df[df['dropoff_latitude'] >= 40.560445]
    df = df[df['dropoff_latitude'] <= 40.908524]
    
    df = df[df['pickup_longitude'] >= -74.242330]
    df = df[df['pickup_longitude'] <= -73.71704]
    df = df[df['dropoff_longitude'] >= -74.242330]
    df = df[df['dropoff_longitude'] <= -73.71704]
    return df

In [None]:
yellow_taxi_ride_sample = remove_out_region(yellow_taxi_ride_sample)
uber_rides_sample = remove_out_region(remove_out_region(uber_rides_sample))

####  Calculate trip distance between the pickup and dropoff location

In [None]:
def calculate_trip_distance(row):
    """
    Calculate the Euclidean distance (in kilometers) between the pickup location and the dropoff location of a trip.

    The function uses the latitude and longitude coordinates of the pickup and dropoff locations to calculate
    the distance between them. It assumes that the Earth is a perfect sphere with a radius of 6,371 kilometers.

    Args:
        row (pandas.Series): A row of a dataframe containing the latitude and longitude values of the pickup
                             and dropoff locations of a trip.

    Returns:
        float: The distance (in kilometers) between the pickup location and the dropoff location of the trip.
    """
    pickup_latitude = row['pickup_latitude']
    pickup_longitude = row['pickup_longitude']
    dropoff_latitude = row['dropoff_latitude']
    dropoff_longitude = row['dropoff_longitude']
    
    lat1, lon1, lat2, lon2 = map(math.radians, [pickup_latitude, pickup_longitude,  dropoff_latitude, dropoff_longitude])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    dist = math.sqrt(dlat**2 + dlon**2)
    
    R = 6371
    distance = dist * R
    return distance
    

In [None]:
yellow_taxi_ride_sample['distance_PD'] = yellow_taxi_ride_sample.apply(calculate_trip_distance, axis=1)
uber_rides_sample['distance_PD'] = uber_rides_sample.apply(calculate_trip_distance, axis=1)

#### Filter out zero distance

In [None]:
def remove_zero_distance(df):
    """
    Remove trips from a dataframe where the distance between the pickup and dropoff locations is zero.

    The function takes a dataframe containing information about taxi trips, including the distance between
    the pickup and dropoff locations of each trip. It removes all rows where the distance is zero, as these
    trips are likely to be erroneous or incomplete.

    Args:
        df (pandas.DataFrame): A dataframe containing information about taxi trips, including the distance
                               between the pickup and dropoff locations of each trip.

    Returns:
        pandas.DataFrame: A dataframe with the same columns as the input dataframe, but with all rows where
                           the distance between the pickup and dropoff locations is zero removed.
    """
    condition = df['distance_PD'] == 0
    df = df.drop(df[condition].index)
    return df

In [None]:
yellow_taxi_ride_sample = remove_zero_distance(yellow_taxi_ride_sample)
uber_rides_sample = remove_zero_distance(uber_rides_sample)

### Cleaned Yellow Taxi Data sample and Uber Data sample

In [None]:
yellow_taxi_ride_sample

In [None]:
uber_rides_sample

## Weather Data

In [None]:
path = './weather/'
all_files = glob.glob(path + "*.csv")
data_frames = []
for filename in all_files:
    df = pd.read_csv(filename)
    data_frames.append(df)
weather_data = pd.concat(data_frames, axis=0, ignore_index=True)

In [None]:
weather_data.head()

### Hourly Weather Dataset

In [None]:
weather_data_hourly = weather_data.iloc[:, 1:4].join(weather_data[['HourlyPrecipitation', 'HourlyWindSpeed']])
weather_data_hourly.head()

In [None]:
weather_data_hourly.info()

In [None]:
#Check missing values
weather_data_hourly.isnull().sum()

In [None]:
print("Unique values in Latitude: ",weather_data_hourly['LATITUDE'].unique())
print("Unique values in Longtitude: ",weather_data_hourly['LONGITUDE'].unique())

In [None]:
#Since there is only one unique longtitude and latitude, which means that all the weather data are from the same place
#We drop the two columns "LATITUDE", "LONGTITUDE"

weather_data_hourly = weather_data_hourly.drop(columns=['LATITUDE', 'LONGITUDE'])

#### Fill in missing values

In [None]:
# Fills missing values in 'HourlyPrecipitation' with 0
weather_data_hourly['HourlyPrecipitation'].fillna('0.0', inplace=True)

# Fills missing values in 'HourlyWindSpeed' with the last known value in the same column
weather_data_hourly['HourlyWindSpeed'] = weather_data_hourly['HourlyWindSpeed'].fillna(method='ffill')

#Change 'DATE' column to Datetime type
weather_data_hourly['DATE'] = pd.to_datetime(weather_data_hourly['DATE'])

#### In `HourlyPrecipitation` change all value `T`  to be `0.00001`

In [None]:
weather_data_hourly['HourlyPrecipitation'] = weather_data_hourly['HourlyPrecipitation'].str.replace('s', '')
weather_data_hourly['HourlyPrecipitation'] = weather_data_hourly['HourlyPrecipitation'].replace('T', '0.00001')
weather_data_hourly['HourlyPrecipitation'] = weather_data_hourly['HourlyPrecipitation'].astype(float)

### Cleaned Hourly Weather Dataset

In [None]:
weather_data_hourly

In [None]:
weather_data_hourly.info()

### Daily Weather Dataset

In [None]:
weather_data_daily = weather_data[['DATE','Sunrise','Sunset','DailyAverageWindSpeed','DailyPeakWindSpeed',
                                   'DailySustainedWindSpeed','DailyPrecipitation']]
weather_data_daily.head()

In [None]:
weather_data_daily.info()

In [None]:
#Group by 'DATE' (daily)
date_format = '%Y-%m-%d'
weather_data_daily['DATE'] = pd.to_datetime(weather_data_daily['DATE'])
weather_data_daily['DATE'] = pd.to_datetime(weather_data_daily['DATE'], format=date_format).dt.date
weather_data_daily['DATE'] = pd.to_datetime(weather_data_daily['DATE'])

def filter_rows(group):
    """
    Filter rows of a group based on missing values.

    The function takes a group of rows from a larger dataframe and filters out any rows where all the columns
    except the first one are missing (i.e., contain NaN values). If all the rows in the group have this property,
    the function returns the first row of the group.

    Args:
        group (pandas.DataFrame): A dataframe representing a group of rows from a larger dataframe.

    Returns:
        pandas.DataFrame: A dataframe with the same columns as the input dataframe, but with any rows where
                           all the columns except the first one are missing removed. If all the rows in the
                           group have this property, the function returns the first row of the group.
    """
    is_all_nan = group.iloc[:, 1:].isna().all(axis=1)
    if is_all_nan.all():
        return group.head(1)
    return group.dropna(subset=group.columns[1:], how='all')

date_group = weather_data_daily.groupby('DATE')
weather_data_daily = date_group.apply(filter_rows)
weather_data_daily['DATE'] = weather_data_daily['DATE'].dt.strftime('%Y-%m-%d')
weather_data_daily = weather_data_daily.reset_index(drop=True)
weather_data_daily

In [None]:
weather_data_daily.info()

#### Fill in the missing values according the Hourly Weather Data

In [None]:
weather_data_hourly_copy = weather_data_hourly.copy()
weather_data_hourly_copy['DATE'] = pd.to_datetime(weather_data_hourly_copy['DATE'], format=date_format).dt.date
weather_data_hourly_copy['DATE'] = pd.to_datetime(weather_data_hourly_copy['DATE'])

daily_avg_windspeed = weather_data_hourly_copy.groupby('DATE')['HourlyWindSpeed'].mean()
daily_peak_windspeed = weather_data_hourly_copy.groupby('DATE')['HourlyWindSpeed'].max()
daily_sustained_windspeed = weather_data_hourly_copy.groupby('DATE')['HourlyWindSpeed'].apply(lambda x: x.value_counts().idxmax())
daily_precipitation = weather_data_hourly_copy.groupby('DATE')['HourlyPrecipitation'].sum()

weather_data_daily['DailyAverageWindSpeed'] = weather_data_daily.apply(lambda row: 
                                                row['DailyAverageWindSpeed'] 
                                                if pd.notna(row['DailyAverageWindSpeed']) 
                                                else daily_avg_windspeed[row['DATE']], 
                                                axis=1)
weather_data_daily['DailyPeakWindSpeed'] = weather_data_daily.apply(lambda row: 
                                                row['DailyPeakWindSpeed'] 
                                                if pd.notna(row['DailyPeakWindSpeed']) 
                                                else daily_peak_windspeed[row['DATE']], 
                                                axis=1)
weather_data_daily['DailySustainedWindSpeed'] = weather_data_daily.apply(lambda row: 
                                                row['DailySustainedWindSpeed'] 
                                                if pd.notna(row['DailySustainedWindSpeed']) 
                                                else daily_sustained_windspeed[row['DATE']], 
                                                axis=1)
weather_data_daily['DailyPrecipitation'] = weather_data_daily.apply(lambda row: 
                                                row['DailyPrecipitation'] 
                                                if pd.notna(row['DailyPrecipitation']) 
                                                else daily_precipitation[row['DATE']], 
                                                axis=1)

weather_data_daily['DATE'] = pd.to_datetime(weather_data_daily['DATE'])


In [None]:
weather_data_daily.info()

#### Change all value `T` in `DailyPrecipitation` to be `0.00001`

In [None]:
weather_data_daily['DailyPrecipitation'] = weather_data_daily['DailyPrecipitation'].replace('T', '0.00001')
weather_data_daily['DailyPrecipitation'] = weather_data_daily['DailyPrecipitation'].astype(float)

### Cleaned Daily Weather Datasets

In [None]:
sunrise_sunset = weather_data_daily.copy().dropna()
sunrise_sunset

In [None]:
sunrise_sunset.info()

In [None]:
weather_data_hourly.info()

In [None]:
weather_data_daily = weather_data_daily.drop(columns=['Sunrise','Sunset'])


In [None]:
weather_data_daily.info()

# Part 2: Storing Data

In [None]:
engine = create_engine('sqlite:///project.db', echo=True)

# COMMAND = ["""
# DROP TABLE IF EXISTS yellow_taxi_ride""",
# """
# DROP TABLE IF EXISTS hourly_weather_data""",
# """
# DROP TABLE IF EXISTS daily_weather_data""",
# """
# DROP TABLE IF EXISTS uber_rides""",
# """
# DROP TABLE IF EXISTS sunrise_sunset_data"""
# ]

# for i in COMMAND:
#     engine.execute(i)

### Five tables (Including Sunrise_Sunset)

In [None]:
connection = sqlite3.connect("project.db")

#Yellow Taxi trips
with connection:
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS yellow_taxi_ride (
            id INTEGER PRIMARY KEY,
            trip_distance REAL,
            passenger_count INTEGER,
            total_amount REAL,
            dropoff_datetime DATETIME,
            pickup_longitude REAL,
            pickup_latitude REAL,
            dropoff_latitude REAL,
            dropoff_longitude REAL,
            pickup_datetime DATETIME,
            distance_PD REAL,
            tip_amount REAL
        );
        """
    )

#Uber trips
with connection:
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS uber_rides (
            id INTEGER PRIMARY KEY,
            fare_amount REAL,
            pickup_datetime DATETIME,
            pickup_longitude REAL,
            pickup_latitude REAL,
            dropoff_longitude REAL,
            dropoff_latitude REAL,
            passenger_count INTEGER,
            distance_PD REAL
        );
        """
    )
    
#Hourly Weather
with connection:
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS hourly_weather_data (
            id INTEGER PRIMARY KEY,
            DATE DATETIME,
            HourlyPrecipitation REAL,
            HourlyWindSpeed REAL
        );
        """
    )
    
#Daily Weather
with connection:
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS daily_weather_data (
            id INTEGER PRIMARY KEY,
            DATE DATETIME,
            DailyAverageWindSpeed REAL,
            DailyPeakWindSpeed REAL,
            DailySustainedWindSpeed REAL,
            DailyPrecipitation REAL
        );
        """
    )
    
# Sunrise Sunset
with connection:
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS sunrise_sunset_data (
            id INTEGER PRIMARY KEY,
            DATE DATETIME,
            Sunrise  REAL,
            Sunset REAL,
            DailyAverageWindSpeed REAL,
            DailyPeakWindSpeed REAL，
            DailySustainedWindSpeed REAL,
            DailyPrecipitation REAL
        );
        """
    )

In [None]:
yellow_taxi_df = yellow_taxi_ride_sample.copy()
yellow_taxi_df.reset_index(inplace=True)
yellow_taxi_df.rename(columns={'index': 'id'}, inplace=True)

uber_df = uber_rides_sample.copy()
uber_df.reset_index(inplace=True)
uber_df.rename(columns={'index': 'id'}, inplace=True)

hourly_weather_df = weather_data_hourly.copy()
hourly_weather_df.reset_index(inplace=True)
hourly_weather_df.rename(columns={'index': 'id'}, inplace=True)

daily_weather_df = weather_data_daily.copy()
daily_weather_df.reset_index(inplace=True)
daily_weather_df.rename(columns={'index': 'id'}, inplace=True)

sunrise_set_df = sunrise_sunset.copy()
sunrise_set_df.reset_index(inplace=True)
sunrise_set_df.rename(columns={'index': 'id'}, inplace=True)

with engine.connect() as conn:
    yellow_taxi_df.to_sql('yellow_taxi_ride', conn, if_exists='replace', index=False)
    uber_df.to_sql('uber_rides', conn, if_exists='replace', index=False)
    hourly_weather_df.to_sql('hourly_weather_data', conn, if_exists='replace', index=False)
    daily_weather_df.to_sql('daily_weather_data', conn, if_exists='replace', index=False)
    sunrise_set_df.to_sql('sunrise_sunset_data', conn, if_exists='replace', index=False)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute("SELECT name, sql FROM sqlite_master WHERE type='table'")
tables = cur.fetchall()

with open('schema.sql', 'w') as f:
    for table in tables:
        f.write(f'{table[1]};\n')
        
cur.close()
conn.close()

# Part 3: Understanding Data

## Query 1

In [None]:
query1 = """
SELECT 
    strftime('%H', pickup_datetime) AS hour_of_day,
    COUNT(*) AS ride_count
FROM 
    yellow_taxi_ride 
WHERE 
    pickup_datetime BETWEEN '2009-01-01' AND '2015-06-30'
GROUP BY 
    hour_of_day 
ORDER BY 
    ride_count DESC;
"""

In [None]:
with open("popularity_yellow_taxi_01_2009_06-2015.sql", "w") as f:
    f.write(query1)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query1)
results1 = cur.fetchall()

for row in results1:
    print(row)

conn.close()

## Query 2

In [None]:
query2 = """
SELECT 
    strftime('%w', pickup_datetime) AS day_of_week,
    COUNT(*) AS ride_count
FROM 
    uber_rides
WHERE 
    pickup_datetime BETWEEN '2009-01-01' AND '2015-06-30'
GROUP BY 
    day_of_week 
ORDER BY 
    ride_count DESC;
"""

In [None]:
with open("popularity_Uber_rides_week_days.sql", "w") as f:
    f.write(query2)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query2)
results2 = cur.fetchall()

for row in results2:
    print(row)

conn.close()

## Query 3

In [None]:
query3 = """
SELECT CAST((SELECT distance_PD
             FROM (
               SELECT distance_PD, NTILE(100) OVER (ORDER BY distance_PD) AS percentile
               FROM (
                 SELECT distance_PD FROM yellow_taxi_ride WHERE pickup_datetime BETWEEN '2013-07-01' AND '2013-07-31'
                 UNION ALL
                 SELECT distance_PD FROM uber_rides WHERE pickup_datetime BETWEEN '2013-07-01' AND '2013-07-31'
               ) AS hired_trips
             )
             WHERE percentile = 95
             LIMIT 1) AS REAL) AS percentile_95;
"""

In [None]:
with open("hired_trips_distance_95_percentile_07_2013.sql", "w") as f:
    f.write(query3)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query3)
results3 = cur.fetchall()

for row in results3:
    print(row)

conn.close()

## Query 4

In [None]:
query4 = """
SELECT
    date(date_time) AS date,
    SUM(num_rides) AS num_rides,
    AVG(avg_distance) AS avg_distance
FROM
    (
    SELECT
        dropoff_datetime AS date_time,
        1 AS num_rides,
        trip_distance AS avg_distance
    FROM
        yellow_taxi_ride
    WHERE
        strftime('%Y-%m', dropoff_datetime) BETWEEN '2009-01' AND '2009-06'
    
    UNION ALL
    
    SELECT
        pickup_datetime AS date_time,
        1 AS num_rides,
        distance_PD AS avg_distance
    FROM
        uber_rides
    WHERE
        strftime('%Y-%m', pickup_datetime) BETWEEN '2009-01' AND '2009-06'
    ) AS hired_rides
GROUP BY
    date
ORDER BY
    num_rides DESC
LIMIT
    10;
"""

In [None]:
with open("2009_top_10_hired_rides_average_distance.sql", "w") as f:
    f.write(query4)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query4)
results4 = cur.fetchall()

for row in results4:
    print(row)

conn.close()

## Query 5

In [None]:
query5 = """
SELECT date, avg_wind_speed, SUM(hired_trips + total_rides) AS total_trips
FROM (
    SELECT d.date, d.DailyAverageWindSpeed AS avg_wind_speed, COUNT(y.id) AS hired_trips, 0 AS total_rides
    FROM daily_weather_data d
    JOIN uber_rides y ON DATE(d.date) = DATE(y.pickup_datetime)
    WHERE strftime('%Y', d.date) = '2014'
    GROUP BY d.date

    UNION ALL

    SELECT d.date, d.DailyAverageWindSpeed AS avg_wind_speed, 0 AS hired_trips, COUNT(y.id) AS total_rides
    FROM daily_weather_data d
    JOIN yellow_taxi_ride y ON DATE(d.date) = DATE(y.pickup_datetime)
    WHERE strftime('%Y', d.date) = '2014'
    GROUP BY d.date
)
GROUP BY date
ORDER BY avg_wind_speed DESC
LIMIT 10;
"""

In [None]:
with open("windest_10_days_2014_hired_trips.sql", "w") as f:
    f.write(query5)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query5)
results5 = cur.fetchall()

for row in results5:
    print(row)

conn.close()

## Query 6

In [None]:
query6 = """
WITH total_hired_trips AS 
(
    SELECT strftime('%Y-%m-%d %H:00:00', pickup_datetime) AS date_hour_time FROM uber_rides
    WHERE strftime('%Y-%m-%d', pickup_datetime) BETWEEN '2012-10-22' AND '2012-11-07'
    
    UNION ALL
    
    SELECT strftime('%Y-%m-%d %H:00:00', pickup_datetime) AS date_hour_time FROM yellow_taxi_ride
    WHERE strftime('%Y-%m-%d', pickup_datetime) BETWEEN '2012-10-22' AND '2012-11-07'
)

SELECT strftime('%Y-%m-%d %H:00:00', hourly_weather_data.DATE) AS weather_date_hour, COUNT(*) AS number_hired_rides, SUM(HourlyPrecipitation) AS total_precipitation, AVG(HourlyWindSpeed) AS average_wind_speed
FROM hourly_weather_data
JOIN total_hired_trips ON weather_date_hour = total_hired_trips.date_hour_time
WHERE strftime('%Y-%m-%d', hourly_weather_data.DATE) BETWEEN '2012-10-22' AND '2012-11-07'
GROUP BY weather_date_hour
ORDER BY weather_date_hour
"""

In [None]:
with open("hurricane_trips_precipitation_wind_speed_hourly.sql", "w") as f:
    f.write(query6)

In [None]:
conn = sqlite3.connect('project.db')
cur = conn.cursor()
cur.execute(query6)
results6 = cur.fetchall()

for row in results6:
    print(row)

conn.close()

# Part 4: Visualizing Data

### Plot 1:

In [None]:
x = [int(row[0]) for row in results1]
y = [row[1] for row in results1]

fig, ax = plt.subplots()
ax.bar(x, y)

ax.set_xlabel('Hour of the day')
ax.set_ylabel('Number of rides')
ax.set_title('Popularity of Yellow Taxi rides (2009-01 to 2015-06)')

plt.show()

### Plot 2: the average distance traveled per month

In [None]:
yellow_taxi_rides_distance = yellow_taxi_ride_sample[['pickup_datetime','distance_PD']]
uber_rides_distance = uber_rides_sample[['pickup_datetime','distance_PD']]
uber_rides_distance.loc[:, 'pickup_datetime'] = uber_rides_distance.loc[:, 'pickup_datetime'].dt.tz_localize(None)
rides_distance = pd.concat([yellow_taxi_rides_distance, uber_rides_distance], ignore_index=True)
rides_distance.sort_values('pickup_datetime')

In [None]:
rides_distance['pickup_datetime'] = pd.to_datetime(rides_distance['pickup_datetime'])
rides_distance['pickup_datetime'] = rides_distance['pickup_datetime'].apply(lambda x: x.strftime('%m'))

# Group by month and calculate mean and standard error of mean (sem)
rides_distance_monthly = rides_distance.groupby('pickup_datetime')['distance_PD'].agg(['mean', 'sem'])

# Calculate 90% confidence interval
rides_distance_monthly['lower'] = rides_distance_monthly['mean'] - 1.645*rides_distance_monthly['sem']
rides_distance_monthly['upper'] = rides_distance_monthly['mean'] + 1.645*rides_distance_monthly['sem']

fig, ax = plt.subplots(figsize=(12,6))
ax.errorbar(x=rides_distance_monthly.index, y=rides_distance_monthly['mean'], yerr=rides_distance_monthly['sem'], label='Mean distance', fmt='o')
ax.fill_between(rides_distance_monthly.index, rides_distance_monthly['lower'], rides_distance_monthly['upper'], alpha=0.2, label='90% CI')
ax.set_xlabel('Month')
ax.set_ylabel('Distance')
ax.set_title('Average distance traveled per month (2009 to 2015)')
ax.legend()
plt.show()


### Plot 3: drop offs at three major New York airports: LGA, JFK, and EWR

In [None]:
yellow_taxi_dropoffs = yellow_taxi_ride_sample[['pickup_datetime','dropoff_longitude','dropoff_latitude']]
uber_dropoffs = uber_rides_sample[['pickup_datetime','dropoff_longitude','dropoff_latitude']]
uber_dropoffs.loc[:, 'pickup_datetime'] = uber_dropoffs.loc[:, 'pickup_datetime'].dt.tz_localize(None)
rides_dropoffs = pd.concat([yellow_taxi_dropoffs, uber_dropoffs], ignore_index=True)
rides_dropoffs.sort_values('pickup_datetime')

In [None]:
# [latitude, longtitude] coordinates
LGA_coords = [40.7769, -73.8740]
JFK_coords = [40.6413, -73.7781]
EWR_coords = [40.6895, -74.1745]

# Load the dataset and filter by drop offs at each airport
LGA_df = rides_dropoffs[(rides_dropoffs.dropoff_latitude >= LGA_coords[0]-0.02) & (rides_dropoffs.dropoff_latitude <= LGA_coords[0]+0.02) &
                        (rides_dropoffs.dropoff_longitude >= LGA_coords[1]-0.02) & (rides_dropoffs.dropoff_longitude <= LGA_coords[1]+0.02)]
JFK_df = rides_dropoffs[(rides_dropoffs.dropoff_latitude >= JFK_coords[0]-0.02) & (rides_dropoffs.dropoff_latitude <= JFK_coords[0]+0.02) &
                        (rides_dropoffs.dropoff_longitude >= JFK_coords[1]-0.02) & (rides_dropoffs.dropoff_longitude <= JFK_coords[1]+0.02)]
EWR_df = rides_dropoffs[(rides_dropoffs.dropoff_latitude >= EWR_coords[0]-0.02) & (rides_dropoffs.dropoff_latitude <= EWR_coords[0]+0.02) &
                        (rides_dropoffs.dropoff_longitude >= EWR_coords[1]-0.02) & (rides_dropoffs.dropoff_longitude <= EWR_coords[1]+0.02)]

# Group the data by day of the week and count the number of drop offs
LGA_day_counts = LGA_df.groupby(LGA_df['pickup_datetime'].dt.dayofweek).size()
JFK_day_counts = JFK_df.groupby(JFK_df['pickup_datetime'].dt.dayofweek).size()
EWR_day_counts = EWR_df.groupby(EWR_df['pickup_datetime'].dt.dayofweek).size()

fig, ax = plt.subplots(figsize=(10,6))
bar_width = 0.25
ax.bar(LGA_day_counts.index - bar_width, LGA_day_counts.values, width=bar_width, label='LGA')
ax.bar(JFK_day_counts.index, JFK_day_counts.values, width=bar_width, label='JFK')
ax.bar(EWR_day_counts.index + bar_width, EWR_day_counts.values, width=bar_width, label='EWR')
ax.set_xticks(range(7))
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
ax.set_xlabel('Day of the week')
ax.set_ylabel('Number of drop offs')
ax.set_title('Drop offs at airports(LGA, JFK, EWR) vs. Day of Week (2009 to 2015)')
ax.legend()
ax.grid(axis="y",linestyle='--')
plt.show()

### Plot 4: heatmap of all hired trips

In [None]:
yellow_taxi_rides_hired = yellow_taxi_ride_sample[['pickup_longitude','pickup_latitude']]
uber_rides_hired = uber_rides_sample[['pickup_longitude','pickup_latitude']]
rides_hired = pd.concat([yellow_taxi_rides_hired, uber_rides_hired], ignore_index=True)
rides_hired

In [None]:
heatmap = folium.Map(location=[rides_hired['pickup_latitude'].mean(), rides_hired['pickup_longitude'].mean()], zoom_start=10)
title_html = '''
             <h3 align="center" style="font-size:20px"><b>Hired Trips Heatmap</b></h3>
             '''
heatmap.get_root().html.add_child(folium.Element(title_html))
heatmap.add_child(folium.plugins.HeatMap(rides_hired[['pickup_latitude', 'pickup_longitude']].values.tolist(), name='Heatmap', control=False))
folium.LayerControl().add_to(heatmap)
heatmap

### Plot 5: `tip amount` vs. `distance` for Yellow Taxi rides

In [None]:
tip_distance = yellow_taxi_ride_sample[(yellow_taxi_ride_sample["tip_amount"] >= 0) & 
                                       (yellow_taxi_ride_sample["tip_amount"] < 30) & 
                                       (yellow_taxi_ride_sample["distance_PD"] >= 0) & 
                                       (yellow_taxi_ride_sample["distance_PD"] <= 35)]

plt.scatter(tip_distance["distance_PD"], tip_distance["tip_amount"])
plt.xlabel("Straight-line Distance")
plt.ylabel("Tip Amount")
plt.title("Tip Amount vs Straight-line Distance for Yellow Taxi Rides")
plt.show()

In [None]:
tip_distance = yellow_taxi_ride_sample[(yellow_taxi_ride_sample["tip_amount"] >= 0) & 
                                       (yellow_taxi_ride_sample["tip_amount"] < 30) & 
                                       (yellow_taxi_ride_sample["trip_distance"] >= 0) & 
                                       (yellow_taxi_ride_sample["trip_distance"] <= 35)]

plt.scatter(tip_distance["trip_distance"], tip_distance["tip_amount"])
plt.xlabel("Actual Distance")
plt.ylabel("Tip Amount")
plt.title("Tip Amount vs Actual Distance for Yellow Taxi Rides")
plt.show()

### Plot 6: Daily `tip amount` vs. `precipitation`  for Yellow Taxi rides

In [None]:
yellow_taxi_ride_daily = yellow_taxi_ride_sample.copy()
yellow_taxi_ride_daily['DATE'] = pd.to_datetime(yellow_taxi_ride_daily['pickup_datetime'], format=date_format).dt.date
yellow_taxi_ride_daily['DATE'] = pd.to_datetime(yellow_taxi_ride_daily['DATE'])

daily_tip = yellow_taxi_ride_daily.groupby('DATE')['tip_amount'].sum()
tip_weather_daily = pd.merge(weather_data_daily, daily_tip,
                           left_on='DATE', right_on=daily_tip.index, how='inner')
tip_weather_daily['DailyPrecipitation'] = tip_weather_daily['DailyPrecipitation'].replace('T', 0.0).astype(float)
tip_weather_daily

In [None]:
tip_weather = tip_weather_daily[(tip_weather_daily["tip_amount"] >= 0) & 
                                       (tip_weather_daily["tip_amount"] < 300) & 
                                       (tip_weather_daily["DailyPrecipitation"] >= 0) & 
                                       (tip_weather_daily["DailyPrecipitation"] <= 5)]

plt.scatter(tip_weather["DailyPrecipitation"], tip_weather["tip_amount"])
plt.xlabel("Precipitation Amount")
plt.ylabel("Tip Amount")
plt.title("Daily Tip Amount vs Precipitation Amount for Yellow Taxi Rides")
plt.show()

### Plot 7: Animation

#### Question: How does the DailyAverageWindSpeed vary over time

In [None]:
%matplotlib notebook

def animate(i):
    plt.cla()
    data = weather_data_daily.iloc[:i+1]
    plt.plot(data['DATE'], data['DailyAverageWindSpeed'], color='blue')
    plt.xlabel('Date')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Daily Average Wind Speed')
    plt.title('Daily Average Wind Speed Over Time')
    plt.grid(True)
    
ani = FuncAnimation(plt.gcf(), animate, frames=len(weather_data_daily), interval=1000)
plt.show()