# Understanding Hired Rides in NYC

## Project Setup

In [None]:
# install needed libraries
!pip install pyarrow
!pip install fastparquet
!pip install geopandas
!pip install pytest
!pip install keplergl
!pip install bs4

In [1]:
# all import statements needed for the project, for example:
import math
import bs4
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import requests
import sqlalchemy as db
import sqlite3
import numpy as np
import matplotlib.animation as animation
import keplergl
from keplergl import KeplerGl
import statistics
import unittest

In [2]:
# add other constants
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
UBER_CSV = "uber_rides_sample.csv"
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

# Data Preprocessing

## Calculating distance
_**TODO:** Since we do not have trip distance information in our data, we need to calculate the distance from the latitude and longitude information of the pickup and dropoff locations. Here we use functions to complete the calculation and add the resulting trip distances to the dataframe._

In [3]:
# Calculate the distance between the two coordinates
def calculate_distance(from_coord: list, to_coord: list) -> float:
    R = 6373.0
    lat1 = math.radians(from_coord[0])
    lon1 = math.radians(from_coord[1])
    lat2 = math.radians(to_coord[0])
    lon2 = math.radians(to_coord[1])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

In [5]:
# For the dataset that is not given the trip distance, calculate the distance using the given coordinate data and add it to the dataframe
def add_distance_column(dataframe: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    distance = []
    for index, row in dataframe.iterrows():
        distance.append(calculate_distance((row['pickup_latitude'], row['pickup_longitude']), (row['dropoff_latitude'], row['dropoff_longitude'])))
    dataframe['trip_distance'] = distance
    return dataframe

## Processing Uber Data

_**TODO:** Read uber's trip data and process it._

In [None]:
# load uber data and clean the data
def load_and_clean_uber_data(csv_file: str) -> pd.core.frame.DataFrame:
    df = pd.read_csv(csv_file, on_bad_lines='skip')
    df.columns = df.columns.str.lower()
    add_distance_column(df)
    df.drop(df.columns.difference(['pickup_datetime',
                                     'trip_distance', 
                                     'pickup_latitude', 
                                     'pickup_longitude', 
                                     'dropoff_latitude', 
                                     'dropoff_longitude']), 1, inplace=True)

    # remove rows start and/or end outside of the following latitude/longitude coordinate box: 
    # (40.560445, -74.242330) and (40.908524, -73.717047)
    df=df[df["pickup_longitude"] <= -73.717047]  
    df=df[df["pickup_longitude"] >= -74.242330]
    df=df[df["pickup_latitude"] >= 40.560445]
    df=df[df["pickup_latitude"] <= 40.908524]
    df=df[df["dropoff_longitude"] <= -73.717047]
    df=df[df["dropoff_longitude"] >= -74.242330]
    df=df[df["dropoff_latitude"] >= 40.560445]
    df=df[df["dropoff_latitude"] <= 40.908524]

    # remove invalid rows thtat pickup time is 0
    df = df.loc[df["pickup_datetime"] != 0.0]

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

    return df

In [34]:
uber_data = load_and_clean_uber_data(UBER_CSV)
uber_data.to_csv("uber_data") 

  df.drop(df.columns.difference(['pickup_datetime',


## Processing Taxi Data

_**TODO:** Use Beautiful Soup to get the required taxi data links from the New York government data website (https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page), read the data and convert the data into the same format._

In [6]:
def find_taxi_parquet_urls() -> list:
    response = requests.get(TAXI_URL)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    links = soup.find_all(lambda tag:'title' in tag.attrs and tag.attrs['title'] == "Yellow Taxi Trip Records")
    hrefs = [link.get('href') for link in links]
    # Filter the links based on the desired years (2009 to 2015)
    hrefs_filtered = [href for href in hrefs 
                  if any(year in href for year in map(str, range(2009, 2015)))
                  or (any(f"2015-{month:02}" in href for month in range(1, 7)))]
    return hrefs_filtered


In [7]:
# Define a function that converts location to coordinates, and generate a dataframe
def convert_id_to_coord(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    shapefile = gpd.read_file(r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\taxi_zones\taxi_zones.shp")
    # Convert the geometry column in the shapefile into specific coordinates of latitude and longitude
    shapefile = shapefile.to_crs(4326)
    shapefile['latitude'] = shapefile['geometry'].centroid.y
    shapefile['longitude'] = shapefile['geometry'].centroid.x
    
    df = df
    df = df.loc[df["pulocationid"] <= 263]
    df = df.loc[df["pulocationid"] != 0]
    df = df.loc[df["dolocationid"] <= 263]
    df = df.loc[df["dolocationid"] != 0]
    # convert location IDs into longitude and latitude
    PUlongitude = []
    PUlatitude = []
    DOlongitude = []
    DOlatitude = []
    # convert the pickup location IDs into longitude and latitude
    for i in df['pulocationid']:
        PUlatitude.append(shapefile['latitude'][i-1])
        PUlongitude.append(shapefile['longitude'][i-1])
    for i in df['dolocationid']:
        DOlatitude.append(shapefile['latitude'][i-1])
        DOlongitude.append(shapefile['longitude'][i-1])
        
    df['pickup_longitude'] = PUlongitude
    df['pickup_latitude'] = PUlatitude
    df['dropoff_longitude'] = DOlongitude
    df['dropoff_latitude'] = DOlatitude
    # convert the drop off location IDs into longitude and latitude
    
    return df

In [8]:
# Obtain taxi data and clean the data
def get_and_clean_month_taxi_data(url: str) -> pd.core.frame.DataFrame:

    df = pd.read_parquet(url)
    df.columns = df.columns.str.lower()
    df_taxi = pd.DataFrame()

    # keep necessary columns into a new dataframe
    if 'tpep_pickup_datetime' in df.columns:
        df=df.rename(columns = {'tpep_pickup_datetime':'pickup_datetime',
                                'tip_amount' : 'tip_amount'})
        df=convert_id_to_coord(df)
        
    elif 'trip_pickup_datetime' in df.columns:
        df=df.rename(columns = {'trip_pickup_datetime':'pickup_datetime', 
                                'start_lon': 'pickup_longitude',
                                'start_lat': 'pickup_latitude',
                                'end_lon': 'dropoff_longitude',
                                'end_lat': 'dropoff_latitude',
                                'tip_amt' : 'tip_amount'})
        
    df.drop(df.columns.difference(['pickup_datetime',
                                    'trip_distance', 
                                    'pickup_latitude', 
                                    'pickup_longitude', 
                                    'dropoff_latitude', 
                                    'dropoff_longitude',
                                   'tip_amount']), 1, inplace=True)
    
    df=df[df["pickup_longitude"] <= -73.717047]  
    df=df[df["pickup_longitude"] >= -74.242330]
    df=df[df["pickup_latitude"] >= 40.560445]
    df=df[df["pickup_latitude"] <= 40.908524]
    df=df[df["dropoff_longitude"] <= -73.717047]
    df=df[df["dropoff_longitude"] >= -74.242330]
    df=df[df["dropoff_latitude"] >= 40.560445]
    df=df[df["dropoff_latitude"] <= 40.908524]

    df = df.loc[df["pickup_datetime"] != 0.0]
    
    return df

### Sampling Taxi Data

_**TODO:** Sampling the taxi data according to the number of uber trips each month. Because the number of yellow taxi data is so large (over 67 million) that it is much larger than the uber data for each time period. Therefore, a random sampling method was used to select a number of yellow cab data equal to the number of uber data in each month to facilitate the analysis._

In [10]:
# Get the number of uber data per month
def get_number_to_sample() -> int:
    uber = load_and_clean_uber_data(r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\uber_rides_sample.csv")
    uber.index = pd.to_datetime(uber['pickup_datetime'])
    number_each_month = uber.groupby(by=[uber.index.year, uber.index.month]).size()
    number = []
    for i in range(2009,2015):
        for j in range(1,13):
            number.append(number_each_month[i][j])
    for i in range(1,7):
        number.append(number_each_month[2015][i])
    return number

In [24]:
def get_sample_taxi_data() -> pd.core.frame.DataFrame:
    sample_taxi_dataframes = []
    
    all_parquet_urls = find_taxi_parquet_urls()
    number = get_number_to_sample()
    # do the sampling of taxi data according to the number of uber trip each month
    for i in range(0, 78): 
        n = number[i]
        url = all_parquet_urls[i]
        df = get_and_clean_month_taxi_data(url)
        df_sample = df.sample(n)
        sample_taxi_dataframes.append(df_sample)

    taxi_data = pd.concat(sample_taxi_dataframes)
    taxi_data['pickup_datetime'] = pd.to_datetime(taxi_data['pickup_datetime'])
    taxi_data = taxi_data.reset_index(drop = True)
    return taxi_data

In [None]:
taxi_data = get_sample_taxi_data()

In [None]:
taxi_data.to_csv("taxi_sample") 

## Processing Weather Data

_**TODO:** Read and process weather data._

In [12]:
def clean_month_weather_data_hourly(csv_file: str) -> pd.core.frame.DataFrame:
    # read file
    df = pd.read_csv(csv_file)
    #drop unnecessary colums
    df.drop(df.columns.difference(['DATE',
                                   'HourlyPrecipitation', 
                                   'HourlyWindSpeed']), 1, inplace=True)
    df['HourlyPrecipitation'] = df['HourlyPrecipitation'].replace('T', 0.0)
    # drop na values
    df.dropna(subset=['HourlyWindSpeed'], inplace=True)
    # convert "DATE" to datetime type
    df['DATE'] = pd.to_datetime(df['DATE'])
    # convert "HourlyPrecipitation" to float type
    df['HourlyPrecipitation'] = pd.to_numeric(df['HourlyPrecipitation'], errors='coerce')
    # fill in missing values
    df['HourlyPrecipitation'].fillna(0, inplace=True)
    # cast "df" to specified type
    df = df.astype({'HourlyWindSpeed': 'float32', 'HourlyPrecipitation': 'float32'})

    return df

In [13]:
def clean_month_weather_data_daily(csv_file: str) -> pd.core.frame.DataFrame:
    # read file
    df = pd.read_csv(csv_file)
    # Replace data of the string type
    df['HourlyPrecipitation'] = df['HourlyPrecipitation'].replace('T', 0.0)
    # convert "DATE" to datetime type
    df['DATE'] = pd.to_datetime(df['DATE'])
    # convert "HourlyPrecipitation" to numeric type
    df['HourlyPrecipitation'] = pd.to_numeric(df['HourlyPrecipitation'], errors='coerce')
    # convert value of 'na' into 0.0
    df['HourlyPrecipitation'].fillna(0, inplace=True)
    #drop unnecessary colums
    df.drop(df.columns.difference(['DATE',
                                   'HourlyPrecipitation', 
                                   'HourlyWindSpeed']), 1, inplace=True)
    # calculate hourly average as a daily values
    df['DATE'] = df['DATE'].dt.date
    df = df.groupby('DATE', as_index=False).agg({'HourlyWindSpeed': np.mean, 'HourlyPrecipitation': np.mean})
    df['HourlyWindSpeed'] = df['HourlyWindSpeed'].map(lambda x: round(x, 2))
    # remame columns
    df.rename(columns={'HourlyWindSpeed': 'DailyAverageWindSpeed', 'HourlyPrecipitation': 'DailyPrecipitation'}, inplace=True)
    df = df.astype({'DailyAverageWindSpeed':'float32', 'DailyPrecipitation':'float32', 'DATE' : 'datetime64[ns]'})

    return df

In [20]:
def clean_sunset_sunrise_daily(csv_file: str) -> pd.core.frame.DataFrame:
    df = pd.read_csv(csv_file)
    df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
    df = df.dropna()
    df['DATE'] = pd.to_datetime(df['DATE'])
    df = df.astype({'Sunrise': 'int32', 'Sunset': 'int32', 'DATE':'datetime64[ns]' })
    return df

In [25]:
def load_and_clean_weather_data() -> pd.core.frame.DataFrame:
    hourly_dataframes = []
    daily_dataframes = []
    
    # add some way to find all weather CSV files
    # or just add the name/paths manually
    weather_csv_files = [
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2009_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2010_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2011_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2012_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2013_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2014_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2015_weather.csv"
        ]
    
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [16]:
def load_and_clean_sunrise_sunset_data() -> pd.core.frame.DataFrame:
    sunrise_sunset_dataframes =[]
    
    weather_csv_files = [
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2009_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2010_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2011_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2012_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2013_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2014_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\Projects-portfolio\Uber_and_Yellow_Taxi\2015_weather.csv"
        ]
    
    for csv_file in weather_csv_files:
        sunrise_sunset_dataframe = clean_sunset_sunrise_daily(csv_file)
        sunrise_sunset_dataframes.append(sunrise_sunset_dataframe)
        
    sunrise_sunset_data = pd.concat(sunrise_sunset_dataframes)
    sunrise_sunset_data['DATE'] = pd.to_datetime(sunrise_sunset_data['DATE'])
    sunrise_sunset_data = sunrise_sunset_data.astype({'Sunrise': 'int32', 'Sunset': 'int32'})
    
    return sunrise_sunset_data

In [21]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()
sunrise_sunset_data = load_and_clean_sunrise_sunset_data()

  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(c

In [27]:
hourly_weather_data

Unnamed: 0,DATE,HourlyPrecipitation,HourlyWindSpeed
0,2009-01-01 00:51:00,0.0,18.0
1,2009-01-01 01:51:00,0.0,18.0
2,2009-01-01 02:51:00,0.0,18.0
3,2009-01-01 03:51:00,0.0,8.0
4,2009-01-01 04:51:00,0.0,11.0
...,...,...,...
11379,2015-12-31 18:51:00,0.0,3.0
11380,2015-12-31 19:51:00,0.0,6.0
11381,2015-12-31 20:51:00,0.0,10.0
11383,2015-12-31 22:51:00,0.0,7.0


In [32]:
hourly_weather_data.to_csv("hourly_weather_data") 
daily_weather_data.to_csv("daily_weather_data") 
sunrise_sunset_data.to_csv("sunrise_sunset_data") 