# IEOR 4501 Final Project: Understanding Hired Rides in NYC

#### Contributors: Joy Ren(jr4154), Yiwen Qian(yq2346)

## Project Setup

In [1]:
# all import statements needed for the project

import math
from math import sin, cos, sqrt, atan2, radians
import os
import warnings
import re
import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import geopandas as gpd
warnings.filterwarnings('ignore')

In [2]:
# any constants need; some have been added for you, and 
# some you need to fill in

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_DATA = "uber_rides_sample.csv"
WEATHER_CSV_DIR = "weather"

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [3]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing
In Part 1, we downloaded Parquet files, cleaning and filtering for the relevant data, filling in missing data, and generating samples of these raw datasets.
Our processing stages can be illustrated as:
* 1. Load the Taxi Zones by x and y coordinates.
* 2. Calculate distance using latitude and longtitude.
* 3. Use `bs4` and `requests` module to parse html, then we can process Yellow Taxi data of NYC.
* 4. Processing Uber Data.
* 5. Processing Weather Data.
* 6. Save the cleaned data for further analyis. 

### Load Taxi Zones

In [4]:
def load_taxi_zones(shapefile):
    return gpd.read_file(shapefile)     #read shapefile

In [5]:
load_taxi_zones(TAXI_ZONES_SHAPEFILE)

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."
...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((1025414.782 270986.139, 1025138.624 ..."
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((1011466.966 216463.005, 1011545.889 ..."
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((980555.204 196138.486, 980570.792 19..."
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((999804.795 224498.527, 999824...."


In [6]:
def lookup_coords_for_taxi_zone_id(zone_loc_id):
    zones= load_taxi_zones(TAXI_ZONES_SHAPEFILE)
    centroid= zones[zones['LocationID'] == zone_loc_id].centroid.values[0]
    return (centroid.y, centroid.x)        # return the centroid coordinates

### Calculate distance

In [7]:
def calculate_distance_with_coords(from_coord, to_coord):
    ## "from_coord": columns 'pickup_latitude' and 'pickup_longitude' from datarame;
    ## "to_coord":  columns 'dropoff_latitude' and 'dropoff_longitude' from datarame;
    
    R = 6373.0  # approximate radius of earth in km    
    lat1, lon1 = from_coord
    lat2, lon2 = to_coord

    dlat = radians(lat2-lat1)
    dlon = radians(lon2-lon1)
    
    ## Calculate the distance between two coordinates in kilometers using the Haversine formula
    a = sin(dlat/2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * (sin(dlon/2) ** 2 )
    coord_distance = 2 * R * atan2(math.sqrt(a), sqrt(1-a))
    
    return coord_distance

In [8]:
def calculate_distance_with_zones(from_zone, to_zone):
    ##Calculate the distance between two taxi zones in kilometers using the centroid of each zone
    
    from_zone_coord = lookup_coords_for_taxi_zone_id(from_zone)
    to_zone_coord = lookup_coords_for_taxi_zone_id(to_zone)   # get two coordinates from two zone ids
    
    return calculate_distance_with_coords(from_zone_coord, to_zone_coord)

In [9]:
def add_distance_column(dataframe):               
    distance=[]
    for index, row in dataframe.iterrows():
        from_coord=(row['pickup_latitude'],row['pickup_longitude'])
        to_coord=(row['dropoff_latitude'],row['dropoff_longitude'])
        distance.append(calculate_distance_with_coords(from_coord,to_coord))  
    dataframe= pd.concat([dataframe,pd.DataFrame(distance,columns=['distance'])], axis=1)
    return dataframe

In [10]:
aa=pd.read_parquet("yellow taxi/yellow_tripdata_2014-02.parquet")
aa

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2014-02-01 00:30:05,2014-02-01 00:30:17,1,0.00,1,N,140,140,2,2.5,0.5,0.5,0.0,0.0,0.0,3.5,,
1,1,2014-02-01 00:16:52,2014-02-01 00:18:22,1,0.00,1,N,186,186,2,3.0,0.5,0.5,0.0,0.0,0.0,4.0,,
2,1,2014-02-01 00:05:03,2014-02-01 00:15:44,1,2.00,1,N,107,249,1,9.5,0.5,0.5,2.1,0.0,0.0,12.6,,
3,1,2014-02-01 00:17:19,2014-02-01 00:20:15,1,0.70,1,N,158,90,1,4.5,0.5,0.5,1.5,0.0,0.0,7.0,,
4,1,2014-02-01 00:24:08,2014-02-01 00:37:07,2,2.40,1,N,234,246,1,11.5,0.5,0.5,2.5,0.0,0.0,15.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13063789,2,2014-02-28 23:15:00,2014-02-28 23:38:00,1,5.67,1,,231,61,1,21.0,0.5,0.5,4.0,0.0,0.0,26.0,,
13063790,2,2014-02-28 23:53:00,2014-03-01 00:12:00,1,1.96,1,,225,177,2,14.0,0.5,0.5,0.0,0.0,0.0,15.0,,
13063791,2,2014-02-28 23:12:00,2014-02-28 23:29:00,1,4.06,1,,181,71,1,16.0,0.5,0.5,0.0,0.0,0.0,17.0,,
13063792,1,2014-02-28 23:20:56,2014-02-28 23:55:20,1,7.20,1,N,92,92,2,27.5,0.5,0.5,0.0,0.0,0.0,28.5,,


### Process Taxi Data
Here, We want to obtain the Yellow Taxi Data January 2009 through June 2015, then do the cleaning and combining to dataframes.

In [11]:
def get_all_urls_from_taxi_page(taxi_page):
    # Get the HTML content of the page
    res = requests.get(taxi_page)
    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    # Find all <a> tags with href attribute
    urls = [a['href'] for a in soup.find_all('a', href=re.compile(".*(2009|201[0-4]|2015-0[1-6]).*"))]
    return urls

In [12]:
all_urls=get_all_urls_from_taxi_page(TAXI_URL)
all_urls

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2015-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2015-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2015-04.parquet',
 'https:

In [13]:
def filter_taxi_parquet_urls(all_urls):
    """
    Given a list of URLs, filters out the URLs that do not contain yellow taxi data parquet files.
    """
    parquet_urls = []
    for url in all_urls:
        if 'yellow_tripdata' in url:
            parquet_urls.append(url)
    return parquet_urls

In [14]:
def get_and_clean_month(url):
    """
    clean the data, and return a pandas DataFrame.
    """          

    df = pd.read_parquet(url)
    df = df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'fare_amount']]
    df = df.dropna()


    
    zone=load_taxi_zones(TAXI_ZONES_SHAPEFILE)
    zone = zone.to_crs(4326)
    zone['latitude'] = zone.geometry.centroid.y
    zone['longitude'] = zone.geometry.centroid.x
    

    # remove unnecessary columns 
    # create mapping methods for latitude & longitude
    # aim to create lat & longitude which do not exist in some taxi data
    zone = zone[['LocationID', 'longitude', 'latitude', 'zone', 'borough']]
    lat_map = dict(zip(zone['LocationID'], zone['latitude']))
    lon_map = dict(zip(zone['LocationID'], zone['longitude']))
    
    
    # process dataframe which only has LocationIDs 
    # match the LocationID in 'df' with dataframe 'g'
    # add 4 new columns to 'df' after matching
    if 'DOLocationID' in df:
        df['pickup_latitude']  = df['PULocationID'].map(lat_map)
        df['pickup_longitude'] = df['PULocationID'].map(lon_map)
        df['dropoff_latitude'] = df['DOLocationID'].map(lat_map)
        df['dropoff_longitude']= df['DOLocationID'].map(lon_map)
    
    # normalizing column names
    df = df.rename(columns={'tpep_pickup_datetime':'pickup_datetime', 
                            'Trip_Pickup_DateTime':'pickup_datetime', 
                            'tpep_dropoff_datetime':'dropoff_datetime',
                            'Fare_Amt':'fare_amount'         
                            })
        
    
    df = df[(df['fare_amount'] >0)]     
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
    df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
    df['pickup_hour'] = df['pickup_datetime'].dt.hour
    df['pickup_day'] = df['pickup_datetime'].dt.day
    df['pickup_weekday'] = df['pickup_datetime'].dt.weekday
    df['pickup_month'] = df['pickup_datetime'].dt.month
    df['pickup_year'] = df['pickup_datetime'].dt.year
    df['pickup_date'] = df['pickup_datetime'].dt.date
    df['pickup_time'] = df['pickup_datetime'].dt.time
    
   
    
    #df['distance_km'] = df.apply(lambda row: calculate_distance_with_zones(row['PULocationID'], row['DOLocationID']), axis=1)
    return df

In [15]:
pd.read_parquet("yellow taxi/yellow_tripdata_2014-02.parquet")

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2014-02-01 00:30:05,2014-02-01 00:30:17,1,0.00,1,N,140,140,2,2.5,0.5,0.5,0.0,0.0,0.0,3.5,,
1,1,2014-02-01 00:16:52,2014-02-01 00:18:22,1,0.00,1,N,186,186,2,3.0,0.5,0.5,0.0,0.0,0.0,4.0,,
2,1,2014-02-01 00:05:03,2014-02-01 00:15:44,1,2.00,1,N,107,249,1,9.5,0.5,0.5,2.1,0.0,0.0,12.6,,
3,1,2014-02-01 00:17:19,2014-02-01 00:20:15,1,0.70,1,N,158,90,1,4.5,0.5,0.5,1.5,0.0,0.0,7.0,,
4,1,2014-02-01 00:24:08,2014-02-01 00:37:07,2,2.40,1,N,234,246,1,11.5,0.5,0.5,2.5,0.0,0.0,15.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13063789,2,2014-02-28 23:15:00,2014-02-28 23:38:00,1,5.67,1,,231,61,1,21.0,0.5,0.5,4.0,0.0,0.0,26.0,,
13063790,2,2014-02-28 23:53:00,2014-03-01 00:12:00,1,1.96,1,,225,177,2,14.0,0.5,0.5,0.0,0.0,0.0,15.0,,
13063791,2,2014-02-28 23:12:00,2014-02-28 23:29:00,1,4.06,1,,181,71,1,16.0,0.5,0.5,0.0,0.0,0.0,17.0,,
13063792,1,2014-02-28 23:20:56,2014-02-28 23:55:20,1,7.20,1,N,92,92,2,27.5,0.5,0.5,0.0,0.0,0.0,28.5,,


In [16]:
def get_and_clean_taxi_data(parquet_urls):
    all_taxi_dataframes = []
    
    for parquet_url in parquet_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_month(parquet_url)
    
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        
        
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)
    return taxi_data

In [17]:
def get_taxi_data():
    # download file if not exist
    all_urls = get_all_urls_from_taxi_page(TAXI_URL)
    all_parquet_urls = filter_taxi_parquet_urls(all_urls)
    for url in all_parquet_urls:
        response = requests.get(url, stream=True)
        file_name = url.split("/")[-1]
        local_file_path = os.path.join("yellow taxi", file_name)
        if not os.path.exists(local_file_path):
            with open(local_file_path, "wb") as file:
                file.write(response.content)
    
    
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [18]:
taxi_data = get_taxi_data()

HTTPError: HTTP Error 403: Forbidden

In [None]:
taxi_data.head()

### Processing Uber Data

In [None]:
def load_and_clean_uber_data(csv_file):
    df = pd.read_csv(csv_file)
    df = df.dropna(how='any')      #remove missing values
    df = df.drop(columns=['Unnamed: 0','key','passenger_count'])      #remove unnecessary columns
    df = df.loc[df.pickup_longitude.between(-74.242330,-73.717047) & df.dropoff_longitude.between(-74.242330,-73.717047) 
         & df.pickup_latitude.between(40.560445,40.908524)& df.dropoff_latitude.between(40.560445,40.908524)]
          # remove locations out of range
    df.reset_index(drop=True, inplace=True)    #reset index
    return df

In [None]:
def get_uber_data():
    uber_dataframe = load_and_clean_uber_data(UBER_DATA)
    uber_dataframe = add_distance_column(uber_dataframe)
    uber_dataframe["pick_date"] = pd.to_datetime(uber_dataframe['pickup_datetime'])
    uber_dataframe["year"] = uber_dataframe["pick_date"].dt.year
    uber_dataframe["month"] = uber_dataframe["pick_date"].dt.month
    uber_dataframe["day"] = uber_dataframe["pick_date"].dt.day
    uber_dataframe["hour"] = uber_dataframe["pick_date"].dt.hour
    uber_dataframe["dayofweek"] = uber_dataframe["pick_date"].dt.dayofweek
    return uber_dataframe

In [None]:
uber_data = get_uber_data()

In [None]:
uber_data.head()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    csv_files = [file for file in os.listdir(directory) if file.endswith(".csv")]
    return csv_files

In [None]:
get_all_weather_csvs(WEATHER_CSV_DIR)

In [None]:
def clean_month_weather_data_hourly(csv_file):
    we = pd.read_csv(csv_file)
    # keep the necessary columns
    f1 = ['DATE', 'HourlyPrecipitation', 'HourlyWindSpeed']
    we_1 = pd.DataFrame()
    for i in f1:
        col1=we.loc[:,we.columns.str.contains(i)]
        we_1=pd.concat([we_1, col1], axis=1)
    hourly_clean= we_1.dropna()
    hourly_clean.reset_index(drop=True, inplace=True)
    # Add the columns of year, month, day and hour according to the date
    hourly_clean["DATE"] = pd.to_datetime(hourly_clean['DATE'])
    hourly_clean["YEAR"] = hourly_clean["DATE"].dt.year
    hourly_clean["MONTH"] = hourly_clean["DATE"].dt.month
    hourly_clean["DAY"] = hourly_clean["DATE"].dt.day
    hourly_clean["HOUR"] = hourly_clean["DATE"].dt.hour
    hourly_clean.round({'HourlyPrecipitation': 2})
    return hourly_clean

In [None]:
def clean_month_weather_data_daily(csv_file):
    we = pd.read_csv(csv_file)
    # keep the necessary columns
    f2 = ['DATE', 'DailySustainedWindSpeed']
    we_2 = pd.DataFrame()
    for i in f2:
        col2=we.loc[:,we.columns.str.contains(i)]
        we_2=pd.concat([we_2, col2], axis=1)
    daily_clean= we_2.dropna()
    daily_clean.reset_index(drop=True, inplace=True)
    # Add the columns of year, month, day and hour according to the date
    daily_clean["DATE"] = pd.to_datetime(daily_clean['DATE'])
    daily_clean["YEAR"] = daily_clean["DATE"].dt.year
    daily_clean["MONTH"] = daily_clean["DATE"].dt.month
    daily_clean["DAY"] = daily_clean["DATE"].dt.day
    daily_clean["HOUR"] = daily_clean["DATE"].dt.hour
    return daily_clean

In [None]:
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    raise NotImplemented()

In [None]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
engine.execute(QUERY_1).fetchall()

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)