# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be consider "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [87]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [42]:
# all import statements needed for the project, for example:

import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import re
import geopandas as gpd
import math
import glob
import numpy as np

In [4]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

PARQUET_FILES = "parquet_files"
TAXI_ZONES_DIR = "taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = "weather_data"

#CRS = 4326  # coordinate reference system

# (lat, lon)
#NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [5]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Download All data

In [5]:
def get_all_urls_from_tlc_page():
    response = requests.get(TLC_URL)
    html = response.content
    return html

In [6]:
#Identifies all of the yellow and fhvhv parquet files for years 2020 - 2024
pattern = re.compile(r".*(yellow|fhvhv).*(2020|2021|2022|2023|2024)-\d{2}\.parquet")

def filter_parquet_urls():
    html = get_all_urls_from_tlc_page()
    soup = bs4.BeautifulSoup(html, "html.parser")
    urls = soup.find_all("a", href=pattern)
    parquet_urls = [link["href"].strip() for link in urls]
    return parquet_urls

In [None]:
folder_name = "parquet_files"

# Check if the folder exists
if not os.path.exists(folder_name):
    os.mkdir(folder_name)
    print(f"Folder '{folder_name}' created successfully!")
else:
    print(f"Folder '{folder_name}' already exists.")

In [None]:
# parses the filename from the link and then downloads the files one by one
def download_parquet_files():
    for link in filter_parquet_urls():
        filename = link.split("/")[-1]
        r = requests.get(link)
        with open(f"parquet_files/{filename}", "wb") as f:
            f.write(r.content)

#run the first time to download data
#download_parquet_files()

### Load Taxi Zones & Parquet Files

In [61]:
#Reads the shape file
def load_taxi_zones(shapefile):
    taxi_zones = gpd.read_file(shapefile)
    return taxi_zones

In [62]:
gdf_taxi_zones = load_taxi_zones(TAXI_ZONES_SHAPEFILE)
print(gdf_taxi_zones.head())

   OBJECTID  Shape_Leng  Shape_Area                     zone  LocationID  \
0         1    0.116357    0.000782           Newark Airport           1   
1         2    0.433470    0.004866              Jamaica Bay           2   
2         3    0.084341    0.000314  Allerton/Pelham Gardens           3   
3         4    0.043567    0.000112            Alphabet City           4   
4         5    0.092146    0.000498            Arden Heights           5   

         borough                                           geometry  
0            EWR  POLYGON ((933100.918 192536.086, 933091.011 19...  
1         Queens  MULTIPOLYGON (((1033269.244 172126.008, 103343...  
2          Bronx  POLYGON ((1026308.77 256767.698, 1026495.593 2...  
3      Manhattan  POLYGON ((992073.467 203714.076, 992068.667 20...  
4  Staten Island  POLYGON ((935843.31 144283.336, 936046.565 144...  


In [63]:
# converts taxi zone geometry coordinates to the appropriate coordinate system  
gdf_taxi_zones = gdf_taxi_zones.to_crs(epsg=4326)
gdf_taxi_zones

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.695, -74.18449 40.6951,..."
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."
...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((-73.85107 40.91037, -73.85207 40.909..."
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((-73.90175 40.76078, -73.90147 40.759..."
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((-74.01333 40.70503, -74.01327 40.704..."
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((-73.94383 40.78286, -73.94376 ..."


In [9]:
# load parquet file into a pandas DataFrame
def load_parquet_file(file_path):
    df = pd.read_parquet(file_path)
    return df

In [10]:
# load a random yellow taxi trip parquet file to check if the function works correctly for testing purposes
example = os.path.join(PARQUET_FILES, "yellow_tripdata_2023-01.parquet")
example_df = load_parquet_file(example)

# preview the data
print(example_df.head())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           0.97         1.0                  N           161           141   
1           1.10         1.0                  N            43           237   
2           2.51         1.0                  N            48           238   
3           1.90         1.0                  N           138             7   
4           1.43         1.0                  N           107            79   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [11]:
# load a random High-Volume For-Hire Vehicle trip parquet file to check if the function works correctly
example2 = os.path.join(PARQUET_FILES, "fhvhv_tripdata_2023-01.parquet")
example_df2 = load_parquet_file(example2)

# preview the data
print(example_df2.head())

  hvfhs_license_num dispatching_base_num originating_base_num  \
0            HV0003               B03404               B03404   
1            HV0003               B03404               B03404   
2            HV0003               B03404               B03404   
3            HV0003               B03404               B03404   
4            HV0003               B03404               B03404   

     request_datetime   on_scene_datetime     pickup_datetime  \
0 2023-01-01 00:18:06 2023-01-01 00:19:24 2023-01-01 00:19:38   
1 2023-01-01 00:48:42 2023-01-01 00:56:20 2023-01-01 00:58:39   
2 2023-01-01 00:15:35 2023-01-01 00:20:14 2023-01-01 00:20:27   
3 2023-01-01 00:35:24 2023-01-01 00:39:30 2023-01-01 00:41:05   
4 2023-01-01 00:43:15 2023-01-01 00:51:10 2023-01-01 00:52:47   

     dropoff_datetime  PULocationID  DOLocationID  trip_miles  ...  sales_tax  \
0 2023-01-01 00:48:07            48            68        0.94  ...       2.30   
1 2023-01-01 01:33:08           246           163       

In [12]:
example_df2.columns

Index(['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num',
       'request_datetime', 'on_scene_datetime', 'pickup_datetime',
       'dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_miles',
       'trip_time', 'base_passenger_fare', 'tolls', 'bcf', 'sales_tax',
       'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay',
       'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag',
       'wav_request_flag', 'wav_match_flag'],
      dtype='object')

In [13]:
example_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

### Cleaning and Filtering
* Remove all non-Uber data from fhvhv
* Remove all invalid pickup and dropoff location IDs for both uber and yellow taxi, where ID is greater than 263 using the `shp` file
* Remove unnecessary columns and only keeping columns needed to answer questions in the other parts of this project
* Remove invalid data points (use your discretion!)
* normalize column names; 
normalieg and using appropriate column types for the respective dat

* Remove trips from both uber and yellow taxi that start and/or end outside of the following latitude/longitude coordinate box: (40.560445, -74.242330) and (40.908524, -73.71704).


In [64]:
#Compute the center of the taxi zones for easier comparison and adds a column to the df of our shapefile 
gdf_taxi_zones['centroid'] = gdf_taxi_zones.geometry.centroid

#Removes the bulky geometry column after using it to compute centroid. 
gdf_taxi_zones = gdf_taxi_zones[['zone','LocationID','centroid']]


  gdf_taxi_zones['centroid'] = gdf_taxi_zones.geometry.centroid


In [65]:
gdf_taxi_zones

Unnamed: 0,zone,LocationID,centroid
0,Newark Airport,1,POINT (-74.174 40.69183)
1,Jamaica Bay,2,POINT (-73.8313 40.61675)
2,Allerton/Pelham Gardens,3,POINT (-73.84742 40.86447)
3,Alphabet City,4,POINT (-73.97697 40.72375)
4,Arden Heights,5,POINT (-74.18848 40.55266)
...,...,...,...
258,Woodlawn/Wakefield,259,POINT (-73.85222 40.89793)
259,Woodside,260,POINT (-73.90631 40.74423)
260,World Trade Center,261,POINT (-74.01302 40.70914)
261,Yorkville East,262,POINT (-73.94651 40.77593)


### Calculate Sample Size

In [81]:
# default: 95% confidence interval, 5% margin of error, p of 0.5 (estimated) proportion of the population which has the attribute in question
def cochran_sample_size(population_size):
    z_score=1.96
    margin_of_error=0.05
    p=0.5
    sample_size = ((z_score**2)*p*(1-p)) / (margin_of_error**2)
    adjusted_sample_size = sample_size / (1 + ((sample_size-1)/population_size))

    return int(adjusted_sample_size)

### Common Functions

In [98]:
#Filter function to remove unecessary rows
def filter_data(data):
    #Ensure PU and DO locations are within valid location IDs (<= 263)
    data = data[(data['PULocationID'] <= 263) & (data['DOLocationID'] <= 263)]
    #Filters out rides where PU and DO locations are the same
    if 'trip_distance' in data.columns:
        filtered_data = data[data['trip_distance'] != 0]

    # If 'trip_miles' is present instead of 'trip_distance'
    elif 'trip_miles' in data.columns:
        filtered_data = data[data['trip_miles'] != 0]
        
    return filtered_data

In [99]:
# Removes trips from both uber and yellow taxi that start and/or end outside of the following latitude/longitude coordinate box:
def find_centroid(data):
    LAT_MIN, LON_MIN = 40.560445, -74.242330
    LAT_MAX, LON_MAX = 40.908524, -73.717047
    
    # Extract latitude and longitude from the 'centroid' column using .apply()
    data['centroid_lat'] = data['centroid'].apply(lambda point: point.y)
    data['centroid_lon'] = data['centroid'].apply(lambda point: point.x)
    
    # Filter rows where the centroid coordinates are within the bounding coordinate box
    centroid_data = data[
        (data['centroid_lat'] >= LAT_MIN) & (data['centroid_lat'] <= LAT_MAX) &
        (data['centroid_lon'] >= LON_MIN) & (data['centroid_lon'] <= LON_MAX)
    ]
    return centroid_data

### Process Taxi Data

In [88]:
#Grab all of the parquet files in the directory. glob.glob is used to identify/match the pattern, path.join retrieves all the paths 
all_taxi_parquet_files = glob.glob(os.path.join(PARQUET_FILES, "*yellow*.parquet"))

In [100]:
#Make a list of just the columns we need for analysis
columns_to_keep = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'RatecodeID',
    'PULocationID', 'DOLocationID', 'fare_amount', 'extra', 'mta_tax',
    'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
    'congestion_surcharge', 'airport_fee'
]

#Create samples of all taxi parquet files according to cochran's sample size formula. Later, we concatenate all sample dfs into one df. 
sampled_taxi_dfs = []

for file_path in all_taxi_parquet_files:      
    taxi_df = load_parquet_file(file_path) #Makes a df for every parquet file 
    population_size = len(taxi_df)
    sample_size = cochran_sample_size(population_size)
    sampled_taxi_df = taxi_df.sample(n=sample_size, random_state=42)
    #We found that there were a few files that did not have airport_fee as a column. We populate airport_fee with NaN for such parquet files.
    for col in columns_to_keep:  
        if col not in sampled_taxi_df.columns:
            sampled_taxi_df[col] = np.nan 
    sampled_taxi_df = sampled_taxi_df[columns_to_keep]
    sampled_taxi_dfs.append(sampled_taxi_df)

    # create one gigantic dataframe with data from every month needed
sampled_taxi_data = pd.concat(sampled_taxi_dfs)

sampled_taxi_data = filter_data(sampled_taxi_data)

# Make a single df that includes the taxi rides and their corresponding coordinates by merging the shape file with the ride files.
final_taxi_data = pd.merge(sampled_taxi_data, gdf_taxi_zones, left_on = 'PULocationID', right_on = 'LocationID', how="inner")

final_taxi_data = find_centroid(final_taxi_data)

  sampled_taxi_data = pd.concat(sampled_taxi_dfs)


In [101]:
final_taxi_data.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,RatecodeID,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,zone,LocationID,centroid,centroid_lat,centroid_lon
0,2020-01-25 10:49:58,2020-01-25 11:07:35,3.28,1.0,142,246,14.0,0.0,0.5,1.7,0.0,0.3,19.0,2.5,,Lincoln Square East,142,POINT (-73.98153 40.77363),40.773633,-73.981532
1,2020-01-15 07:30:08,2020-01-15 07:40:01,1.75,1.0,238,166,8.5,0.0,0.5,1.2,0.0,0.3,13.0,2.5,,Upper West Side North,238,POINT (-73.97305 40.7917),40.791705,-73.973049
2,2020-01-09 06:29:09,2020-01-09 06:35:44,0.87,1.0,100,164,5.5,0.0,0.5,0.0,0.0,0.3,8.8,2.5,,Garment District,100,POINT (-73.98879 40.75351),40.753513,-73.988787
3,2020-01-26 12:24:04,2020-01-26 12:29:15,0.98,1.0,161,43,5.5,0.0,0.5,0.0,0.0,0.3,8.8,2.5,,Midtown Center,161,POINT (-73.9777 40.75803),40.758028,-73.977698
4,2020-01-30 07:57:53,2020-01-30 08:10:19,1.3,1.0,229,262,9.0,2.5,0.5,2.45,0.0,0.3,14.75,2.5,,Sutton Place/Turtle Bay North,229,POINT (-73.96515 40.75673),40.756729,-73.965146


In [77]:
final_taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16744 entries, 0 to 17067
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   tpep_pickup_datetime   16744 non-null  datetime64[us]
 1   tpep_dropoff_datetime  16744 non-null  datetime64[us]
 2   trip_distance          16744 non-null  float64       
 3   RatecodeID             16012 non-null  float64       
 4   PULocationID           16744 non-null  int64         
 5   DOLocationID           16744 non-null  int64         
 6   fare_amount            16744 non-null  float64       
 7   extra                  16744 non-null  float64       
 8   mta_tax                16744 non-null  float64       
 9   tip_amount             16744 non-null  float64       
 10  tolls_amount           16744 non-null  float64       
 11  improvement_surcharge  16744 non-null  float64       
 12  total_amount           16744 non-null  float64       
 13  conges

In [78]:
final_taxi_data.describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,RatecodeID,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,LocationID,centroid_lat,centroid_lon
count,16744,16744,16744.0,16012.0,16744.0,16744.0,16744.0,16744.0,16744.0,16744.0,16744.0,16744.0,16744.0,16012.0,7947.0,16744.0,16744.0,16744.0
mean,2021-11-15 08:41:08.601588,2021-11-15 08:57:04.603678,3.291102,1.199663,163.774546,160.121715,14.395644,1.156294,0.491728,2.531626,0.415078,0.438915,21.058254,2.297652,0.086825,163.774546,40.753793,-73.966838
min,2020-01-01 00:11:06,2020-01-01 00:30:50,0.01,1.0,4.0,1.0,-81.52,-7.5,-0.5,0.0,-6.55,-1.0,-80.05,-2.5,-1.25,4.0,40.576961,-74.029892
25%,2020-12-08 16:10:13,2020-12-08 16:15:40,1.1,1.0,132.0,107.0,7.0,0.0,0.5,0.0,0.0,0.3,12.25,2.5,0.0,132.0,40.740439,-73.989845
50%,2021-11-15 15:59:40.500000,2021-11-15 16:12:57.500000,1.83,1.0,162.0,161.0,10.0,0.5,0.5,2.06,0.0,0.3,15.96,2.5,0.0,162.0,40.758028,-73.977698
75%,2022-10-22 21:01:30.500000,2022-10-22 21:11:01.750000,3.36,1.0,234.0,234.0,16.3,2.5,0.5,3.24,0.0,0.3,23.1,2.5,0.0,234.0,40.773633,-73.959635
max,2023-09-30 23:06:46,2023-09-30 23:51:36,67.9,99.0,263.0,263.0,209.5,11.75,0.5,50.0,40.0,1.0,262.7,2.5,1.25,263.0,40.899529,-73.739337
std,,,4.122359,3.952569,65.458809,70.814713,13.229554,1.428814,0.08381,2.949458,1.751952,0.293415,16.677861,0.728378,0.321505,65.458809,0.032691,0.044686


### Processing Uber Data

In [79]:
#Grab all of the parquet files in the directory. glob.glob is used to identify/match the pattern, path.join retrieves all the paths 
all_fhvhv_parquet_files = glob.glob(os.path.join(PARQUET_FILES, "*fhvhv*.parquet"))

In [80]:
#Create samples of all uber parquet files according to cochran's sample size formula. Later, we concatenate all sample dfs into one df. 
sampled_uber_dfs = []
columns_to_keep = ['hvfhs_license_num',
       'request_datetime', 'pickup_datetime',
       'dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_miles',
        'base_passenger_fare', 'tolls', 'bcf', 'sales_tax',
       'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay']

for file_path in all_fhvhv_parquet_files:      
    uber_df = load_parquet_file(file_path) #Makes a df for every parquet file 
    uber_df = uber_df[uber_df['hvfhs_license_num'] == 'HV0003'] #Filters out non-uber rides from the hvfhs files before creating samples
    population_size = len(uber_df)
    sample_size = cochran_sample_size(population_size)
    sampled_uber_df = uber_df.sample(n=sample_size, random_state=42)
    sampled_uber_df = sampled_uber_df[columns_to_keep]
    sampled_uber_dfs.append(sampled_uber_df)

    # create one gigantic dataframe with data from every month needed
sampled_uber_data = pd.concat(sampled_uber_dfs)

sampled_uber_data = filter_data(sampled_uber_data)

# Make a single df that includes the taxi rides and their corresponding coordinates by merging the shape file with the ride files.
final_uber_data = pd.merge(sampled_uber_data, gdf_taxi_zones, left_on = 'PULocationID', right_on = 'LocationID', how="inner")

final_uber_data = find_centroid(final_uber_data)

  uber_data = pd.concat(sampled_uber_dfs)


In [None]:
final_uber_data.head()

In [None]:
final_uber_data.info()

In [None]:
final_uber_data.describe()

### Processing Weather Data

In [109]:
def get_all_weather_csvs(directory):
    weather_dfs = []

    # Iterate over all files in the given directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path, low_memory=False)
        # Append the DataFrame to the list
        weather_dfs.append(df)
    weather_dfs = pd.concat(weather_dfs, ignore_index=True)
    return weather_dfs


In [138]:

columns_to_keep = ['DATE','LATITUDE', 'LONGITUDE', 'MonthlyTotalLiquidPrecipitation', 'DailyPrecipitation', 'HourlyPrecipitation', 'DailyAverageWindSpeed', 'HourlyWindSpeed', 'DailySnowfall']

weather_data = get_all_weather_csvs(WEATHER_CSV_DIR)
weather_data = weather_data[columns_to_keep]
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])

# weather_data[weather_data["DailyPrecipitation"].isna()] #54343
weather_data[weather_data["MonthlyTotalLiquidPrecipitation"].isna()] #8,523
# weather_data


Unnamed: 0,DATE,LATITUDE,LONGITUDE,MonthlyTotalLiquidPrecipitation,DailyPrecipitation,HourlyPrecipitation,DailyAverageWindSpeed,HourlyWindSpeed,DailySnowfall
0,2020-01-01 00:51:00,40.77898,-73.96925,,,0.00,,8.0,
1,2020-01-01 01:51:00,40.77898,-73.96925,,,0.00,,8.0,
2,2020-01-01 02:51:00,40.77898,-73.96925,,,0.00,,14.0,
3,2020-01-01 03:51:00,40.77898,-73.96925,,,0.00,,11.0,
4,2020-01-01 04:51:00,40.77898,-73.96925,,,0.00,,6.0,
...,...,...,...,...,...,...,...,...,...
56093,2024-10-22 14:51:00,40.77898,-73.96925,,,,,3.0,
56094,2024-10-22 15:51:00,40.77898,-73.96925,,,,,0.0,
56095,2024-10-22 16:51:00,40.77898,-73.96925,,,,,0.0,
56096,2024-10-22 17:51:00,40.77898,-73.96925,,,,,0.0,


In [136]:
# Extract 'YEAR' and 'MONTH' for grouping purposes
weather_data['YEAR'] = weather_data['DATE'].dt.year
weather_data['MONTH'] = weather_data['DATE'].dt.month

# Group by year and month
monthly_groups = weather_data.groupby(['YEAR', 'MONTH'])

# Loop over each month and distribute the remaining monthly total to daily values
for (year, month), group in monthly_groups:
    # Get the monthly total precipitation for this group
    monthly_total_precip = group['MonthlyTotalLiquidPrecipitation'].iloc[0]

    # Calculate how much daily precipitation is already accounted for
    existing_precip = group['DailyPrecipitation'].sum(skipna=True)
    remaining_precip = monthly_total_precip - existing_precip

    # Find the number of days with missing DailyPrecipitation
    missing_days = group['DailyPrecipitation'].isna().sum()

    # Calculate how much to distribute per missing day
    if missing_days > 0:
        daily_precipitation = remaining_precip / missing_days
    else:
        daily_precipitation = 0

    # Fill missing DailyPrecipitation values with calculated daily average
    weather_data.loc[group.index, 'DailyPrecipitation'] = group['DailyPrecipitation'].fillna(daily_precipitation)

# Preview the DataFrame after filling in DailyPrecipitation
print(weather_data.head(30))


KeyError: 'MonthlyTotalLiquidPrecipitation'

In [None]:
def clean_month_weather_data_hourly(csv_file):
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_daily(csv_file):
    raise NotImplementedError()

In [None]:
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
hourly_weather_data.info()

In [None]:
hourly_weather_data.describe()

In [None]:
daily_weather_data.head()

In [None]:
daily_weather_data.info()

In [None]:
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    raise NotImplemented()

In [None]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)