In [1]:
# all import statements needed for the project, for example:
import math
import bs4
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import requests
import sqlalchemy as db
import sqlite3
import numpy as np
import matplotlib.animation as animation
import keplergl
from keplergl import KeplerGl
import statistics
import unittest

In [2]:
# any constants you might need, for example:
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# add other constants to refer to any local data, e.g. uber & weather
UBER_CSV = "uber_rides_sample.csv"
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [3]:
# Calculate the distance between the two coordinates
def calculate_distance(from_coord: list, to_coord: list) -> float:
    R = 6373.0
    lat1 = math.radians(from_coord[0])
    lon1 = math.radians(from_coord[1])
    lat2 = math.radians(to_coord[0])
    lon2 = math.radians(to_coord[1])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

In [4]:
# For the dataset that is not given the trip distance, calculate the distance using the given coordinate data and add it to the dataframe
def add_distance_column(dataframe: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    distance = []
    for index, row in dataframe.iterrows():
        distance.append(calculate_distance((row['pickup_latitude'], row['pickup_longitude']), (row['dropoff_latitude'], row['dropoff_longitude'])))
    dataframe['trip_distance'] = distance
    return dataframe

In [5]:
def find_taxi_parquet_urls() -> list:
    response = requests.get(TAXI_URL)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    links = soup.find_all(lambda tag:'title' in tag.attrs and tag.attrs['title'] == "Yellow Taxi Trip Records")
    hrefs = [link.get('href') for link in links]
    # Filter the links based on the desired years (2009 to 2015)
    hrefs_filtered = [href for href in hrefs 
                  if any(year in href for year in map(str, range(2009, 2015)))
                  or (any(f"2015-{month:02}" in href for month in range(1, 7)))]
    return hrefs_filtered


In [26]:
find_taxi_parquet_urls()

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-06.pa

In [7]:
# Define a function that converts location to coordinates, and generate a dataframe
def convert_id_to_coord(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    shapefile = gpd.read_file(r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\taxi_zones\taxi_zones.shp")
    # Convert the geometry column in the shapefile into specific coordinates of latitude and longitude
    shapefile = shapefile.to_crs(4326)
    shapefile['latitude'] = shapefile['geometry'].centroid.y
    shapefile['longitude'] = shapefile['geometry'].centroid.x
    
    df = df
    df = df.loc[df["pulocationid"] <= 263]
    df = df.loc[df["pulocationid"] != 0]
    df = df.loc[df["dolocationid"] <= 263]
    df = df.loc[df["dolocationid"] != 0]
    # convert location IDs into longitude and latitude
    PUlongitude = []
    PUlatitude = []
    DOlongitude = []
    DOlatitude = []
    # convert the pickup location IDs into longitude and latitude
    for i in df['pulocationid']:
        PUlatitude.append(shapefile['latitude'][i-1])
        PUlongitude.append(shapefile['longitude'][i-1])
    for i in df['dolocationid']:
        DOlatitude.append(shapefile['latitude'][i-1])
        DOlongitude.append(shapefile['longitude'][i-1])
        
    df['pickup_longitude'] = PUlongitude
    df['pickup_latitude'] = PUlatitude
    df['dropoff_longitude'] = DOlongitude
    df['dropoff_latitude'] = DOlatitude
    # convert the drop off location IDs into longitude and latitude
    
    return df

In [8]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet')
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2015-01-01 00:11:33,2015-01-01 00:16:48,1,1.0,1,N,41,166,1,5.7,0.5,0.5,1.40,0.0,0.0,8.40,,
1,1,2015-01-01 00:18:24,2015-01-01 00:24:20,1,0.9,1,N,166,238,3,6.0,0.5,0.5,0.00,0.0,0.0,7.30,,
2,1,2015-01-01 00:26:19,2015-01-01 00:41:06,1,3.5,1,N,238,162,1,13.2,0.5,0.5,2.90,0.0,0.0,17.40,,
3,1,2015-01-01 00:45:26,2015-01-01 00:53:20,1,2.1,1,N,162,263,1,8.2,0.5,0.5,2.37,0.0,0.0,11.87,,
4,1,2015-01-01 00:59:21,2015-01-01 01:05:24,1,1.0,1,N,236,141,3,6.0,0.5,0.5,0.00,0.0,0.0,7.30,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12741030,1,2015-01-31 23:21:42,2015-01-31 23:31:00,1,1.6,1,N,90,249,1,8.0,0.5,0.5,2.32,0.0,0.3,11.62,,
12741031,1,2015-01-31 23:42:43,2015-01-31 23:49:32,1,0.6,1,N,90,68,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76,,
12741032,1,2015-01-31 23:55:16,2015-02-01 00:16:45,1,3.0,1,N,68,148,1,15.0,0.5,0.5,4.07,0.0,0.3,20.37,,
12741033,1,2015-01-31 23:20:53,2015-02-01 00:07:35,1,6.9,1,N,189,237,1,32.5,0.5,0.5,6.00,0.0,0.3,39.80,,


In [43]:
# Obtain taxi data and clean the data
def get_and_clean_month_taxi_data(url: str) -> pd.core.frame.DataFrame:

    df = pd.read_parquet(url)
    df = df[:10]
    df.columns = df.columns.str.lower()
    df_taxi = pd.DataFrame()

    # keep necessary columns into a new dataframe
    if 'tpep_pickup_datetime' in df.columns:
        df=df.rename(columns = {'tpep_pickup_datetime':'pickup_datetime',
                                'tip_amount' : 'tip_amount'})
        df=convert_id_to_coord(df)
        
    elif 'trip_pickup_datetime' in df.columns:
        df=df.rename(columns = {'trip_pickup_datetime':'pickup_datetime', 
                                'start_lon': 'pickup_longitude',
                                'start_lat': 'pickup_latitude',
                                'end_lon': 'dropoff_longitude',
                                'end_lat': 'dropoff_latitude',
                                'tip_amt' : 'tip_amount'})
        
    df.drop(df.columns.difference(['pickup_datetime',
                                    'trip_distance', 
                                    'pickup_latitude', 
                                    'pickup_longitude', 
                                    'dropoff_latitude', 
                                    'dropoff_longitude',
                                   'tip_amount']), 1, inplace=True)
    
    df=df[df["pickup_longitude"] <= -73.717047]  
    df=df[df["pickup_longitude"] >= -74.242330]
    df=df[df["pickup_latitude"] >= 40.560445]
    df=df[df["pickup_latitude"] <= 40.908524]
    df=df[df["dropoff_longitude"] <= -73.717047]
    df=df[df["dropoff_longitude"] >= -74.242330]
    df=df[df["dropoff_latitude"] >= 40.560445]
    df=df[df["dropoff_latitude"] <= 40.908524]

    df = df.loc[df["pickup_datetime"] != 0.0]
    
    return df

In [37]:
df = get_and_clean_month_taxi_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-09.parquet')

  df.drop(df.columns.difference(['pickup_datetime',


In [44]:
def get_and_clean_taxi_data(parquet_urls):

    all_taxi_dataframes = []

    for parquet_url in parquet_urls:
        dataframe = get_and_clean_month_taxi_data(parquet_url)
        add_distance_column(dataframe)

        all_taxi_dataframes.append(dataframe)
        
    taxi_data = pd.concat(all_taxi_dataframes)

    return taxi_data

In [12]:
# load uber data and clean the data
def load_and_clean_uber_data(csv_file: str) -> pd.core.frame.DataFrame:
    df = pd.read_csv(csv_file, on_bad_lines='skip')
    df.columns = df.columns.str.lower()
    add_distance_column(df)
    df.drop(df.columns.difference(['pickup_datetime',
                                     'trip_distance', 
                                     'pickup_latitude', 
                                     'pickup_longitude', 
                                     'dropoff_latitude', 
                                     'dropoff_longitude']), 1, inplace=True)

    # remove rows start and/or end outside of the following latitude/longitude coordinate box: 
    # (40.560445, -74.242330) and (40.908524, -73.717047)
    df=df[df["pickup_longitude"] <= -73.717047]  
    df=df[df["pickup_longitude"] >= -74.242330]
    df=df[df["pickup_latitude"] >= 40.560445]
    df=df[df["pickup_latitude"] <= 40.908524]
    df=df[df["dropoff_longitude"] <= -73.717047]
    df=df[df["dropoff_longitude"] >= -74.242330]
    df=df[df["dropoff_latitude"] >= 40.560445]
    df=df[df["dropoff_latitude"] <= 40.908524]

    # remove invalid rows thtat pickup time is 0
    df = df.loc[df["pickup_datetime"] != 0.0]

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

    return df

In [13]:
load_and_clean_uber_data(UBER_CSV)

  df.drop(df.columns.difference(['pickup_datetime',


Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_distance
0,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1.683851
1,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,2.458361
2,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,5.037958
3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,1.662205
4,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,4.476855
...,...,...,...,...,...,...
199995,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,0.112245
199996,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1.875639
199997,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,12.854353
199998,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,3.540827


In [14]:
def clean_month_weather_data_hourly(csv_file: str) -> pd.core.frame.DataFrame:
    # read file
    df = pd.read_csv(csv_file)
    #drop unnecessary colums
    df.drop(df.columns.difference(['DATE',
                                   'HourlyPrecipitation', 
                                   'HourlyWindSpeed']), 1, inplace=True)
    df['HourlyPrecipitation'] = df['HourlyPrecipitation'].replace('T', 0.0)
    # drop na values
    df.dropna(subset=['HourlyWindSpeed'], inplace=True)
    # convert "DATE" to datetime type
    df['DATE'] = pd.to_datetime(df['DATE'])
    # convert "HourlyPrecipitation" to float type
    df['HourlyPrecipitation'] = pd.to_numeric(df['HourlyPrecipitation'], errors='coerce')
    # fill in missing values
    df['HourlyPrecipitation'].fillna(0, inplace=True)
    # cast "df" to specified type
    df = df.astype({'HourlyWindSpeed': 'float32', 'HourlyPrecipitation': 'float32'})

    return df

In [15]:
def clean_month_weather_data_daily(csv_file: str) -> pd.core.frame.DataFrame:
    # read file
    df = pd.read_csv(csv_file)
    # Replace data of the string type
    df['HourlyPrecipitation'] = df['HourlyPrecipitation'].replace('T', 0.0)
    # convert "DATE" to datetime type
    df['DATE'] = pd.to_datetime(df['DATE'])
    # convert "HourlyPrecipitation" to numeric type
    df['HourlyPrecipitation'] = pd.to_numeric(df['HourlyPrecipitation'], errors='coerce')
    # convert value of 'na' into 0.0
    df['HourlyPrecipitation'].fillna(0, inplace=True)
    #drop unnecessary colums
    df.drop(df.columns.difference(['DATE',
                                   'HourlyPrecipitation', 
                                   'HourlyWindSpeed']), 1, inplace=True)
    # calculate hourly average as a daily values
    df['DATE'] = df['DATE'].dt.date
    df = df.groupby('DATE', as_index=False).agg({'HourlyWindSpeed': np.mean, 'HourlyPrecipitation': np.mean})
    df['HourlyWindSpeed'] = df['HourlyWindSpeed'].map(lambda x: round(x, 2))
    # remame columns
    df.rename(columns={'HourlyWindSpeed': 'DailyAverageWindSpeed', 'HourlyPrecipitation': 'DailyPrecipitation'}, inplace=True)
    df = df.astype({'DailyAverageWindSpeed':'float32', 'DailyPrecipitation':'float32', 'DATE' : 'datetime64[ns]'})

    return df

In [16]:
def load_and_clean_weather_data() -> pd.core.frame.DataFrame:
    hourly_dataframes = []
    daily_dataframes = []
    
    # add some way to find all weather CSV files
    # or just add the name/paths manually
    weather_csv_files = [
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2009_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2010_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2011_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2012_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2013_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2014_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2015_weather.csv"
        ]
    
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [17]:
hourly_data, daily_data = load_and_clean_weather_data()

  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',


In [18]:
hourly_data

Unnamed: 0,DATE,HourlyPrecipitation,HourlyWindSpeed
0,2009-01-01 00:51:00,0.0,18.0
1,2009-01-01 01:51:00,0.0,18.0
2,2009-01-01 02:51:00,0.0,18.0
3,2009-01-01 03:51:00,0.0,8.0
4,2009-01-01 04:51:00,0.0,11.0
...,...,...,...
11379,2015-12-31 18:51:00,0.0,3.0
11380,2015-12-31 19:51:00,0.0,6.0
11381,2015-12-31 20:51:00,0.0,10.0
11383,2015-12-31 22:51:00,0.0,7.0


In [19]:
daily_data

Unnamed: 0,DATE,DailyAverageWindSpeed,DailyPrecipitation
0,2009-01-01,11.04,0.000000
1,2009-01-02,6.81,0.000000
2,2009-01-03,9.88,0.000000
3,2009-01-04,7.37,0.000000
4,2009-01-05,6.93,0.000000
...,...,...,...
360,2015-12-27,4.91,0.003542
361,2015-12-28,8.21,0.001154
362,2015-12-29,7.79,0.019375
363,2015-12-30,4.18,0.007436


In [20]:
def clean_sunset_sunrise_daily(csv_file: str) -> pd.core.frame.DataFrame:
    df = pd.read_csv(csv_file)
    df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
    df = df.dropna()
    df['DATE'] = pd.to_datetime(df['DATE'])
    df = df.astype({'Sunrise': 'int32', 'Sunset': 'int32', 'DATE':'datetime64[ns]' })
    return df

In [21]:
def load_and_clean_sunrise_sunset_data() -> pd.core.frame.DataFrame:
    sunrise_sunset_dataframes =[]
    
    weather_csv_files = [
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2009_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2010_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2011_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2012_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2013_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2014_weather.csv",
            r"C:\Users\Silvia\Documents\GitHub\4501FinalProject_Group14\2015_weather.csv"
        ]
    
    for csv_file in weather_csv_files:
        sunrise_sunset_dataframe = clean_sunset_sunrise_daily(csv_file)
        sunrise_sunset_dataframes.append(sunrise_sunset_dataframe)
        
    sunrise_sunset_data = pd.concat(sunrise_sunset_dataframes)
    sunrise_sunset_data['DATE'] = pd.to_datetime(sunrise_sunset_data['DATE'])
    sunrise_sunset_data = sunrise_sunset_data.astype({'Sunrise': 'int32', 'Sunset': 'int32'})
    
    return sunrise_sunset_data

In [22]:
sunrise_sunset_data = load_and_clean_sunrise_sunset_data()
sunrise_sunset_data

  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE','Sunset','Sunrise']), 1, inplace=True)


Unnamed: 0,DATE,Sunrise,Sunset
55,2009-01-02 23:59:00,720,1640
163,2009-01-06 23:59:00,720,1644
202,2009-01-07 23:59:00,720,1645
305,2009-01-10 23:59:00,720,1648
343,2009-01-11 23:59:00,720,1649
...,...,...,...
11238,2015-12-27 23:59:00,719,1635
11264,2015-12-28 23:59:00,719,1636
11312,2015-12-29 23:59:00,720,1636
11351,2015-12-30 23:59:00,720,1637


In [33]:
all_parquet_urls = find_taxi_parquet_urls()
taxi_data = get_and_clean_taxi_data(all_parquet_urls)
uber_data = load_and_clean_uber_data(UBER_CSV)
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()
sunrise_sunset_data = load_and_clean_sunrise_sunset_data()


  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x


KeyboardInterrupt: 

In [34]:
uber_data = load_and_clean_uber_data(UBER_CSV)
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()
sunrise_sunset_data = load_and_clean_sunrise_sunset_data()

  df.drop(df.columns.difference(['pickup_datetime',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop(df.columns.difference(['DATE',
  df = pd.read_csv(csv_file)
  df.drop

In [35]:
all_parquet_urls = find_taxi_parquet_urls()
all_parquet_urls

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-06.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-05.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2014-06.pa

In [49]:
all_parquet_urls[0:2]

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-02.parquet']

In [52]:
taxi_data = get_and_clean_taxi_data(all_parquet_urls)


  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.columns.difference(['pickup_datetime',

  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.columns.difference(['pickup_datetime',

  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.columns.difference(['pickup_datetime',

  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.columns.difference(['pickup_datetime',

  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.columns.difference(['pickup_datetime',

  shapefile['latitude'] = shapefile['geometry'].centroid.y

  shapefile['longitude'] = shapefile['geometry'].centroid.x
  df.drop(df.c

KeyboardInterrupt: 

In [53]:
taxi_data

Unnamed: 0,pickup_datetime,trip_distance,tip_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,2015-01-01 00:11:33,1.049745,1.4,-73.951292,40.804334,-73.961764,40.809457
1,2015-01-01 00:18:24,2.191297,0.0,-73.961764,40.809457,-73.973049,40.791705
2,2015-01-01 00:26:19,3.895404,2.9,-73.973049,40.791705,-73.972356,40.756688
3,2015-01-01 00:45:26,3.043737,2.37,-73.972356,40.756688,-73.95101,40.778766
4,2015-01-01 00:59:21,1.516475,0.0,-73.957012,40.780436,-73.959635,40.766948
5,2015-01-01 00:07:31,0.981286,0.0,-73.978632,40.783961,-73.973049,40.791705
6,2015-01-01 00:47:08,0.981286,0.0,-73.973049,40.791705,-73.978632,40.783961
7,2015-01-01 00:58:04,4.013493,2.7,-73.973049,40.791705,-73.940772,40.818258
8,2015-01-01 00:29:25,1.986939,0.0,-73.996971,40.742279,-74.007486,40.72629
9,2015-01-01 00:39:02,6.058989,0.0,-74.007486,40.72629,-73.959635,40.766948


In [54]:
engine = db.create_engine(DATABASE_URL)

In [56]:
HOURLY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS hourly_weather
(
    weatherId INTEGER PRIMARY KEY AUTOINCREMENT,
    Date DATE,
    HourlyPrecipitation FLOAT,
    HourlyWindSpeed FLOAT
);
"""

DAILY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS daily_weather
(
    weatherId INTEGER PRIMARY KEY AUTOINCREMENT
    Date DATE,
    DailyPrecipitation FLOAT,
    DailyAverageWindSpeed FLOAT
);
"""

TAXI_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS taxi_trips
(
    taxi_tripId INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_datetime DATE,
    distance FLOAT,
    pickup_longitude FLOAT,
    pickup_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_latitude FLOAT
);
"""

UBER_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS uber_trips
(
    uber_tripId INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_datetime DATE,
    distance FLOAT,
    pickup_longitude FLOAT,
    pickup_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_latitude FLOAT
);
"""

SUNRISE_SUNSET_SCHEMA = """
CREATE TABLE IF NOT EXISTS sunrise_sunsets
(
    sunrise_sunsetID INTEGER PRIMARY KEY AUTOINCREMENT,
    Date DATE,
    Sunrise INTEGER,
    Sunset INTEGER
);
"""

In [57]:
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)
    f.write(SUNRISE_SUNSET_SCHEMA)

In [59]:
with engine.connect() as connection:
    with open(DATABASE_SCHEMA_FILE, "r") as f:
        lines = f.readlines()
        query = []
        for line in lines:
            query.append(line)
            # if the line is a semicolon, execute the query
            if ";" in line:
                connection.execute(db.text("".join(query)))
                query = []

OperationalError: (sqlite3.OperationalError) near "Date": syntax error
[SQL: 
CREATE TABLE IF NOT EXISTS daily_weather
(
    weatherId INTEGER PRIMARY KEY AUTOINCREMENT
    Date DATE,
    DailyPrecipitation FLOAT,
    DailyAverageWindSpeed FLOAT
);
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [60]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
    "sun_data": sunrise_sunset_data
}

for table, df in map_table_name_to_dataframe.items():
    print("writing", table)
    df.to_sql(table, engine, if_exists='append', index=False)

writing taxi_trips
writing uber_trips
writing hourly_weather
writing daily_weather
writing sun_data


In [61]:
def write_query_to_file(query: str, outfile: str):
    '''Writes the query to the outfile.

    Keyword arguments:
    query -- The query to write.
    outfile -- The name of the file to write to.
    '''

    with open(QUERY_DIRECTORY + outfile, "w") as f:
        f.write(query)

In [62]:
QUERY_1 = """SELECT strftime('%H', pickup_datetime) AS time, COUNT(*) AS num
FROM taxi_trips
GROUP BY time
ORDER BY num DESC;
"""

In [63]:

engine.execute(QUERY_1).fetchall()

[('00', 20)]

In [64]:
write_query_to_file(QUERY_1, "most_popular_hour.sql")