# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Project Setup

In [42]:
# all import statements needed for the project

import math
from math import sin, cos, sqrt, atan2, radians
import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import geopandas as gpd


In [39]:
# any constants need; some have been added for you, and 
# some you need to fill in

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_DATA = "uber_rides_sample.csv"
WEATHER_CSV_DIR = "weather"

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [26]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Load Taxi Zones

In [60]:
def load_taxi_zones(shapefile):
    return gpd.read_file(shapefile)     #read shapefile

In [74]:
def lookup_coords_for_taxi_zone_id(zone_loc_id):
    zones= load_taxi_zones(TAXI_ZONES_SHAPEFILE)
    centroid= zones[zones['LocationID'] == zone_loc_id].centroid.values[0]
    return (centroid.y, centroid.x)        # return the centroid coordinates

### Calculate distance

In [27]:
def calculate_distance_with_coords(from_coord, to_coord):
    R = 6373 # approximate radius of the earth in km
    lat1, lon1 = from_coord
    lat2, lon2 = to_coord

    dlat = radians(lat2-lat1)
    dlon = radians(lon2-lon1)
    a = sin(dlat/2) * sin(dlat/2) + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2) * sin(dlon/2)
    c = 2 * atan2(math.sqrt(a), sqrt(1-a))
    d = R * c
    return d

In [76]:
def calculate_distance_with_zones(from_zone, to_zone):
    from_zone_coord= lookup_coords_for_taxi_zone_id(from_zone)
    to_zone_coord= lookup_coords_for_taxi_zone_id(to_zone)   # get two coordinates from two zone ids
    
    return calculate_distance_with_coords(from_zone_coord, to_zone_coord)

In [29]:
def add_distance_column(dataframe):                #如果是zone???
    distance=[]
    for index, row in dataframe.iterrows():
        from_coord=(row['pickup_latitude'],row['pickup_longitude'])
        to_coord=(row['dropoff_latitude'],row['dropoff_longitude'])
        distance.append(calculate_distance_with_coords(from_coord,to_coord))  
    dataframe= pd.concat([dataframe,pd.DataFrame(distance,columns=['distance'])], axis=1)
    return dataframe

### Process Taxi Data

In [None]:
def get_all_urls_from_taxi_page(taxi_page):
    raise NotImplementedError()

In [None]:
def filter_taxi_parquet_urls(all_urls):
    raise NotImplementedError()

In [None]:
def get_and_clean_month(url):
    raise NotImplementedError()

In [None]:
def get_and_clean_taxi_data(parquet_urls):
    all_taxi_dataframes = []
    
    for parquet_url in parquet_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_month(parquet_url)
        add_distance_column(dataframe)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.contact(all_taxi_dataframes)
    return taxi_data

In [None]:
def get_taxi_data():
    all_urls = get_all_urls_from_taxi_page(TAXI_URL)
    all_parquet_urls = find_taxi_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [None]:
taxi_data = get_taxi_data()

In [None]:
taxi_data.head()

### Processing Uber Data

In [30]:
def load_and_clean_uber_data(csv_file):
    df = pd.read_csv(csv_file)
    df = df.dropna(how='any')      #remove missing values
    df = df.drop(columns=['Unnamed: 0','key','passenger_count'])      #remove unnecessary columns
    df = df.loc[df.pickup_longitude.between(-74.242330,-73.717047) & df.dropoff_longitude.between(-74.242330,-73.717047) 
         & df.pickup_latitude.between(40.560445,40.908524)& df.dropoff_latitude.between(40.560445,40.908524)]
          # remove locations out of range
    df.reset_index(drop=True, inplace=True)    #reset index
    return df

In [31]:
def get_uber_data():
    uber_dataframe = load_and_clean_uber_data(UBER_DATA)
    uber_dataframe = add_distance_column(uber_dataframe)
    uber_dataframe["pick_date"] = pd.to_datetime(uber_dataframe['pickup_datetime'])
    uber_dataframe["year"] = uber_dataframe["pick_date"].dt.year
    uber_dataframe["month"] = uber_dataframe["pick_date"].dt.month
    uber_dataframe["day"] = uber_dataframe["pick_date"].dt.day
    uber_dataframe["hour"] = uber_dataframe["pick_date"].dt.hour
    uber_dataframe["dayofweek"] = uber_dataframe["pick_date"].dt.dayofweek
    return uber_dataframe

In [32]:
uber_data = get_uber_data()

In [33]:
uber_data.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance,pick_date,year,month,day,hour,dayofweek
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1.683851,2015-05-07 19:52:06+00:00,2015,5,7,19,3
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,2.458361,2009-07-17 20:04:56+00:00,2009,7,17,20,4
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,5.037958,2009-08-24 21:45:00+00:00,2009,8,24,21,0
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,1.662205,2009-06-26 08:22:21+00:00,2009,6,26,8,4
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,4.476855,2014-08-28 17:47:00+00:00,2014,8,28,17,3


### Processing Weather Data

In [34]:
def get_all_weather_csvs(directory):
    csv_files = [file for file in os.listdir(directory) if file.endswith(".csv")]
    return csv_files

In [38]:
get_all_weather_csvs(WEATHER_CSV_DIR)

['2012_weather.csv',
 '2011_weather.csv',
 '2014_weather.csv',
 '2013_weather.csv',
 '2009_weather.csv',
 '2015_weather.csv',
 '2010_weather.csv']

In [None]:
def clean_month_weather_data_hourly(csv_file):
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_daily(csv_file):
    raise NotImplementedError()

In [None]:
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    raise NotImplemented()

In [None]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
engine.execute(QUERY_1).fetchall()

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)