Before you turn in the homework, make sure everything runs as expected. To do so, select **Kernel**$\rightarrow$**Restart & Run All** in the toolbar above. Remember to submit on both **DataHub** and **Gradescope**.

Please fill in your name and include a list of your collaborators below.

In [1]:
# Author name and collaborator list requested by the assignment header.
NAME = "William Sheu"
COLLABORATORS = ""

---

# Project 2: NYC Taxi Rides
# Extras

Put all of your extra work in here. Feel free to save figures to use when completing Part 4.

In [2]:
import os
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from utils import fetch_and_cache
from pathlib import Path
from sqlalchemy import create_engine
from datetime import datetime
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [3]:
# Test rides read from CSV; predictions for these rows are produced further
# down in the notebook.
test_df = pd.read_csv("./proj2_test_data.csv")
# SQLite database holding the 2016 taxi records.
DB_URI = "sqlite:////srv/db/taxi_2016_student_small.sqlite"
# NOTE: TABLE_NAME is not interpolated into the query; the SQL below names the
# `taxi` table directly.
TABLE_NAME = "taxi"
# The query is a stack of nested sub-selects, read from the inside out:
#   1. rows of `taxi` whose pickup time lies BETWEEN '2016-01-01' AND
#      '2016-07-01' and whose record_id is a multiple of 100, ordered by
#      pickup time;
#   2. add `duration` = julianday(dropoff) - julianday(pickup), i.e. measured
#      in days, and keep rows with duration < 0.1157407 days (about 10,000 s);
#   3. keep rows whose pickup and dropoff both lie inside the box
#      longitude [-74.03, -73.75] x latitude [40.63, 40.85];
#   4. keep rows with at least one passenger.
query = """    SELECT *
    FROM (
    SELECT *
    FROM (
SELECT *, julianday(tpep_dropoff_datetime) - julianday(tpep_pickup_datetime) as duration
FROM (
            SELECT * 
            FROM taxi
            WHERE tpep_pickup_datetime
                BETWEEN '2016-01-01' AND '2016-07-01'
                AND record_id % 100 == 0
            ORDER BY tpep_pickup_datetime
            )
WHERE duration < 0.1157407
            )
    WHERE (
            pickup_longitude <= -73.75 AND
            pickup_longitude >= -74.03 AND
            dropoff_longitude <= -73.75 AND
            dropoff_longitude >= -74.03 AND
            pickup_latitude <= 40.85 AND
            pickup_latitude >= 40.63 AND
            dropoff_latitude <= 40.85 AND
            dropoff_latitude >= 40.63 
            )
            )
    WHERE (passenger_count > 0)"""
sql_engine = create_engine(DB_URI)
processed_df = pd.read_sql_query(query, sql_engine)
# The timestamps come back from the query and are parsed into datetimes here.
processed_df['tpep_pickup_datetime'] = pd.to_datetime(processed_df['tpep_pickup_datetime'])
processed_df['tpep_dropoff_datetime'] = pd.to_datetime(processed_df['tpep_dropoff_datetime'])
# julianday differences are in days; 86400 seconds per day converts to seconds.
processed_df['duration'] = processed_df['duration']*86400

In [4]:
# Copied from part 2
def haversine(lat1, lng1, lat2, lng2):
    """
    Great-circle (haversine) distance between two points.

    All four coordinates are given in degrees. The result is scaled by an
    Earth radius of 6371 (the mean radius in kilometres). The computation
    uses NumPy ufuncs throughout, so scalars and arrays/Series are handled
    element-wise.
    """
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    earth_radius = 6371
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(d_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam * 0.5) ** 2
    return 2 * earth_radius * np.arcsin(np.sqrt(hav))

# Copied from part 2
def manhattan_distance(lat1, lng1, lat2, lng2):
    """
    Manhattan-style distance between two points given in degrees.

    It is the sum of two haversine legs that both start at (lat1, lng1):
    one that changes only the longitude to lng2, and one that changes only
    the latitude to lat2.
    """
    longitude_leg = haversine(lat1, lng1, lat1, lng2)
    latitude_leg = haversine(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

# Copied from part 2
def bearing(lat1, lng1, lat2, lng2):
    """
    Initial bearing, in degrees, when heading from (lat1, lng1) to (lat2, lng2).

    Inputs are in degrees. 0 means due NORTH and 90 due east; the value
    comes from arctan2, so it lies in the range [-180, 180].
    """
    d_lam = np.radians(lng2 - lng1)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    # East-pointing and north-pointing components of the direction of travel.
    east = np.sin(d_lam) * np.cos(phi2)
    north = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(d_lam)
    return np.degrees(np.arctan2(east, north))

# Copied from part 2
def add_time_columns(df):
    """
    Add temporal features derived from ``tpep_pickup_datetime`` to df.

    The columns ``month``, ``week_of_year``, ``day_of_month``,
    ``day_of_week``, ``hour`` and ``week_hour`` (day_of_week * 24 + hour)
    are written into ``df`` itself. The same object is returned so the
    function can be chained with ``DataFrame.pipe``.

    Input:
      df (DataFrame): must contain a datetime column 'tpep_pickup_datetime'.
        Pass a frame you own (e.g. the result of ``.copy()``); on older
        pandas, writing into a slice of another frame may emit a
        SettingWithCopyWarning.
    Output:
      the same DataFrame, with the six columns added
    """
    # The former `df.is_copy = False` line is gone: that setter is deprecated
    # (it produced warnings on every call) and does nothing on current pandas.
    pickup = df['tpep_pickup_datetime'].dt
    df['month'] = pickup.month
    if hasattr(pickup, 'isocalendar'):
        # `.dt.weekofyear` no longer exists on current pandas; isocalendar()
        # is its replacement and returns the ISO week as nullable UInt32.
        # Cast to a plain dtype, using float only when NaT left gaps.
        iso_week = pickup.isocalendar().week
        df['week_of_year'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # Older pandas without isocalendar().
        df['week_of_year'] = pickup.weekofyear
    df['day_of_month'] = pickup.day
    df['day_of_week'] = pickup.dayofweek
    df['hour'] = pickup.hour
    # Hour index within the week: Monday 00:00 is 0, Sunday 23:00 is 167.
    df['week_hour'] = pickup.weekday * 24 + df['hour']
    return df

# Copied from part 2
def add_distance_columns(df):
    """
    Add distance features to df

    Computes ``manhattan``, ``bearing`` and ``haversine`` (in that order)
    from the pickup and dropoff coordinates. The columns are written into
    ``df`` itself, and the same object is returned so the function can be
    chained with ``DataFrame.pipe``.

    Input:
      df (DataFrame): must contain 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'. Pass a frame you own
        (e.g. the result of ``.copy()``); on older pandas, writing into a
        slice of another frame may emit a SettingWithCopyWarning.
    Output:
      the same DataFrame, with the three columns added
    """
    # The former `df.is_copy = False` line is gone: that setter is deprecated
    # (it produced warnings on every call) and does nothing on current pandas.
    endpoints = dict(lat1=df['pickup_latitude'],
                     lng1=df['pickup_longitude'],
                     lat2=df['dropoff_latitude'],
                     lng2=df['dropoff_longitude'])

    df['manhattan'] = manhattan_distance(**endpoints)
    df['bearing'] = bearing(**endpoints)
    df['haversine'] = haversine(**endpoints)
    return df

def select_columns(data, *columns):
    """
    Return a DataFrame holding only the named columns, in the order given.

    Input:
      data (DataFrame): source frame
      *columns: column labels to keep
    Output:
      a DataFrame with exactly those columns
    """
    # `columns` arrives as a tuple. Passing a tuple to .loc is ambiguous: on
    # MultiIndex columns it is read as one hierarchical key. A list is always
    # treated as "select each of these labels".
    return data.loc[:, list(columns)]

def mae(actual, predicted):
    """
    Mean absolute error between two equally shaped vectors.

    Input:
      actual (1D array-like): observed values
      predicted (1D array-like): predicted/fitted values
    Output:
      a float: the mean of |actual - predicted|
    """
    absolute_errors = np.abs(actual - predicted)
    return np.mean(absolute_errors)

def generate_submission(test, predictions, force=False):
    """
    Write predictions to a timestamped CSV under ./submissions for Kaggle.

    Input:
      test (DataFrame): the test set; its index supplies the 'id' column
      predictions (1D array-like): predicted durations, one per row of test
      force (bool): nothing is written unless this is True, so re-running
        the notebook does not create a new file each time
    Output:
      None. When force is True a file submissions/submission_<timestamp>.csv
      with columns 'id' and 'duration' is created and two messages are
      printed.
    """
    if not force:
        return

    if not os.path.isdir("submissions"):
        os.mkdir("submissions")

    # Use the `test` argument for the ids; the previous version read the
    # global `test_df` and ignored the frame passed in.
    submission_df = pd.DataFrame({
        "id": test.index.values,
        "duration": predictions,
    },
        columns=['id', 'duration'])

    # ISO timestamp with the fractional seconds dropped.
    timestamp = datetime.isoformat(datetime.now()).split(".")[0]

    submission_df.to_csv(f'submissions/submission_{timestamp}.csv', index=False)

    print(f'Created a CSV file: submission_{timestamp}.csv')
    print('You may now upload this CSV file to Kaggle for scoring.')

In [5]:
# Hold out 20% of the processed rides for validation; the seed is fixed so the
# split is reproducible.
train_df, val_df = train_test_split(processed_df, test_size=0.2, random_state=42)

# Mean pickup/dropoff coordinates of the training rides. process_data_gm2 uses
# them to replace out-of-range coordinates when it is given test data.
AVERAGE_LONG_DROP = train_df['dropoff_longitude'].mean()
AVERAGE_LAT_DROP = train_df['dropoff_latitude'].mean()
AVERAGE_LONG_PICK = train_df['pickup_longitude'].mean()
AVERAGE_LAT_PICK = train_df['pickup_latitude'].mean()

def process_data_gm2(data, test=False):
    """
    Build the design matrix (and target) for the linear duration model.

    Input:
      data (DataFrame): taxi rides; never modified, a copy is processed
      test (bool): False for training/validation data, which must contain a
        'duration' column. True for test data, which has no target.
    Output:
      (X, y): X holds the columns pickup_longitude, pickup_latitude,
      dropoff_longitude, dropoff_latitude, manhattan, fare_amount, hour and
      month. y is the 'duration' Series, or None when test is True.

    With test=False, rides with duration >= 10000 are dropped. With
    test=True no row may be dropped, so coordinates that are clearly out of
    range (longitude > -70 or latitude < 35, e.g. 0.0) are replaced by the
    module-level AVERAGE_* values instead.
    """
    if test:
        data1 = data.copy()
        data1.loc[data1['dropoff_longitude'] > -70, 'dropoff_longitude'] = AVERAGE_LONG_DROP
        data1.loc[data1['dropoff_latitude'] < 35, 'dropoff_latitude'] = AVERAGE_LAT_DROP
        data1.loc[data1['pickup_longitude'] > -70, 'pickup_longitude'] = AVERAGE_LONG_PICK
        data1.loc[data1['pickup_latitude'] < 35, 'pickup_latitude'] = AVERAGE_LAT_PICK
    else:
        # Filter first, then copy: only the surviving rows are duplicated and
        # the result is an independent frame rather than a slice of a
        # temporary copy.
        data1 = data[data['duration'] < 10000].copy()

    X = (
        data1.pipe(add_time_columns)
        .pipe(add_distance_columns)
        .pipe(select_columns,
              'pickup_longitude',
              'pickup_latitude',
              'dropoff_longitude',
              'dropoff_latitude',
              'manhattan',
              'fare_amount',
              'hour',
              'month')
    )
    y = None if test else data1['duration']

    return X, y

In [6]:
# Build features for both splits, fit an ordinary least-squares model on the
# training rides and report the validation error.
X_train, y_train = process_data_gm2(train_df)
X_val, y_val = process_data_gm2(val_df)
guided_model_2 = lm.LinearRegression(fit_intercept=True)
guided_model_2.fit(X_train, y_train)
y_val_pred = guided_model_2.predict(X_val)
# mae is documented as mae(actual, predicted); pass the arguments in that order.
print(mae(y_val, y_val_pred))

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


175.869207619


In [7]:
# Fitted coefficients, one per feature column of X_train.
print(guided_model_2.coef_)
# First ten training rides; the notebook displays this because it is the
# last expression in the cell.
train_df.head(10)

[-1466.7188094    923.73865708 -1740.60100377   107.43336519     2.50218244
    61.37640329     2.88886216    10.32365152]


Unnamed: 0,record_id,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,...,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,duration
30390,15892900,1,2016-02-10 14:46:02,2016-02-10 15:13:52,1,2.7,-73.95636,40.780994,1,N,...,40.750458,2,17.5,0.0,0.5,0.0,0.0,0.3,18.3,1669.999994
124167,59546000,2,2016-06-03 22:19:02,2016-06-03 22:25:05,1,0.66,-73.981102,40.729362,1,N,...,40.723888,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,363.000003
65114,29985300,2,2016-03-23 10:50:48,2016-03-23 10:59:22,1,2.06,-73.951424,40.774158,1,N,...,40.757431,1,9.0,0.0,0.5,1.96,0.0,0.3,11.76,514.000028
113090,54223000,2,2016-05-20 13:01:23,2016-05-20 13:08:20,1,0.59,-73.977104,40.758656,1,N,...,40.752918,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8,416.999991
92178,44175100,1,2016-04-25 08:39:56,2016-04-25 08:42:36,1,0.6,-73.962822,40.766758,1,N,...,40.773705,2,4.0,0.0,0.5,0.0,0.0,0.3,4.8,159.999977
43260,19244300,1,2016-02-26 05:53:23,2016-02-26 05:55:53,1,1.1,-73.988708,40.748734,1,N,...,40.756901,1,5.0,0.5,0.5,1.7,0.0,0.3,8.0,150.000031
98668,47420300,1,2016-05-03 08:53:52,2016-05-03 09:05:52,1,1.6,-73.971924,40.76009,1,N,...,40.776463,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3,719.999997
61205,27975300,1,2016-03-18 14:01:48,2016-03-18 14:09:26,1,1.4,-73.966522,40.761776,1,N,...,40.778275,2,7.5,0.0,0.5,0.0,0.0,0.3,8.3,458.00001
135630,64839900,1,2016-06-18 11:17:10,2016-06-18 11:34:00,1,2.3,-73.969917,40.784752,1,N,...,40.758629,1,13.0,0.0,0.5,2.75,0.0,0.3,16.55,1009.999993
91719,43943300,1,2016-04-24 14:09:57,2016-04-24 14:28:49,3,4.8,-74.008064,40.739506,1,N,...,40.675411,1,17.5,0.0,0.5,3.65,0.0,0.3,21.95,1132.000016


In [8]:
# Parse the pickup timestamps and attach the distance features to the test set.
test_df['tpep_pickup_datetime'] = pd.to_datetime(test_df['tpep_pickup_datetime'])
test_df = add_distance_columns(test_df)

# Show the coordinates of test rides whose Manhattan distance is exactly zero.
zero_distance = test_df['manhattan'] == 0
coordinate_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
test_df.loc[zero_distance, coordinate_columns].head(10)

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
10,0.0,0.0,0.0,0.0
296,0.0,0.0,0.0,0.0
308,0.0,0.0,0.0,0.0
389,0.0,0.0,0.0,0.0
565,-74.004814,40.737522,-74.004814,40.737522
585,0.0,0.0,0.0,0.0
670,-73.818542,40.69482,-73.818542,40.69482
705,-73.971169,40.76429,-73.971169,40.76429
795,-73.984169,40.695702,-73.984169,40.695702
838,0.0,0.0,0.0,0.0


In [10]:
# Predict durations for the test rides; astype(int) truncates them to whole
# seconds.
X_test, _ = process_data_gm2(test_df, test=True)
final_predictions = guided_model_2.predict(X_test).astype(int)
generate_submission(test_df, final_predictions, force=False)  # Set force=True to write the submission file
final_predictions

Created a CSV file: submission_2018-12-04T23:52:45.csv
You may now upload this CSV file to Kaggle for scoring.


  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


array([ 409, 1861,  544, ...,  964,  632,  437])

In [12]:
data_file = Path("data", "data_extra.hdf")  # Path of hdf file
# Pass `key` by keyword: newer pandas deprecates positional arguments to
# to_hdf other than the path, and the keyword form works on old versions too.
train_df.to_hdf(data_file, key="train_extra_df")

## Submission

You're almost done!

Before submitting this assignment, ensure that you have:

1. Restarted the Kernel (in the menubar, select Kernel$\rightarrow$Restart & Run All)
2. Validated the notebook by clicking the "Validate" button.

Then,

1. **Submit** the assignment via the Assignments tab in **DataHub**
1. **Upload and tag** the manually reviewed portions of the assignment on **Gradescope**