In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('seaborn-white')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Load the March 2018 yellow-taxi trip file (path is relative to the working directory).
trips_csv = "yellow_tripdata_2018-03.csv"
df = pd.read_csv(trips_csv)

In [None]:
# Columns that the rest of this notebook never reads.
unused_columns = [
    'RatecodeID',
    'mta_tax',
    'tolls_amount',
    'improvement_surcharge',
    'store_and_fwd_flag',
]
df = df.drop(columns=unused_columns)

In [None]:
# Strip the 'tpep_' prefix from the two timestamp columns.
timestamp_names = {
    "tpep_pickup_datetime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
}
df = df.rename(columns=timestamp_names)

In [None]:
# Summary statistics of the loaded frame before any row filtering.
df.describe()

In [None]:
# Keep only plausible rides: bounded tip / fare / total amounts, a positive
# trip distance, at least one passenger, and an 'extra' charge within [0, 1].
plausible_ride = (
    df.tip_amount.between(0, 100)
    & df.fare_amount.between(2.5, 100)
    & df.total_amount.between(2.5, 300)
    & (df.trip_distance > 0)
    & (df.passenger_count > 0)
    & df.extra.between(0, 1)
)
# .copy() makes `cleasing` an independent frame: later cells add columns to
# it, which on a bare boolean-mask slice of `df` raises SettingWithCopyWarning.
cleasing = df[plausible_ride].copy()

In [None]:
# Summary statistics after the sanity filters, to compare against the unfiltered frame.
cleasing.describe()

In [None]:
# Distribution of fares, drawn with 1000 bins and zoomed to the 0-60 range.
fig, ax = plt.subplots(figsize=(14, 4))
n, bins, patches = ax.hist(cleasing.fare_amount, bins=1000, facecolor='blue', alpha=0.75)
ax.set_xlabel('Fare amount')
ax.set_title('Histogram of fare amount')
ax.set_xlim(0, 60)
plt.show();

In [None]:
# The 20 fare values that occur most often, with the number of rides for each.
fare_frequency = cleasing.groupby('fare_amount').size()
fare_frequency.nlargest(20)

In [None]:
# Bar chart of how many rides carried each passenger count.
rides_per_passenger_count = cleasing['passenger_count'].value_counts()
ax = rides_per_passenger_count.plot.bar(color='b', edgecolor='k')
ax.set_title('Histogram of passenger counts')
ax.set_xlabel('Passenger counts')
ax.set_ylabel('Count');

In [None]:
# Number of rides for each passenger count.
cleasing.groupby('passenger_count').size()
# Not applied here: rides with more than 6 passengers could be removed with
# cleasing = cleasing.loc[cleasing['passenger_count'] <= 6]

In [None]:
# A baseline model
from sklearn.model_selection import train_test_split

# Hold out 30% of the filtered rides for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(cleasing, test_size=0.3, random_state=42)
import numpy as np
import shutil

def distance_between(lat1, lon1, lat2, lon2):
  """Great-circle distance in kilometres between two points in decimal degrees.

  Uses the spherical law of cosines (not the haversine formula) on a sphere
  of radius 6371 km. Arguments may be scalars, NumPy arrays or pandas Series;
  array-likes are combined element-wise.

  The earlier scale factor ``60 * 1.515 * 1.609344`` mistyped 1.1515 (statute
  miles per nautical mile) and overstated every distance by roughly 32%.
  """
  earth_radius_km = 6371.0
  phi1, phi2 = np.radians(lat1), np.radians(lat2)
  delta_lambda = np.radians(lon2 - lon1)
  cos_angle = np.sin(phi1) * np.sin(phi2) + np.cos(phi1) * np.cos(phi2) * np.cos(delta_lambda)
  # Rounding can push the cosine marginally outside [-1, 1] for (near-)identical
  # points, which would make arccos return NaN; clamp it back into range.
  cos_angle = np.minimum(np.maximum(cos_angle, -1.0), 1.0)
  dist = earth_radius_km * np.arccos(cos_angle)
  return dist

def estimate_distance(df):
  """Return the pickup-to-dropoff distance for every row of ``df``.

  Reads the ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
  and ``dropoff_longitude`` columns and passes them to ``distance_between``.
  """
  pickup_lat, pickup_lon = df['pickup_latitude'], df['pickup_longitude']
  dropoff_lat, dropoff_lon = df['dropoff_latitude'], df['dropoff_longitude']
  return distance_between(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

def compute_rmse(actual, predicted):
  """Root-mean-square error between ``actual`` and ``predicted`` (element-wise difference)."""
  residuals = actual - predicted
  mean_squared = np.mean(residuals ** 2)
  return np.sqrt(mean_squared)

def print_rmse(df, rate, name):
  """Print the RMSE of the flat-rate estimate ``rate * estimate_distance(df)``
  against ``df['fare_amount']``, prefixed with the label ``name``."""
  actual_fare = df['fare_amount']
  predicted_fare = rate * estimate_distance(df)
  rmse = compute_rmse(actual_fare, predicted_fare)
  print("{1} RMSE = {0}".format(rmse, name))
  
# Flat per-km rate: mean training fare divided by mean estimated training distance.
mean_fare = train['fare_amount'].mean()
mean_distance = estimate_distance(train).mean()
rate = mean_fare / mean_distance

print("Rate = ${0}/km".format(rate))
for split_name, split in (('Train', train), ('Test', test)):
  print_rmse(split, rate, split_name)

In [None]:
# Feature engineering
# Parse the pickup timestamp once; values that cannot be parsed become NaT
# (errors='coerce'), so the derived parts are NaN for those rows.
pickup_ts = pd.to_datetime(cleasing['pickup_datetime'], errors='coerce')

# assign() returns a new frame rather than writing column by column into a
# frame that was produced by filtering, which avoids SettingWithCopyWarning.
cleasing = cleasing.assign(
    pickup_datetime=pickup_ts,
    year=pickup_ts.dt.year,
    month=pickup_ts.dt.month,
    day=pickup_ts.dt.day,
    weekday=pickup_ts.dt.weekday,  # Monday=0 ... Sunday=6
    hour=pickup_ts.dt.hour,
)

In [None]:
# Summary statistics including the new year/month/day/weekday/hour columns.
cleasing.describe()

In [None]:
# Keep only rides whose pickup timestamp falls in March 2018.
in_march_2018 = (cleasing['year'] == 2018) & (cleasing['month'] == 3)
cleasing = cleasing.loc[in_march_2018]

In [None]:
# Summary statistics after restricting to year == 2018 and month == 3.
cleasing.describe()

In [None]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
# Fit on numeric predictors only: the frame still holds timestamp columns
# that LinearRegression cannot consume.
reg_features = cleasing.drop('total_amount', axis=1).select_dtypes(include='number')
# The bare `LinearRegression(copy_X=..., normalize=False)` line that followed
# the fit was a pasted output repr; it raised NameError because only
# `linear_model` is imported in this cell, so it has been removed.
reg.fit(reg_features, cleasing['total_amount'])
# Fitted weights are available as reg.coef_

In [None]:
from math import radians, cos, sin, asin, sqrt
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Haversine great-circle distance, in kilometres, between two points
    given in decimal degrees.

    Note the argument order: longitude first, then latitude, for each point.
    Arguments are passed through NumPy functions, so scalars and array-likes
    can be mixed and are combined element-wise.
    """
    earth_radius_km = 6371  # use 3956 for miles

    lam1, phi1, lam2, phi2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2

    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return earth_radius_km * central_angle

# haversine_np takes (lon1, lat1, lon2, lat2); the previous call passed
# latitude first, which swaps the roles of the two angles and gives wrong distances.
# NOTE(review): assumes the frame has pickup/dropoff latitude and longitude
# columns - confirm they are present in this trip file.
cleasing = cleasing.assign(
    distance=haversine_np(
        cleasing['pickup_longitude'],
        cleasing['pickup_latitude'],
        cleasing['dropoff_longitude'],
        cleasing['dropoff_latitude'],
    )
)

In [None]:
JFK_coord = (40.6413, -73.7781)  # (latitude, longitude)
jfk_lat, jfk_lon = JFK_coord

# haversine_np expects longitude before latitude for each point.
pickup_JFK = haversine_np(cleasing['pickup_longitude'], cleasing['pickup_latitude'], jfk_lon, jfk_lat)
dropoff_JFK = haversine_np(jfk_lon, jfk_lat, cleasing['dropoff_longitude'], cleasing['dropoff_latitude'])

# Distance to JFK from whichever end of the ride is closer.
cleasing = cleasing.assign(JFK_distance=pd.concat([pickup_JFK, dropoff_JFK], axis=1).min(axis=1))

# The raw pickup timestamp has already been expanded into year/month/day/
# weekday/hour. It was renamed from 'tpep_pickup_datetime' when the data was
# loaded, so deleting it under the old name raised KeyError.
# errors='ignore' keeps this cell re-runnable.
cleasing = cleasing.drop(columns=['pickup_datetime'], errors='ignore')
cleasing.head()

In [None]:
# Distribution of the computed ride distance, drawn with 1000 bins.
fig, ax = plt.subplots(figsize=(14, 4))
n, bins, patches = ax.hist(cleasing.distance, bins=1000, facecolor='blue', alpha=0.75)
ax.set_xlabel('distance')
ax.set_title('Histogram of ride distance')
plt.show();

In [None]:
# Linear regression model
from sklearn.model_selection import train_test_split
y = cleasing['fare_amount']
# Exclude columns that leak the target: total_amount is the billed total and
# includes the fare (per the TLC data dictionary), and tip_amount is only
# known once the ride is over. Leaving them in makes the reported RMSE
# meaninglessly optimistic.
leaky_columns = ['total_amount', 'tip_amount']
# Keep numeric predictors only; the estimators cannot consume string or
# datetime columns.
X = cleasing.drop(columns=['fare_amount'] + leaky_columns).select_dtypes(include='number')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

from sklearn.metrics import mean_squared_error
# RMSE on the held-out 30%.
print("Test RMSE: %.3f" % mean_squared_error(y_test, y_pred) ** 0.5)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (100 trees, depth 2) fitted on the same split as the linear model.
rf = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred)
print("Test RMSE: %.3f" % rf_mse ** 0.5)