# NYC Taxi Fare Prediction
COMP9417 Group Project    
Why Axis Group    
UNSW 2019T2

### Technical Requirements

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv, math, random, os
from haversine import haversine, Unit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# randomly pick a sample from the full dataset
#rows = 1_000_000 # number or rows in file
#sample_size = 500_000 # training size

#skip = sorted(random.sample(range(1, rows + 1), rows - sample_size))

#df_raw_train = pd.read_csv('../input/train.csv', nrows=5_000_000)
df_raw_test = pd.read_csv('datasets/test.csv')

In [3]:
df_raw_train.head()

NameError: name 'df_raw_train' is not defined

In [None]:
df_raw_train.describe()

## Data Preprocessing
Lets remove anomalies to make sure the data we are using will not affect our model during training. These are few things that we should consider:
* null and missing values
* duplicate data
* minimum taxi fare should be $2.50 (https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page)
* passenger_count must be greater than or equal to 1 and less than or equal to 4 (0 means the ride was either cancelled or rejected, a car can only fit at most 4 passengers)
* latitude and longitude coordinates should be within NYC (https://www.latlong.net/place/new-york-city-ny-usa-1848.html)

The center coordinates of NYC are (40.730610, -73.935242), we pick a margin from this center to capture all the data points within this margin.

In [8]:
NYC_LONG = -73.935242
NYC_LAT = 40.730610

# margin from the center coordinates. 
# increase this value if you want to capture more data
MARGIN = 0.8

# longitude range
min_long = float(NYC_LONG - MARGIN)
max_long = float(NYC_LONG + MARGIN)

# latitude range
min_lat = float(NYC_LAT - MARGIN)
max_lat = float(NYC_LAT + MARGIN)

def data_cleaning(df):
    # remove null values
    df.dropna(inplace=True)
    
    # remove duplicates
    df.drop_duplicates(inplace=True)
    
    # remove longitude that is out of range
    df = df[(df.pickup_longitude < max_long) & (df.pickup_longitude > min_long)]
    df = df[(df.dropoff_longitude < max_long) & (df.dropoff_longitude > min_long)]

    # remove latitude that is out of range
    df = df[(df.pickup_latitude < max_lat) & (df.pickup_latitude > min_lat)]
    df = df[(df.dropoff_latitude < max_lat) & (df.dropoff_latitude > min_lat)]

    # remove amount less than the minimum taxi fare
    if 'fare_amount' in df:
        df.drop(df[df.fare_amount < 2.50].index, inplace=True)
        df.drop(df[df.fare_amount > 300.00].index, inplace=True)

    # remove passenger count less than 1 and greater than 4
    df.drop(df[df.passenger_count < 1].index, inplace=True) # has to be one passenger in the car
    df.drop(df[df.passenger_count > 4].index, inplace=True) # not more than 4 passengers can fit in a car
    
    return df

## Feature Engineering
The features in our data are important to the predictive models we use and will influence the results we are going to achieve. The quality and quantity of the features will have great influence on whether the model is good or not.

We could say the better the features are, the better the result is. This isn't entirely true, because the results achieved also depend on the model and the data, not just the chosen features. That said, choosing the right features is still very important. Better features can produce simpler and more flexible models, and they often yield better results. (https://en.wikipedia.org/wiki/Feature_engineering)

Date and time of pickups is also a key feature because some charges are different for weekends and weekdays and some depend on the time of day whether you are catching a taxi during the day or at night time.

Trips to the three main airports in New York have different rates from the standard rate https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page. We need to ensure these fixed rates will not affect the standard rates so we include pickups and dropoffs from these main airports as separate features. The three main airports are:
* LaGuardia Airport (LGA) (40.77064 -73.86764)
* John F. Kennedy Airport (JFK) (40.64459 -73.78295)
* Newark Airport (EWR) (40.69211 -74.18288)   

Coordinates from (https://latitude.to/)

### Geolocation Distance
Distance between pickups and dropoffs is the major factor in determining taxi fares. Before adding these features we need to define our distance function to calculate the distance between each pickups and dropoffs.

In [9]:
def haversine(df):
    pickup_location = (df['pickup_latitude'], df['pickup_longitude'])
    dropoff_location = (df['dropoff_latitude'], df['dropoff_longitude'])
    distance = haversine(pickup_location, dropoff_location)
    return distance

def haversine(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    pickup_lat, pickup_long, dropoff_lat, dropoff_long = map(np.radians, [pickup_lat, pickup_long, dropoff_lat, dropoff_long])
    lat_dist = dropoff_lat - pickup_lat
    long_dist = dropoff_long - pickup_long
    a = np.sin(lat_dist/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(long_dist/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    distance = r * c
    return distance

def manhattan(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    return np.abs(dropoff_lat - pickup_lat) + np.abs(dropoff_long - pickup_long)

def euclidean(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    return pow(pow((dropoff_lat - pickup_lat), 2) + pow((dropoff_long - pickup_long), 2), 0.5)



In [10]:
LGA_lat = 40.77064 
LGA_long = -73.86764
JFK_lat = 40.64459
JFK_long = -73.78295
EWR_lat = 40.69211
EWR_long = -74.18288


def feature_engineering(df):
    # split pickup_datetime    
    df['pickup_datetime'] =  pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    df['year'] = df['pickup_datetime'].apply(lambda x: x.year)
    df['month'] = df['pickup_datetime'].apply(lambda x: x.month)
    df['day'] = df['pickup_datetime'].apply(lambda x: x.day)
    df['hour'] = df['pickup_datetime'].apply(lambda x: x.hour)
    df['day_of_week'] = df['pickup_datetime'].apply(lambda x: x.dayofweek)
    
    # add distances to major airports
    df['distance'] = haversine(df.pickup_latitude, df.pickup_longitude, df.dropoff_latitude, df.dropoff_longitude) 
    df['manhattan_dist'] = manhattan(df.pickup_latitude, df.pickup_longitude, df.dropoff_latitude, df.dropoff_longitude) 
    #df['euclidean_dist'] = euclidean(df.pickup_latitude, df.pickup_longitude, df.dropoff_latitude, df.dropoff_longitude) 
    df['pickup_LGA_dist'] = haversine(df['pickup_latitude'], df['pickup_longitude'], LGA_lat, LGA_long)
    df['dropoff_LGA_dist'] = haversine(df['dropoff_latitude'], df['dropoff_longitude'], LGA_lat, LGA_long)
    df['pickup_JFK_diste'] = haversine(df['pickup_latitude'], df['pickup_longitude'], JFK_lat, JFK_long)
    df['dropoff_JFK_dist'] = haversine(df['dropoff_latitude'], df['dropoff_longitude'], JFK_lat, JFK_long)
    df['pickup_EWR_dist'] = haversine(df['pickup_latitude'], df['pickup_longitude'], EWR_lat, EWR_long)
    df['dropoff_EWR_dist'] = haversine(df['dropoff_latitude'], df['dropoff_longitude'], EWR_lat, EWR_long)
    
    # 50 cents overnight surcharge 8pm to 6am. 1 indicating theres a surcharge and 0 otherwise.
    df['overnight'] = np.where((df['hour'] >= 20) | (df['hour'] <= 6), 1, 0)
    
    # trips that goes in circle in the same area
    # indicated by high fare but very small distance, benchmark is if you go from Manhattan 
    # to JFK airport with approximately $60 fare for approximately 30kilometers
    # as indicated by TLC
    if 'fare_amount' in df:
        df.drop(df[(df.distance < 10) & (df.fare_amount > 150)].index, inplace=True)

    return df


In [None]:
df_clean_temp = data_cleaning(df_raw_train)
df_temp = feature_engineering(df_clean_temp)
df_temp.plot.scatter('distance', 'fare_amount')
plt.show()

From the distribution above our hypothesis seem to be correct. There is a linear distribution between distance and the fare amount. Lets apply the preprocessing and feature engineering to both our training and test dataset.

In [11]:
# keeping the original dataframe, if any changes are made
#df_train = df_raw_train.copy(deep=True)
df_test = df_raw_test.copy(deep=True)

#df_clean_train = data_cleaning(df_train)
#df_train = feature_engineering(df_clean_train)
df_clean_test = data_cleaning(df_test)
df_test = feature_engineering(df_clean_test)

In [None]:
df_train.describe()

In [None]:
# Check corr of 'fare_amount' to all the other variables
print(df_train.corrwith(df_train['fare_amount']))

### Feature Scaling

We need to scale the features to ensure they have an uniform range so that large number features do not dominate features with low numbers.

In [11]:
cols = ['key','fare_amount','pickup_datetime', 'passenger_count']
df_feature = df_train.drop(columns=cols)
df_target = df_train.fare_amount

# scale features
scaler = MinMaxScaler()
#scaler = StandardScaler()

cols = df_feature.columns.tolist()
df_feature[cols] = scaler.fit_transform(df_feature[cols])
# df_scaled = pd.concat([df_feature, df_target], axis=1)

NameError: name 'df_train' is not defined

In [None]:
df_feature.head()

Lets split our data into train and test subsets. (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) with 20% test and 80% train.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.10)

### DNN Regressor model

In [None]:

model = Sequential()
model.add(Dense(256, activation= 'relu', input_dim=X_train.shape[1]))
model.add(Dense(128, activation= 'relu'))
model.add(Dense(64, activation= 'relu'))
model.add(Dense(32, activation= 'relu'))
#model.add(Dense(16, activation= 'relu'))
#model.add(Dense(8, activation= 'relu'))
model.add(Dense(1))

#LEARNING_RATE = 0.01
#adam = Adam(lr=LEARNING_RATE)

model.compile(loss='mse', optimizer='adam', metrics=['accuracy', 'mae'])
history = model.fit(X_train, y_train, epochs=60, batch_size=256, validation_split=0.3)
# random sample data for bagging
#df.sample(frac=0.5, replace=True, random_state=1)

# check
# https://github.com/dimitreOliveira/NewYorkCityTaxiFare/blob/master/keras_model.py

Save model

In [None]:
model.save('60e_5mil_loss12.0425_valloss_12.5486.h5')

# Prediction

In [None]:
# Results
train_pred = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
print("Train RMSE: {:0.2f}".format(train_rmse))
print("Test RMSE: {:0.2f}".format(test_rmse))
print('------------------------')
def predict(df, X_test, model):
    sample = X_test.sample(n=1, random_state=np.random.randint(low=0, high=10000))
    idx = sample.index[0]

    actual_fare = df.loc[idx,'fare_amount']
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_of_week = day_names[df.loc[idx,'day_of_week']]
    hour = df.loc[idx,'hour']
    predicted_fare = model.predict(sample)[0][0]
    rmse = np.sqrt(np.square(predicted_fare - actual_fare))

    print("Trip Details: {}, {}:00hrs".format(day_of_week, hour))  
    print("Actual fare: ${:0.2f}".format(actual_fare))
    print("Predicted fare: ${:0.2f}".format(predicted_fare))
    print("RMSE: ${:0.2f}".format(rmse))
    

# randomly predict from test sample
predict(df_train, X_test, model)


In [None]:
print(history.history.keys())

Accuracy summary

In [None]:
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Loss summary

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Linear Regression

In [None]:
#y = taxi['fare_amount']
#X = taxi.drop(columns=['fare_amount'])
#X_train, X_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.3, random_state=42)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("Test RMSE: %.3f" % mean_squared_error(y_test, y_pred) ** 0.5)

Linear Regression plot

In [None]:
# for the "Training data"
X = X_train.distance
# target data is array of shape (n,) 
y = pd.DataFrame(y_train, columns=['fare_amount'])

#plt.scatter(X, y,color='g')
#plt.plot(X, linear_model.predict(X),color='k')

#plt.show()

# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=10, random_state=0, n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Test RMSE: %.3f" % mean_squared_error(y_test, y_pred) ** 0.5)

In [5]:
history.history.keys()

NameError: name 'history' is not defined

Plots of history

# Bagging Ensemble
https://machinelearningmastery.com/how-to-create-a-random-split-cross-validation-and-bagging-ensemble-for-deep-learning-in-keras/
https://www.youtube.com/watch?v=2Mg8QD0F1dQ
Split n training data into m number of bags containing randomly picked n' data.    
n - number of instances    
n' - number in a bag    
m - number of bags    
n'< n usually by 60%    

We train on these bags to have m number of models. Do prediction on these models and the final predition will be the mean.

In [None]:
# multiple train-test splits
n_splits = 10
m_bags = 4500
scores, members = list(), list()
for _ in range(n_splits):
    # select indexes
    ix = [i for i in range(len(X_train))]
    train_ix = resample(ix, replace=True, n_samples=m_bags)
    test_ix = [x for x in ix if x not in train_ix]
    # select data
    trainX, trainy = X[train_ix], y[train_ix]
    testX, testy = X[test_ix], y[test_ix]
    # evaluate model
    model, test_acc = evaluate_model(trainX, trainy, testX, testy)
    print('>%.3f' % test_acc)
    scores.append(test_acc)
    members.append(model)

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['acc'])
plt.title('model accuracy')
plt.ylabel('acc')
plt.xlabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Submission

Test submission on kaggle (previous score

In [12]:
cols = ['key','pickup_datetime', 'passenger_count']
X_test_sub = df_test.copy(deep=True)
key = X_test_sub.key
scaler = MinMaxScaler()

X_test_sub.drop(columns=cols, inplace=True)
X_test_sub_cols = X_test_sub.columns.tolist()

X_test_sub[X_test_sub_cols] = scaler.fit_transform(X_test_sub[X_test_sub_cols])

from keras.models import load_model
model = load_model('60e_5mil_loss12.0425_valloss_12.5486.h5')

y_test_sub = model.predict(X_test_sub)
#sub = pd.DataFrame(
#    {'key': key, 'fare_amount': y_test_sub[:, 0]},
#    columns = ['key', 'fare_amount'])
#sub.to_csv('1-08-19_submission.csv', index=False)
print(y_test_sub)
#print(os.listdir('.'))


W0802 02:53:36.830245 17100 deprecation_wrapper.py:119] From c:\python37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0802 02:53:36.876250 17100 deprecation_wrapper.py:119] From c:\python37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0802 02:53:37.077245 17100 deprecation_wrapper.py:119] From c:\python37\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0802 02:53:37.080249 17100 deprecation_wrapper.py:119] From c:\python37\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0802 02:53:37.082247 17100 deprecation_wrapper.py:119] From c:\python37\lib\site-packages\keras\backend\tensorflow_bac

[[14.629625]
 [12.599325]
 [ 9.005811]
 ...
 [26.315952]
 [12.107865]
 [13.970414]]


In [13]:
print(os.listdir('.'))

['.git', '.gitignore', '.ipynb_checkpoints', '1-08-19_submission.csv', '100epochs_model.h5', '1mil_model.h5', '60e_5mil_loss12.0425_valloss_12.5486.h5', 'analyse.py', 'chapter', 'datasets', 'images', 'main.py', 'neural_network.py', 'preprocess.py', 'README.md', 'requirements.txt', 'submission_6.84531.csv', 'visual.py', 'Why Axis Group Project.ipynb', 'whyaxis_submission.csv', '__pycache__']
