In [1]:
#Solution to Kaggle Taxi Fare Prediction competition
#This file contains data cleaning, feature preparation, XGBoost model training and test-set prediction
#Test and train data are available for download at Kaggle.com.
#https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
#Placement: 303/1488

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os
import math
from tqdm import tqdm
from subprocess import call
import matplotlib.pyplot as plt

In [3]:
#inputs: CSV locations of the training and test data (relative paths)
#training sample; the run recorded below read it as 1000 chunks of 1000 rows
TRAIN_PATH = 'Data/train_1mln.csv'
TEST_PATH = 'Data/test.csv'
#alternative training file (presumably the full Kaggle training set - confirm); uncomment to use it
#TRAIN_PATH = 'Data/train.csv'


In [4]:
#calculate distance between coordinates using Haversine formula ( https://en.wikipedia.org/wiki/Haversine_formula )
def calc_distance(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points (Haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit.
        Defaults to 6371 (mean earth radius in km).

    Returns
    -------
    float
        Distance along the sphere surface. NaN inputs give a NaN result.
    """
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2
    sin_dlat = math.sin(half_dlat)
    sin_dlon = math.sin(half_dlon)
    a = sin_dlat * sin_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_dlon * sin_dlon
    # Mathematically a lies in [0, 1], but rounding can push it marginally
    # outside, which would make math.sqrt raise "math domain error".
    # Explicit comparisons (rather than min/max) let NaN pass through unchanged.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

#vectorised Haversine distance between pickup and dropoff, stored in df['distance']
def get_distance(df):
    """Add a 'distance' column (km) with the pickup-to-dropoff Haversine distance.

    The computation is done on whole columns with NumPy instead of a
    row-by-row ``apply``, using the same formula and earth radius (6371 km)
    as ``calc_distance``. Coordinates are promoted to float64 first so that
    float32 inputs do not lose precision in the trigonometry.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude' in degrees.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the float64 'distance' column added
        (the input is modified in place, as before). An empty frame simply
        gets an empty 'distance' column.
    """
    radius = 6371 # km
    lat1 = df['pickup_latitude'].values.astype('float64')
    lon1 = df['pickup_longitude'].values.astype('float64')
    lat2 = df['dropoff_latitude'].values.astype('float64')
    lon2 = df['dropoff_longitude'].values.astype('float64')

    sin_dlat = np.sin(np.radians(lat2 - lat1) / 2)
    sin_dlon = np.sin(np.radians(lon2 - lon1) / 2)
    a = sin_dlat * sin_dlat \
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * sin_dlon * sin_dlon
    # guard against rounding pushing a marginally outside [0, 1]; NaN is preserved
    a = np.clip(a, 0.0, 1.0)

    df['distance'] = radius * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return df

#set max distance, keep only trips shorter than max_distance km (100 by default)
def lower_distance(df, max_distance=100):
    """Return only the rows whose 'distance' is strictly below ``max_distance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'distance' column (km).
    max_distance : float, optional
        Exclusive upper bound in km. Defaults to 100, the original cut-off.

    Returns
    -------
    pandas.DataFrame
        Filtered frame; rows with a NaN distance are dropped because the
        comparison is False for them.
    """
    return df[df['distance'] < max_distance]

In [5]:
#drop rows with missing or unrealistic values
def clean_data(df):
    """Return the rows of ``df`` that pass basic sanity checks.

    A row is kept when it has no missing value, 1-6 passengers
    (0 < count < 7), a fare strictly between 0 and 100, and none of its
    four pickup/dropoff coordinates equal to exactly 0.
    """
    complete = df.dropna(axis='rows', how='any')

    passengers = complete['passenger_count']
    fare = complete['fare_amount']
    keep = passengers.gt(0) & passengers.lt(7) & fare.gt(0) & fare.lt(100)

    # a coordinate of exactly 0 marks a missing GPS reading
    for coord in ('pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude'):
        keep = keep & complete[coord].ne(0)

    return complete[keep]

In [6]:
#Extract hour, day of week, month and year from datetime
def get_date_features(df):
    """Add 'hour', 'dayofweek', 'month' and 'year' columns from 'pickup_datetime'.

    ``df['pickup_datetime']`` must be a datetime column. The frame is
    modified in place and also returned.
    """
    stamps = df['pickup_datetime'].dt
    # column order matters downstream, so keep hour -> dayofweek -> month -> year
    for part in ('hour', 'dayofweek', 'month', 'year'):
        df[part] = getattr(stamps, part)

    return df

In [7]:
# Bounding box laid out as (min_longitude, max_longitude, min_latitude, max_latitude);
# per the original note, the limits are the max coordinate values from the test set.
BB = (-74.55, -72.75, 40.45, 41.85)

# Select only trips inside the bounding box - max coordinate values extracted from the test set
def select_within_boundingbox(df, BB):
    """Keep the trips whose pickup and dropoff both fall inside ``BB``.

    ``BB`` is indexed as (min_lon, max_lon, min_lat, max_lat); all four
    limits are inclusive.
    """
    lon_min, lon_max, lat_min, lat_max = BB[0], BB[1], BB[2], BB[3]

    # Series.between is inclusive on both ends, matching >= / <=
    inside = (
        df['pickup_longitude'].between(lon_min, lon_max)
        & df['pickup_latitude'].between(lat_min, lat_max)
        & df['dropoff_longitude'].between(lon_min, lon_max)
        & df['dropoff_latitude'].between(lat_min, lat_max)
    )
    return df[inside]


#Remove incorrect values - locations on water
#Trick from https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
# Land/water masks keyed by image path, so the PNG is decoded once instead of
# once per processed chunk.
_NYC_MASK_CACHE = {}


def _load_nyc_mask(path='Data/nyc_mask.png'):
    """Return the boolean land mask (land = True, water = False) read from ``path``."""
    if path not in _NYC_MASK_CACHE:
        _NYC_MASK_CACHE[path] = plt.imread(path)[:,:,0] > 0.9
    return _NYC_MASK_CACHE[path]


def remove_datapoints_from_water(df, bb=None, nyc_mask=None):
    """Drop trips whose pickup or dropoff falls on water according to a mask image.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain pickup/dropoff longitude and latitude columns.
        Rows are expected to lie inside the bounding box already
        (see ``select_within_boundingbox``).
    bb : tuple, optional
        (min_lon, max_lon, min_lat, max_lat) covered by the mask.
        Defaults to the module-level ``BB``.
    nyc_mask : 2-D boolean array, optional
        Land mask (True = land), first axis = y (north at row 0), second
        axis = x. Defaults to the mask read from 'Data/nyc_mask.png'.

    Returns
    -------
    pandas.DataFrame
        Only the rows where both end points are on land.
    """
    if bb is None:
        bb = BB
    if nyc_mask is None:
        nyc_mask = _load_nyc_mask()
    height, width = nyc_mask.shape[0], nyc_mask.shape[1]

    def lonlat_to_xy(longitude, latitude):
        # Points exactly on the inclusive max-longitude / min-latitude edge
        # map to index == width / height, one past the last pixel; clamp so
        # they use the edge pixel instead of raising IndexError. Clamping at 0
        # also stops slightly out-of-box points wrapping via negative indices.
        x = (width*(longitude - bb[0])/(bb[1]-bb[0])).astype('int').clip(0, width - 1)
        y = (height - height*(latitude - bb[2])/(bb[3]-bb[2])).astype('int').clip(0, height - 1)
        return x.values, y.values

    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude)

    # a trip is kept only when both of its end points are on land
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    return df[idx]



In [8]:
def prepare_data(path_to_file, chunksize=1000):
    """Read the training CSV in chunks and return one cleaned, feature-enriched frame.

    Each chunk goes through: datetime parsing, ``clean_data``,
    ``get_date_features``, ``get_distance``, ``lower_distance``,
    ``select_within_boundingbox`` (with the module-level ``BB``) and
    ``remove_datapoints_from_water``; 'pickup_datetime' is dropped at the end.

    Parameters
    ----------
    path_to_file : str
        CSV with the columns listed in ``traintypes`` below.
    chunksize : int, optional
        Rows read per chunk. Defaults to 1000 (the original value); a larger
        value means fewer loop iterations at the cost of more memory per chunk.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all processed chunks; the original row index of the
        surviving rows is preserved.
    """
    #define types for faster data read
    traintypes = {'fare_amount': 'float32',
                  'pickup_datetime': 'str',
                  'pickup_longitude': 'float32',
                  'pickup_latitude': 'float32',
                  'dropoff_longitude': 'float32',
                  'dropoff_latitude': 'float32',
                  'passenger_count': 'uint8'}
    cols = list(traintypes.keys())

    #list to hold the processed chunks
    df_list = []
    reader = pd.read_csv(path_to_file, usecols=cols, dtype=traintypes, chunksize=chunksize)
    for df_part in tqdm(reader):

        # Neat trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
        # Using parse_dates would be much slower!
        # Keep only 'YYYY-MM-DD HH:MM' so a fixed format can be used for parsing.
        df_part['pickup_datetime'] = df_part['pickup_datetime'].str.slice(0, 16)
        df_part['pickup_datetime'] = pd.to_datetime(df_part['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
        # explicit copy: the next steps add columns, which should happen on an
        # independent frame rather than on a filtered selection of the chunk
        df_part = clean_data(df_part).copy()
        df_part = get_date_features(df_part)
        df_part = get_distance(df_part)
        df_part = lower_distance(df_part)
        df_part = select_within_boundingbox(df_part, BB)
        df_part = remove_datapoints_from_water(df_part)
        df_part = df_part.drop('pickup_datetime', axis=1)
        df_list.append(df_part)

    # Merge all chunks into one dataframe
    return pd.concat(df_list)

In [9]:
# Build the cleaned training frame (features plus fare_amount) from the training CSV.
df = prepare_data(TRAIN_PATH)

1000it [02:11,  7.62it/s]


In [10]:
# Sanity check: row count, dtypes and memory footprint after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 953977 entries, 0 to 999999
Data columns (total 11 columns):
fare_amount          953977 non-null float32
pickup_longitude     953977 non-null float32
pickup_latitude      953977 non-null float32
dropoff_longitude    953977 non-null float32
dropoff_latitude     953977 non-null float32
passenger_count      953977 non-null uint8
hour                 953977 non-null int64
dayofweek            953977 non-null int64
month                953977 non-null int64
year                 953977 non-null int64
distance             953977 non-null float64
dtypes: float32(5), float64(1), int64(4), uint8(1)
memory usage: 62.8 MB


In [11]:
from xgboost.sklearn import XGBClassifier
from sklearn import metrics

In [12]:
def XGBoost(X_train,X_test,y_train,y_test,num_rounds=50):
    """Train a gradient-boosted regressor and return the fitted booster.

    ``X_test``/``y_test`` are passed to ``xgb.train`` as the evaluation set
    named 'test'; training runs for at most ``num_rounds`` boosting rounds
    with ``early_stopping_rounds=10`` on that set's RMSE.
    """
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    holdout_matrix = xgb.DMatrix(X_test, label=y_test)

    # NOTE(review): newer XGBoost releases name this objective
    # 'reg:squarederror' - confirm against the installed version.
    booster_params = {
        'eta': '0.3',
        'max_depth': '8',
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
        'tree_method': 'approx',
    }

    booster = xgb.train(
        params=booster_params,
        dtrain=train_matrix,
        num_boost_round=num_rounds,
        early_stopping_rounds=10,
        evals=[(holdout_matrix, 'test')],
    )
    return booster

In [13]:
# Features are every column except the target, fare_amount.
X = df.drop(["fare_amount"],axis=1)
y = df.fare_amount
# Hold out 10% of the rows (fixed seed for reproducibility); XGBoost() uses
# this held-out split as its evaluation set for early stopping.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=123)
xgbm = XGBoost(X_train,X_test,y_train,y_test)

[22:32:24] Tree method is selected to be 'approx'
[22:32:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 396 extra nodes, 0 pruned nodes, max_depth=8
[0]	test-rmse:9.95188
Will train until test-rmse hasn't improved in 10 rounds.
[22:32:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=8
[1]	test-rmse:7.46864
[22:32:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 432 extra nodes, 0 pruned nodes, max_depth=8
[2]	test-rmse:5.87259
[22:32:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned nodes, max_depth=8
[3]	test-rmse:4.89419
[22:32:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 476 extra nodes, 0 pruned nodes, max_depth=8
[4]	test-rmse:4.32068
[22:32:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=8
[5]	test-rmse:4.00073
[22:32:26] /works

In [14]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back to the vendored copy
# only on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [15]:
# Persist the trained booster to disk.
joblib.dump(xgbm, 'Taxi_Fare_Prediction_model.pkl')

['Taxi_Fare_Prediction_model.pkl']

In [16]:
# Load the test set with pandas' default dtypes (no dtype mapping is passed here).
df_test = pd.read_csv(TEST_PATH)

In [17]:
# Keep the row identifiers for the submission file, then remove them from the features.
df_key = df_test.key
df_test = df_test.drop("key", axis=1)

In [18]:
# NOTE(review): this dtype mapping is defined but not used anywhere in this cell;
# df_test keeps the dtypes pandas inferred when it was read.
traintypes = {
              'pickup_datetime': 'str', 
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

# Same datetime parsing and feature steps as prepare_data, but without the
# row-filtering steps, so no test row is dropped.
df_test['pickup_datetime'] = df_test['pickup_datetime'].str.slice(0, 16)
df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
df_test = get_date_features(df_test)
df_test = get_distance(df_test)
df_test = df_test.drop('pickup_datetime', axis=1)

In [19]:
# Predict fares for the test rows, limiting the booster to its best_ntree_limit trees.
xgbm_pred = xgbm.predict(xgb.DMatrix(df_test), ntree_limit = xgbm.best_ntree_limit)

In [20]:
# Wrap the prediction array in a single-column frame (default integer index).
prediction = pd.DataFrame(xgbm_pred)

In [21]:
# Put keys and predictions side by side; concat aligns on the index, and both
# objects were created with a default index here.
submission = pd.concat([df_key, prediction], axis=1)

In [22]:
# Name the two columns of the submission frame.
submission.columns = ["key", "fare_amount"]

In [23]:
# Check the submission size: one row per test record, two columns.
submission.shape

(9914, 2)

In [24]:
# Write the submission file without the index column.
submission.to_csv('submission_taxi_fare_prediction.csv', index=False)