In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
# List every file under the competition's input folder so the reader can see
# which CSVs are available. The folder is the same one `data_dir` points at in
# the loading cells; the previous bare relative path
# ('new-york-city-taxi-fare-prediction') did not match it, and os.walk yields
# nothing (without raising) when the directory does not exist.
for dirname, _, filenames in os.walk('../input/new-york-city-taxi-fare-prediction'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the dataset

In [2]:
# Folder holding the competition CSVs; the read_csv calls below build their paths from it.
data_dir = '../input/new-york-city-taxi-fare-prediction'

To load the presampled file, download it from https://drive.google.com/file/d/11-S5q1lawXka_mZlKYfC4nv9LGh23sXO/view?usp=sharing and save it as `twenty.csv` inside the data directory set above.

In [None]:
# Read the presampled training file: the first CSV column becomes the index and
# pickup_datetime is parsed into datetimes.
train = pd.read_csv(
    data_dir + '/twenty.csv',
    index_col=0,
    parse_dates=['pickup_datetime'],
)

Uncomment the cell below to build the 20% sample from the full `train.csv` from scratch instead of loading the presampled file.

In [3]:
# Alternative loader (disabled): sample rows straight from the full train.csv.
# skip_row never skips row 0 (the header) and skips every other row whenever
# random.random() exceeds sample_frac, so roughly sample_frac of the rows are
# kept; random.seed(7) makes the selection repeatable.
# NOTE(review): `dtypes` has no entry for 'dropoff_latitude', so that column
# falls back to the reader's default dtype — confirm whether 'float32' was intended.
# NOTE(review): 'float16' for fare_amount has very coarse precision — confirm it is adequate.
# import random

# sample_frac = 0.20

# def skip_row(row_idx):
#     if row_idx == 0:
#         return False
#     return random.random() > sample_frac


# selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')

# dtypes = {
#     'fare_amount': 'float16',
#     'pickup_longitude': 'float32',
#     'pickup_latitude': 'float32',
#     'dropoff_longitude': 'float32',
#     'passenger_count': 'uint8'
# }

# random.seed(7)
# train = pd.read_csv(data_dir+"/train.csv", 
#                  usecols=selected_cols, 
#                  dtype=dtypes, 
#                  parse_dates=['pickup_datetime'], 
#                  skiprows=skip_row)

In [5]:
# Read the competition test set, parsing pickup_datetime into datetimes.
test = pd.read_csv(
    data_dir + '/test.csv',
    parse_dates=['pickup_datetime'],
)

# EDA

In [6]:
# Column dtypes plus the non-null count of every column.
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts`
# is the supported name for the same option.
train.info(verbose=True, show_counts=True)

In [7]:
# Number of rows that exactly repeat an earlier row.
train.duplicated().sum()

In [8]:
# Summary statistics of the raw training columns, before any cleaning.
train.describe()

In [9]:
# Peek at the first few rows.
train.head()

# Data Cleaning

In [10]:
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2, earth_radius_km=6367):
    """Great-circle (haversine) distance between two sets of points.

    All coordinate arguments are in decimal degrees and may be scalars, NumPy
    arrays or pandas Series; they are combined element-wise, so the usual
    broadcasting rules apply.

    Parameters
    ----------
    lon1, lat1 : longitude / latitude of the first point(s), in degrees.
    lon2, lat2 : longitude / latitude of the second point(s), in degrees.
    earth_radius_km : sphere radius used to scale the central angle. The
        default (6367) is the value this notebook has always used.

    Returns
    -------
    Distance(s) in the same unit as ``earth_radius_km`` (kilometres by default).
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair
    # outside that range for (near-)antipodal points, which would turn the
    # sqrt/arcsin below into NaN. Clamp before inverting.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * c

In [11]:
def add_trip_distance(df):
    """Add a ``trip_distance`` column to ``df`` in place.

    The value is ``haversine_np`` applied to the pickup and dropoff
    longitude/latitude columns of each row. Nothing is returned.
    """
    pickup = (df['pickup_longitude'], df['pickup_latitude'])
    dropoff = (df['dropoff_longitude'], df['dropoff_latitude'])
    df['trip_distance'] = haversine_np(*pickup, *dropoff)

In [12]:
# Add the trip_distance feature to both frames (each is modified in place).
for frame in (train, test):
    add_trip_distance(frame)

In [13]:
def add_dateparts(df, col):
    """Expand the datetime column ``col`` into calendar features, in place.

    Adds ``year``, ``month``, ``day``, ``weekday`` and ``hour`` columns (in that
    order), each read from the matching ``.dt`` accessor attribute of
    ``df[col]``. Nothing is returned.
    """
    for part in ('year', 'month', 'day', 'weekday', 'hour'):
        df[part] = getattr(df[col].dt, part)

In [14]:
# Derive the calendar features from pickup_datetime on both frames (in place).
for frame in (train, test):
    add_dateparts(frame, 'pickup_datetime')

In [15]:
def remove_outliers(df):
    """Return only the rows of ``df`` whose values fall inside fixed bounds.

    Every listed column must lie within its inclusive ``(low, high)`` range for
    a row to be kept; a row with a missing value in any of them is dropped
    because the comparison is False. ``df`` itself is not modified.
    """
    inclusive_bounds = {
        'fare_amount': (1., 500.),
        'pickup_longitude': (-75, -72),
        'dropoff_longitude': (-75, -72),
        'pickup_latitude': (40, 42),
        'dropoff_latitude': (40, 42),
        'passenger_count': (1, 6),
    }

    keep = None
    for column, (low, high) in inclusive_bounds.items():
        in_range = df[column].between(low, high)
        keep = in_range if keep is None else keep & in_range

    return df[keep]

In [16]:
# Keep only plausible training rows; the test set is left unfiltered.
train = remove_outliers(train)

In [17]:
# Pairwise correlations between the numeric columns only.
# pickup_datetime is still in the frame at this point, and since pandas 2.0
# `corr` no longer drops non-numeric columns silently (numeric_only defaults
# to False), so select the numeric columns explicitly.
train.select_dtypes(include='number').corr()

In [18]:
# Summary statistics again, now after outlier removal and with the added feature columns.
train.describe()

# Data Prep

In [19]:
# The raw timestamp and coordinates are dropped from both frames; the derived
# trip_distance and date-part columns stay.
remove_cols = [
    "pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
train = train.drop(columns=remove_cols)
test = test.drop(columns=remove_cols)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

In [22]:
# Row counts of the training and validation splits.
len(train_df), len(val_df)

In [23]:
# Separate the features from the target (fare_amount) for the training split.
x_train = train_df.drop(columns="fare_amount")
y_train = train_df["fare_amount"]

In [24]:
# Separate the features from the target (fare_amount) for the validation split.
x_val = val_df.drop(columns="fare_amount")
y_val = val_df["fare_amount"]

In [25]:
# Feature matrix for the test set: everything except the row identifier, which
# is kept on `test` for building the submission files.
test_df = test.drop(columns="key")

# Modeling

## Linear Regression

In [26]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [27]:
# Fit the linear baseline on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('linear.csv', index=False)

## Ridge Regression

In [28]:
from sklearn.linear_model import Ridge

In [29]:
# Fit a ridge regression with its default regularisation strength.
model = Ridge()
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('ridge.csv', index=False)

## Random Forest

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [34]:
# Random forest: 20 trees, each limited to depth 16. A fixed random_state makes
# the bootstrap/feature sampling, and therefore the score and the submission,
# repeatable from one run to the next.
model = RandomForestRegressor(n_estimators=20, max_depth=16, random_state=42)
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('randomforest.csv', index=False)

## XGBoost (Extreme Gradient Boosting)

In [32]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
model = XGBRegressor()
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('xgb.csv', index=False)

## LightGBM

In [35]:
from lightgbm import LGBMRegressor

In [36]:
# Gradient-boosted trees (LightGBM) with default hyper-parameters.
model = LGBMRegressor()
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('lgbm.csv', index=False)

## CatBoost

In [37]:
from catboost import CatBoostRegressor

In [38]:
# Gradient-boosted trees (CatBoost) with default hyper-parameters.
# verbose=0 silences the per-iteration training log, which would otherwise
# flood the cell output; it does not affect the fitted model.
model = CatBoostRegressor(verbose=0)
model.fit(x_train, y_train)

preds = model.predict(x_val)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
rmse = np.sqrt(mean_squared_error(y_val, preds))
print('RMSE Score on Validation data',rmse)

# Predict the test set and write the submission file.
final = model.predict(test_df)

submission = pd.DataFrame({'key': test.key, "fare_amount": final})
submission.to_csv('cat.csv', index=False)