In [None]:
import pandas as pd 
import numpy as np
from matplotlib import style
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#tdqm = progress bar
from tqdm import tqdm
from datetime import datetime
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn import utils

In [None]:
# Maximum number of rows to read from the CSV
nrows = 10000000
# Number of rows per chunk returned by the chunked reader
chunksize = 10000
# Vitou's path
# path = 'C:/Users/sirus/Downloads/train.csv'
# Muna's path
path = '/Users/muna/Development/DataScience/new-york-city-taxi-fare-prediction/train.csv'
# Columns to read from the data
# cols = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','fare_amount']
cols = ['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','fare_amount']

# Read the file chunk by chunk (tqdm shows progress) and collect the pieces
df_list = []
for df_chunk in tqdm(pd.read_csv(path, usecols=cols, chunksize=chunksize, nrows=nrows)):
    df_list.append(df_chunk)

# Merge all chunks into one dataframe (no placeholder frame is needed first)
data = pd.concat(df_list)
# Delete the chunk list and the last chunk to release memory
del df_list, df_chunk

### Let's take a look at the data

In [None]:
%%time
# data.describe(include='all')

### Check for missing values

In [None]:
%%time
# Checking for missing values
data.isnull().sum()

### From our observation, there are some missing values, so we remove them

In [None]:
%%time
# Keep only the rows in which no column is missing
data = data.dropna(axis=0, how='any')

### Let's take a look at the data

In [None]:
data['fare_amount'].describe()

### From the description above, some of the fare_amount have negative values

We are going to remove all fare_amounts that are not positive (negative or zero)

In [None]:
%%time
# Keep only rides whose fare is strictly greater than zero
data = data.loc[data['fare_amount'].gt(0)]

In [None]:
data['fare_amount'].describe()

In [None]:
data['fare_amount'].describe()

### Let's take a detailed look at the passenger count

In [None]:
data['passenger_count'].describe()

The highest passenger count per taxi ride is 208, which is not possible. So we remove all rows with a passenger_count greater than 6, since 6 is the maximum passenger capacity for Uber/Lyft if the ride is an SUV

In [None]:
# Keep only rides with at most 6 passengers
data = data.loc[data['passenger_count'].le(6)]

In [None]:
data['passenger_count'].describe()

Now the highest passenger capacity is 6

Next we remove all rows whose passenger count is zero

In [None]:
# Keep only rides with at least one passenger
data = data.loc[data['passenger_count'].gt(0)]

In [None]:
data['passenger_count'].describe()

### Looking at the pickup/dropoff latitudes and longitudes

In [None]:
data[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']].describe()

Looking at the info above, the maximum pickup_latitude/pickup_longitude/dropoff_latitude/dropoff_longitude values are over 3000, while their respective minimum values go beyond -3000

Latitudes range from -90 to 90 while longitudes range from -180 to 180, in decimal-degree format. So we remove all values that are not within the latitude and longitude ranges

In [None]:
# Remove rows whose pickup latitude is outside the valid [-90, 90] range.
# A boolean mask is used instead of dropping by index label, so exactly the
# offending rows are removed even if the index is not unique.
out_of_range = (data['pickup_latitude'] > 90) | (data['pickup_latitude'] < -90)
data = data[~out_of_range]

In [None]:
# Remove rows whose pickup longitude is outside the valid [-180, 180] range.
# A boolean mask is used instead of dropping by index label, so exactly the
# offending rows are removed even if the index is not unique.
out_of_range = (data['pickup_longitude'] > 180) | (data['pickup_longitude'] < -180)
data = data[~out_of_range]

We will do the same for the dropoff coordinates

In [None]:
# Remove rows whose dropoff latitude is outside the valid [-90, 90] range.
# A boolean mask is used instead of dropping by index label, so exactly the
# offending rows are removed even if the index is not unique.
out_of_range = (data['dropoff_latitude'] > 90) | (data['dropoff_latitude'] < -90)
data = data[~out_of_range]

In [None]:
# Remove rows whose dropoff longitude is outside the valid [-180, 180] range.
# A boolean mask is used instead of dropping by index label, so exactly the
# offending rows are removed even if the index is not unique.
out_of_range = (data['dropoff_longitude'] > 180) | (data['dropoff_longitude'] < -180)
data = data[~out_of_range]

In [None]:
data[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']].describe()

### Feature Engineering

In [None]:
# Adds two columns to the given dataframe, in place: 'abs_diff_longitude' and
# 'abs_diff_latitude', the absolute per-axis differences between the dropoff
# and the pickup coordinates (the "Manhattan vector" of the trip).
def add_travel_vector_features(df):
    """Add the absolute longitude/latitude deltas to ``df`` in place.

    Reads the pickup_*/dropoff_* longitude and latitude columns and writes
    'abs_diff_longitude' and 'abs_diff_latitude'. Returns None.
    """
    for axis in ('longitude', 'latitude'):
        delta = df['dropoff_' + axis] - df['pickup_' + axis]
        df['abs_diff_' + axis] = delta.abs()

add_travel_vector_features(data)

### On further observation, we have pickup_longitude, pickup_latitude, dropoff_longitude and dropoff_latitude, so we can calculate the distance

In [None]:
# Function that calculates distance between pickup location and dropoff location
def getDistance(lat1, lon1, lat2, lon2, radius_km=6378):
    """Haversine (great-circle) distance between two points given in degrees.

    Works element-wise on scalars, numpy arrays and pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
    radius_km : sphere radius; the result is in the same unit. The default
        of 6378 is kept from the original code (roughly the equatorial
        radius of the earth; the mean radius is about 6371).

    Returns
    -------
    The distance along the sphere, in the unit of ``radius_km``.
    """
    lat1 = np.deg2rad(lat1)
    lon1 = np.deg2rad(lon1)
    lat2 = np.deg2rad(lat2)
    lon2 = np.deg2rad(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) NaN; clamp it.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c



In [None]:
%%time
# Add a new column "distance" to the data: the pickup-to-dropoff distance
# computed by getDistance for every row
data['distance'] = getDistance(
    data['pickup_latitude'], data['pickup_longitude'],
    data['dropoff_latitude'], data['dropoff_longitude'],
)

In [None]:
data['distance'].describe()

Some distances are zero, so we remove those rows

In [None]:
# Keep only rides with a strictly positive distance
data = data.loc[data['distance'].gt(0)]

In [None]:
data['distance'].describe()

In [None]:
%%time
# Print every 10th row (by position) as a quick sanity check of the frame
print(data.iloc[::10])

### Creating new features like year, month, day, hour and dayOfWeek from pickup_datetime

In [None]:
def split_datetime(df):
    """Parse 'pickup_datetime' and derive calendar columns from it, in place.

    The 'pickup_datetime' column is replaced by its parsed datetime values
    (expected text layout: "YYYY-MM-DD HH:MM:SS UTC"), and the columns
    'year', 'month', 'day', 'hour' and 'dayOfWeek' are added.

    Returns the same dataframe object that was passed in.
    """
    stamps = pd.to_datetime(df['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
    df['pickup_datetime'] = stamps

    # (new column name, attribute of the .dt accessor), in insertion order
    derived = (
        ('year', 'year'),
        ('month', 'month'),
        ('day', 'day'),
        ('hour', 'hour'),
        ('dayOfWeek', 'dayofweek'),
    )
    for column, accessor in derived:
        df[column] = getattr(stamps.dt, accessor)

    return df

In [None]:
%%time
split_datetime(data)

We have now split pickup_datetime into year, month, day, hour and dayOfWeek

### Let's take a look at hour and dayOfWeek

In [None]:
%%time
data['hour'].describe()

According to the New York Times, rush hour is usually between 7 and 9 AM and from 4 to 6 PM. Let's create a new feature from hour

In [None]:
# 1 for rush hour, 0 for not
def rush_hour(hour):
    """Return 1 if ``hour`` (0-23) falls in a rush-hour window, else 0.

    Rush hour is 7-9 AM (hours 7, 8, 9) and 4-6 PM (hours 16, 17, 18).
    The evening window is expressed on the 24-hour clock; the previous
    range(4, 7) flagged 4-6 AM instead.

    NOTE(review): the hour is derived from timestamps parsed with a "UTC"
    suffix, while rush hours are local-time concepts - confirm whether a
    timezone conversion is wanted before this feature is computed.
    """
    morning_rush = range(7, 10)
    evening_rush = range(16, 19)
    if hour in morning_rush or hour in evening_rush:
        return 1
    return 0

In [None]:
%%time
# Flag each ride with 1/0 by passing its pickup hour through rush_hour
data['rush_hour'] = data['hour'].map(rush_hour)

In [None]:
data.describe()

In [None]:
%%time
data['dayOfWeek'].describe()

We can also create a new feature from dayOfWeek. We can check if it's a weekend or not

In [None]:
# 1 for weekend, 0 for not
def weekend(dayOfWeek):
    """Return 1 if ``dayOfWeek`` is a Saturday or Sunday, else 0.

    ``dayOfWeek`` follows the pandas ``dt.dayofweek`` convention
    (Monday=0 ... Sunday=6), so the weekend days are 5 and 6. The previous
    check for 0 or 6 flagged Monday and missed Saturday.
    """
    if dayOfWeek == 5 or dayOfWeek == 6:
        return 1
    return 0

In [None]:
%%time
# Flag each ride with 1/0 by passing its day of week through weekend
data['weekend'] = data['dayOfWeek'].map(weekend)

In [None]:
data.describe()

rush_hour and weekend are categorical (either 1 or 0), so we can create dummy columns from them

In [None]:
# Replace the two 0/1 flag columns with one indicator column per observed value
flag_columns = ['rush_hour', 'weekend']
data = pd.get_dummies(data, columns=flag_columns)

In [None]:
data.describe()

In [None]:
# sns.barplot(x=data['year'],y=data["fare_amount"],data=data).set_title("Fare Amount over Years")

### Looks like fares have been increasing over the years

In [None]:
# sns.barplot(x=data['hour'],y=data["fare_amount"],data=data).set_title("Pickup hour vs fare amount")

#### The fare amount is highest around 5am

In [None]:
# sns.barplot(x=data['dayOfWeek'],y=data["fare_amount"],data=data).set_title("Pickup days vs fare amount")

### Correlation between features/variables

In [None]:
# %%time
# correlation= data.corr()
# colormap = plt.cm.inferno
# mask = np.array(correlation)
# mask[np.tril_indices_from(mask)] = False
# fig=plt.gcf()
# fig.set_size_inches(30,12)
# sns.heatmap(data=correlation ,mask=mask,square=True,annot=True,cbar=True,cmap=colormap, linecolor='White', linewidths=0.1)

In [None]:
%%time
# Model inputs: raw coordinates and passenger count, the engineered travel
# features, the calendar parts, and the indicator columns for the two flags
feature_cols = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'abs_diff_longitude', 'abs_diff_latitude', 'distance',
    'year', 'month', 'day', 'hour', 'dayOfWeek',
    'rush_hour_0', 'rush_hour_1',
    'weekend_0', 'weekend_1',
]

# Feature matrix and regression target
X = data[feature_cols]
y = data['fare_amount']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
%%time
# Hold out 35% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.35,
    random_state=6,
)

In [None]:
%%time
# Linear regression baseline fitted on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
%%time
# Score the linear regression model on the held-out split
lr_predict = lr.predict(X_test)
mse = metrics.mean_squared_error(y_test, lr_predict)
rmse = np.sqrt(mse)
# The model is LinearRegression; the label previously said "Logistic Regresion"
print(f'RMSE of Linear Regression: {rmse}')

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 10 bootstrapped trees; the fixed seed keeps it repeatable
rf = RandomForestRegressor(n_estimators=10,bootstrap=True,random_state=3)
rf.fit(X_train,y_train)

# Predict on the held-out split
y_test_pred = rf.predict(X_test)

# NOTE: despite its name, y_train_err holds the mean squared error on the
# TEST split (the name is kept so that any later cell using it still works)
y_train_err = metrics.mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(y_train_err)
# The model is a random forest; the label previously said "Logistic Regresion"
print(f'RMSE of Random Forest: {rmse}')