<a href="https://colab.research.google.com/github/vamsibitra/Time-estimation/blob/main/Probability_Time_estimatation_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

Load Data

In [None]:
import os

# Only open the interactive Colab upload widget when the dataset is not already
# on disk. This keeps "Restart & Run All" working on re-runs and outside Colab,
# where `google.colab` cannot be imported.
if os.path.exists('nyc_taxi_trip_duration.csv'):
  uploaded = {}
else:
  from google.colab import files
  uploaded = files.upload()

KeyboardInterrupt: ignored

In [None]:
# Read the raw trip records from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer='nyc_taxi_trip_duration.csv')

Preprocessing & feature extraction

DATE TIME CONVERSION

In [None]:
# Parse both timestamp columns so the `.dt` accessor can be used on them later.
for stamp_col in ('pickup_datetime', 'dropoff_datetime'):
  df[stamp_col] = pd.to_datetime(df[stamp_col])

In [None]:
# Rows without a recorded trip_duration cannot serve as training targets:
# log1p(NaN) stays NaN and the later `df.fillna(0)` never touches `df_y`, so the
# estimators would reject the target. Drop them up front and rebuild a 0..n-1
# index, because `cv_score` selects rows using KFold's positional indices.
df = df.dropna(subset=['trip_duration']).reset_index(drop=True)

# Regression target: log1p of the trip duration.
df_y = np.log1p(df['trip_duration'])

# Calendar / clock features derived from the pickup timestamp.
df['pickup_weekday'] = df['pickup_datetime'].dt.weekday
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is its
# replacement. It yields a nullable UInt32 column, so cast back to int64 to keep
# the dtype the old accessor produced (assumes no missing pickup timestamps).
# NOTE: despite the "pickup_hour_" prefix this column holds the ISO week number;
# the name is kept so the feature schema does not change.
df['pickup_hour_weekofyear'] = df['pickup_datetime'].dt.isocalendar().week.astype('int64')
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute
# Seconds elapsed since the earliest pickup in the data set.
df['pickup_dt'] = (df['pickup_datetime'] - df['pickup_datetime'].min()).dt.total_seconds()
# Hour-of-week index: weekday * 24 + hour.
df['pickup_week_hour'] = df['pickup_weekday'] * 24 + df['pickup_hour']



```
# This is formatted as code
```

Distance Features

Euclidean Distance

In [None]:
# Straight-line separation between pickup and dropoff, computed directly on the
# raw coordinate values. Note the naming: `y_dist` holds the longitude
# difference and `x_dist` the latitude difference; the sum of squares is
# symmetric, so the result does not depend on which is which.
y_dist = df['pickup_longitude'].sub(df['dropoff_longitude'])
x_dist = df['pickup_latitude'].sub(df['dropoff_latitude'])
df['dist_sq'] = y_dist.pow(2).add(x_dist.pow(2))
df['dist_sqrt'] = df['dist_sq'].pow(0.5)

Haversine Distance

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
  """Great-circle (haversine) distance in kilometres between two points.

  All four arguments are latitudes/longitudes in degrees; scalars or
  array-likes accepted by ``np.radians`` are processed element-wise.
  Returns the distance on a sphere of radius 6371 km.
  """
  AVG_EARTH_RADIUS = 6371   #in km
  phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
  half_dphi = (phi2 - phi1) * 0.5
  half_dlam = (lam2 - lam1) * 0.5
  # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2)
  hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
  return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

def direction_array(lat1, lng1, lat2, lng2):
  """Initial bearing, in degrees, when travelling from point 1 to point 2.

  Inputs are latitudes/longitudes in degrees (scalars or NumPy arrays; the
  longitudes must support subtraction). The result comes from ``np.arctan2``
  and therefore lies in [-180, 180]: 0 is due north, 90 due east,
  -90 due west and 180 due south.
  """
  # Only the longitude *difference* and the two latitudes enter the formula, so
  # the individual longitudes are not converted (the original also carried an
  # unused Earth-radius constant, which has been dropped).
  lng_delta_rad = np.radians(lng2 - lng1)
  lat1, lat2 = np.radians(lat1), np.radians(lat2)
  y = np.sin(lng_delta_rad) * np.cos(lat2)
  x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
  return np.degrees(np.arctan2(y, x))

# Pull the four coordinate columns out once as NumPy arrays, in the
# (lat1, lng1, lat2, lng2) order both helpers expect, and reuse them.
trip_endpoints = [
  df[coord_col].values
  for coord_col in ('pickup_latitude', 'pickup_longitude',
                    'dropoff_latitude', 'dropoff_longitude')
]
df['haversine_distance'] = haversine_array(*trip_endpoints)
df['direction'] = direction_array(*trip_endpoints)


Binning

In [None]:
# Bin every coordinate by rounding it to 3 decimal places, storing the result
# in a sibling column with a `_round3` suffix.
for coord_col in ('pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude'):
  df[coord_col + '_round3'] = df[coord_col].round(3)

In [None]:
# How many trips each vendor code accounts for.
df['vendor_id'].value_counts()

2    299280
1    259099
Name: vendor_id, dtype: int64

In [None]:
# Shift the vendor code down by one (the value counts above show codes 1 and 2,
# which become 0 and 1). Not idempotent: re-running this cell shifts it again.
df['vendor_id'] = df['vendor_id'].sub(1)

In [None]:
# Missing values per column. `np.sum(DataFrame)` forwards `axis=None` to
# DataFrame.sum, which pandas 2.1 deprecates and will later turn into a single
# scalar over both axes; the method chain pins the per-column count explicitly.
df.isnull().sum()

id                          0
vendor_id                   0
pickup_datetime             0
dropoff_datetime            1
passenger_count             1
pickup_longitude            1
pickup_latitude             1
dropoff_longitude           1
dropoff_latitude            1
store_and_fwd_flag          1
trip_duration               1
pickup_weekday              0
pickup_hour_weekofyear      0
pickup_hour                 0
pickup_minute               0
pickup_dt                   0
pickup_week_hour            0
dist_sq                     1
dist_sqrt                   1
haversine_distance          1
direction                   1
pickup_latitude_round3      1
pickup_longitude_round3     1
dropoff_latitude_round3     1
dropoff_longitude_round3    1
dtype: int64

In [None]:
# Replace every remaining missing value in the feature frame with 0.
df = df.fillna(value=0)

In [None]:
# Remove identifiers, raw timestamps, the untransformed target and the string
# flag so that only the model input columns remain in `df`.
df = df.drop(
  columns=[
    'id',
    'pickup_datetime',
    'dropoff_datetime',
    'trip_duration',
    'store_and_fwd_flag',
  ]
)

Model Building 

In [None]:
# Peek at the first five rows of the final feature frame.
df.head(n=5)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
  df,
  df_y,
  test_size=1/3,
  random_state=0,
)

Mean Prediction

In [None]:
# Baseline: predict the training-set mean of the target for every test row and
# report the resulting RMSE as the score the real models have to beat.
mean_pred = np.full(len(ytest), ytrain.mean())
sqrt(mean_squared_error(ytest, mean_pred))

Cross Validation 

In [None]:
def cv_score(m1_model, rstate = 11, cols = None):
  """Score ``m1_model`` with shuffled 5-fold cross-validation on the notebook data.

  Reads the notebook-level feature frame ``df`` and target ``df_y``.

  Parameters
  ----------
  m1_model : estimator exposing ``fit(X, y)`` and ``predict(X)``
      Re-fitted on the training part of every fold.
  rstate : int, default 11
      Seed passed to ``KFold`` so the shuffled folds are repeatable.
  cols : column labels, optional
      Subset of ``df`` columns to use as features. Defaults to every column of
      ``df`` as it exists when the function is *called* (the previous default,
      ``df.columns``, was frozen at definition time and could go stale).

  Returns
  -------
  list of float
      Validation RMSE of each fold, in fold order.
  """
  features = df[df.columns if cols is None else cols]
  kf = KFold(n_splits=5,random_state=rstate,shuffle=True)
  cv_scores = []
  for fold, (train_index, test_index) in enumerate(kf.split(features, df_y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))
    # KFold yields *positional* indices, so select with .iloc; label-based .loc
    # only worked while the frame happened to carry a default 0..n-1 index.
    xtr, xv1 = features.iloc[train_index], features.iloc[test_index]
    ytr, yv1 = df_y.iloc[train_index], df_y.iloc[test_index]
    m1_model.fit(xtr, ytr)
    # Only the validation error is reported; the unused train-set prediction
    # and RMSE of the earlier version were dropped.
    rmse_score = sqrt(mean_squared_error(yv1, m1_model.predict(xv1)))
    print("Valid RMSE: {:.5f}".format(rmse_score))
    cv_scores.append(rmse_score)
  return cv_scores

Linear Regression 

In [None]:
# Per-fold validation RMSE for an ordinary least-squares model on all features.
linreg_scores = cv_score(m1_model=LinearRegression())

Decision Tree

In [None]:
# Per-fold validation RMSE for a regression tree whose leaves and splits each
# need at least 25 samples.
dtree_scores = cv_score(
  m1_model=DecisionTreeRegressor(min_samples_split=25, min_samples_leaf=25)
)

In [None]:
# One row per CV fold, one column per model.
results_df = pd.DataFrame(
  dict(linear_regression=linreg_scores, dtree=dtree_scores)
)

In [None]:
# Grouped bars: one group per CV fold, one bar per model. Drawn on an explicit
# Axes so the figure can be titled and labelled and still read on its own.
fig, ax = plt.subplots()
results_df.plot(y=["linear_regression", "dtree"], kind="bar", legend=False, ax=ax)
ax.set_title("Cross-validation RMSE per fold")
ax.set_xlabel("Fold (row index of results_df)")
ax.set_ylabel("Validation RMSE of log1p(trip_duration)")
# Place the legend outside the plotting area so it never covers the bars.
ax.legend(bbox_to_anchor=(1.05,1), loc=2, borderaxespad=0.)
plt.show()

Decision Tree Visualization

In [None]:
from sklearn import tree

In [None]:
# Fit a single tree on the training split (same hyper-parameters as the
# cross-validated one) so its structure can be exported below.
dtree = DecisionTreeRegressor(**{'min_samples_leaf': 25, 'min_samples_split': 25})
dtree.fit(X=xtrain, y=ytrain)

In [None]:
decision_tree = tree.export_graphviz(dtree,out_file='tree.dot',feature_names=xtrain.columns,max_depth=2,filled=True)
!dot -Tpng tree.dot -o tree.png