Checking files:

In [1]:
import numpy  as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for _root, _subdirs, _names in os.walk('/kaggle/input'):
    for _name in _names:
        print(os.path.join(_root, _name))

/kaggle/input/nyc-taxi-trip-duration/sample_submission.csv
/kaggle/input/nyc-taxi-trip-duration/test.csv
/kaggle/input/nyc-taxi-trip-duration/train.csv
/kaggle/input/nyc-taxi-trip-duration/test/test.csv
/kaggle/input/nyc-taxi-trip-duration/sample_submission/sample_submission.csv
/kaggle/input/nyc-taxi-trip-duration/train/train.csv


Importing libraries:

In [2]:
import datetime
import warnings

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn           as sns
from pandas.plotting import scatter_matrix

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute          import SimpleImputer
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler

from sklearn.cluster      import KMeans, DBSCAN
from sklearn.linear_model import LinearRegression


from IPython.display import display

#

# Notebook-wide settings: ignore all warnings, set the default figure size, seed NumPy's global RNG.
warnings.filterwarnings(action='ignore')
plt.rcParams.update({'figure.figsize': [13, 7]})
np.random.seed(1642)

Defining some functions:

In [3]:
# constants and functions

from math import sin, cos, sqrt, atan2, radians
def lat_lon_converter(lat1, lon1, lat2, lon2, unit):
    """
    Great-circle (haversine) distance between two points given in decimal degrees.

    ref: https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    unit : {'km', 'm'}
        Unit of the returned distance.

    Returns
    -------
    float
        Distance in the requested unit. ``np.nan`` is returned when the math
        functions reject the input with a ValueError (e.g. an infinite
        coordinate); NaN coordinates propagate to a NaN result.

    Raises
    ------
    ValueError
        If ``unit`` is neither 'km' nor 'm' (previously this silently
        returned None).
    """
    if unit not in ('km', 'm'):
        raise ValueError("unit must be 'km' or 'm', got {!r}".format(unit))

    try:
        R = 6373.0  # Earth radius in km, as used by the referenced answer

        # Every trigonometric term must work on radians, including the two
        # cosines of the latitudes.
        phi1, phi2 = radians(lat1), radians(lat2)
        dlon = radians(lon2) - radians(lon1)
        dlat = phi2 - phi1

        a = sin(dlat / 2)**2 + cos(phi1) * cos(phi2) * sin(dlon / 2)**2
        # Floating-point rounding can push `a` marginally outside [0, 1], which
        # would make sqrt() fail. Plain comparisons keep a NaN untouched.
        if a > 1.0:
            a = 1.0
        elif a < 0.0:
            a = 0.0

        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c
    except ValueError:
        return np.nan

    # 1 km = 1e3 m (the previous factor 10e3 equals 10000).
    return distance * 1e3 if unit == 'm' else distance

    
def dbscan_predict(model, X):
    """
    Assign new points to the clusters of an already fitted DBSCAN-like model.

    ref: https://stackoverflow.com/questions/27822752/scikit-learn-predicting-new-points-with-dbscan

    Each row of ``X`` gets the label of its nearest core sample when that core
    sample is strictly closer than ``model.eps`` (Euclidean distance);
    otherwise it is labelled -1 (noise).

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, ``core_sample_indices_``,
        ``labels_`` and ``eps``.
    X : array-like of shape (n_samples, n_features)
        Points to label. Anything ``np.asarray`` can convert is accepted
        (ndarray, nested list, DataFrame).

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Predicted label per row of ``X``.
    """
    X = np.asarray(X)
    nr_samples = X.shape[0]

    # Everything is noise until a close enough core sample is found.
    y_new = np.full(nr_samples, -1, dtype=int)

    components = np.asarray(model.components_)
    if components.shape[0] == 0:
        # No core samples: np.argmin would raise on an empty distance vector.
        return y_new

    # Label of every core sample, in the same order as `components`.
    core_labels = np.asarray(model.labels_)[model.core_sample_indices_]

    # One row at a time keeps memory at O(n_core) instead of O(n_samples * n_core).
    for i in range(nr_samples):
        dist = np.linalg.norm(components - X[i], axis=1)  # Euclidean distance
        nearest = np.argmin(dist)
        if dist[nearest] < model.eps:
            y_new[i] = core_labels[nearest]

    return y_new


import scipy as sp
def dbscan_predict2(dbscan_model, X_new, metric=sp.spatial.distance.euclidean):
    """
    Label new samples with a fitted DBSCAN model (first-match variant).

    ref: https://stackoverflow.com/questions/27822752/scikit-learn-predicting-new-points-with-dbscan

    Every sample starts as noise (-1). It takes the label of the first core
    sample, in the order of ``dbscan_model.components_``, whose ``metric``
    distance to it is below ``dbscan_model.eps``.
    """
    predicted = np.full(len(X_new), -1, dtype=int)

    for row, candidate in enumerate(X_new):
        # Position of the first core sample within eps, or None if there is none.
        match = next(
            (pos
             for pos, core in enumerate(dbscan_model.components_)
             if metric(candidate, core) < dbscan_model.eps),
            None,
        )
        if match is not None:
            predicted[row] = dbscan_model.labels_[dbscan_model.core_sample_indices_[match]]

    return predicted

# Exploring

In [4]:
# Read the train and test tables, then show their shapes and first rows.
df_train, df_test = (
    pd.read_csv(_csv_path)
    for _csv_path in ('/kaggle/input/nyc-taxi-trip-duration/train.csv',
                      '/kaggle/input/nyc-taxi-trip-duration/test.csv')
)

for _tag, _frame in (('train: ', df_train), ('test:  ', df_test)):
    print(_tag, _frame.shape)

display(df_train.head(), df_test.head())

train:  (1458644, 11)
test:   (625134, 9)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [5]:
# Target column, the columns that exist only in train, and the columns shared
# by train and test (minus the 'id' column) that serve as features.
_TARGET = 'trip_duration'

_train_cols, _test_cols = set(df_train.columns), set(df_test.columns)
_NON_FEATURE = _train_cols.difference(_test_cols)
_FEATURES    = (_train_cols & _test_cols) - {'id'}
display(_FEATURES)

{'dropoff_latitude',
 'dropoff_longitude',
 'passenger_count',
 'pickup_datetime',
 'pickup_latitude',
 'pickup_longitude',
 'store_and_fwd_flag',
 'vendor_id'}

In [6]:
# Work on an independent copy of the feature columns.
# * A sorted list is used because current pandas does not accept a set as a
#   column indexer; sorting also makes the column order deterministic.
# * `.copy()` keeps the later `train[...] = ...` assignments from writing into a
#   slice of `df_train`.
train = df_train[sorted(_FEATURES)].copy()

## Overview

In [None]:
# Print pandas' summary of the feature table (DataFrame.info).
train.info()

In [None]:
# Summary statistics, rendered as fixed-point strings with three decimals.
_stats = train.describe()
_stats.apply(lambda column: column.map(lambda value: format(value, '.3f')))

In [None]:
# scatter_matrix(train[train.dtypes[train.dtypes != object].index].sample(frac=0.005), diagonal='kde'); it doesn't bring any insight
# Histogram of every non-object column. `grid` expects a boolean: the previous
# string 'off' is truthy, so it did not actually switch the grid off.
train[train.dtypes[train.dtypes != object].index].hist(bins=75, grid=False);

## Dealing with outliers

In [None]:
# Names of all columns whose dtype is not `object`.
_non_object_mask = (train.dtypes != object).to_numpy()
_FILTER_OBJECT = train.columns[_non_object_mask]

In [None]:
# Add a boolean `<column>_outlier` flag per non-object column: True where the value
# lies more than 1.5 standard deviations away from the column mean.
# The deviation is measured as |x - mean|. The earlier test, |x| > |mean + 1.5*std|,
# is not a distance from the mean and mislabels columns with negative values
# (e.g. longitudes). The comparison is vectorised instead of a per-element apply.
for _col in _FILTER_OBJECT:
    _std  = train[_col].std()
    _mean = train[_col].mean()
    train['{}_outlier'.format(_col)] = (train[_col] - _mean).abs() > 1.5 * _std

In [None]:
# train[train['trip_duration_outlier'] == False].hist(column = 'trip_duration', 
#                                                     by     = 'passenger_count',
#                                                     bins   = 50, grid='off', alpha = 0.5)
# plt.title('Trip Duration Distributions per Passenger Count')
# plt.legend();

<s>Binning trip duration:</s>

In [None]:
# _TRIP_BINS = [0.0, 300.0, 600.0, np.inf]
# train['trip_duration_cat'] = pd.cut( train['trip_duration'],
#                                      bins   = _TRIP_BINS,
#                                      labels = [i for i in range(len(_TRIP_BINS)-1)])
# train['trip_duration_cat'].value_counts(normalize=True).sort_index()

## Geographical data

In [None]:
# Rows where neither the drop-off latitude nor the drop-off longitude is flagged as an outlier.
_FILTER = ~(train['dropoff_latitude_outlier'] | train['dropoff_longitude_outlier'])
# display( train['dropoff_longitude_outlier'].value_counts() )
# display( train['dropoff_latitude_outlier'].value_counts() )
# display( (_FILTER).value_counts() )

In [None]:
# Overlay pickup (blue) and drop-off (red) coordinates of the filtered rows on one axes.
_inliers = train[_FILTER]
_p = _inliers.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',
                   alpha=0.25, color='b', label='pickup')
_inliers.plot(kind='scatter', x='dropoff_longitude', y='dropoff_latitude',
              alpha=0.25, color='r', label='dropoff', ax=_p)
del _inliers  # temporary only; do not keep a second copy of the filtered table around
plt.legend();

- - -

# Model

## Simple Feature Engineering

## Clustering

In [None]:
# Fixed-seed sample of 3000 filtered rows, keeping only the pickup coordinates.
_pickup_cols = ['pickup_longitude', 'pickup_latitude']
_sample = train.loc[_FILTER].sample(n=3000, random_state=159).loc[:, _pickup_cols]

### Kmeans

In [None]:
# Fit a 5-cluster k-means on the sampled pickups, show the centroids and draw
# them (green) on top of the sampled points (blue).
kmeans = KMeans(n_clusters=5, random_state=51)
kmeans.fit(_sample)

_centers = kmeans.cluster_centers_
display(_centers)

_p = _sample.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',
                  alpha=0.25, color='b', label='pickup')
_p.scatter(x=_centers[:, 0], y=_centers[:, 1], marker='o', color='g', s=150);

### DBSCAN

In [None]:
# Density-based clustering of the sampled pickups, plus the quantities the
# plotting cell below relies on (labels, core_samples_mask, unique_labels, colors, n_clusters_).
dbscan = DBSCAN(eps=0.01, min_samples=15)
dbscan.fit(_sample)
labels = dbscan.labels_

# Boolean mask: True at the positions listed in core_sample_indices_.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True

unique_labels = set(labels)

# Label -1 is not counted as a cluster; it is reported separately as noise.
n_clusters_ = len(unique_labels) - int(-1 in unique_labels)
n_noise_    = int(np.count_nonzero(labels == -1))

print('Estimated number of clusters:     %d'    % n_clusters_)
print('Estimated number of noise points: %d'    % n_noise_)
# Label-based scores (homogeneity, completeness, V-measure, adjusted Rand index,
# adjusted mutual information) stay disabled: they need a `labels_true` array,
# which this cell does not have.
_silhouette = metrics.silhouette_score(_sample[['pickup_longitude', 'pickup_latitude']], labels)
print("Silhouette Coefficient:           %0.3f" % _silhouette)

# One colour per label, spread evenly over the Spectral colormap.
colors = [plt.cm.Spectral(_position) for _position in np.linspace(0, 1, len(unique_labels))]

In [None]:
# Draw every DBSCAN label in its own colour: core samples with large markers,
# the remaining members of the label with small markers.
for k, col in zip(unique_labels, colors):
    if k == -1:
        col = [0, 0, 0, 1] # Black used for noise.

    class_member_mask = (labels == k)

    # `.to_numpy()` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    xy = _sample[class_member_mask & core_samples_mask].to_numpy()
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14)

    xy = _sample[class_member_mask & ~core_samples_mask].to_numpy()
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Predicting all observations:

In [None]:
# Label every pickup with the DBSCAN model fitted on the sample.
# `.to_numpy()` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train['db_predict'] = dbscan_predict(dbscan, train[['pickup_longitude', 'pickup_latitude']].to_numpy())

In [None]:
# k-means cluster index for every pickup location.
train['kmeans_predict'] = kmeans.predict(
    train.loc[:, ['pickup_longitude', 'pickup_latitude']]
)

In [None]:
# Number of rows per label, for both clusterings, ordered by label.
for _cluster_col in ('db_predict', 'kmeans_predict'):
    display(train[_cluster_col].value_counts().sort_index())

In [None]:
# Pickup locations coloured by their DBSCAN label.
_jet = plt.get_cmap('jet')
train.plot(kind='scatter',
           x='pickup_longitude', y='pickup_latitude',
           c='db_predict', cmap=_jet);

In [None]:
# Pickup locations coloured by their k-means cluster.
_jet = plt.get_cmap('jet')
train.plot(kind='scatter',
           x='pickup_longitude', y='pickup_latitude',
           c='kmeans_predict', cmap=_jet);

Additional feature combination:

In [None]:
# Two pickup-to-drop-off distance features:
#   lon_lat_manhattan    - sum of the absolute longitude and latitude differences
#   dist_manhattan_meter - lat_lon_converter(...) evaluated per row with unit 'm'
_d_lon = (train['dropoff_longitude'] - train['pickup_longitude']).abs()
_d_lat = (train['dropoff_latitude'] - train['pickup_latitude']).abs()
train['lon_lat_manhattan'] = _d_lon + _d_lat


def _row_distance_m(row):
    """Call lat_lon_converter on one row's pickup and drop-off coordinates, unit 'm'."""
    return lat_lon_converter(row['pickup_latitude'],
                             row['pickup_longitude'],
                             row['dropoff_latitude'],
                             row['dropoff_longitude'],
                             'm')


train['dist_manhattan_meter'] = train.apply(_row_distance_m, axis=1)

In [None]:
#train['pickup_dt']          = train['pickup_datetime'].apply( lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
#train['dropoff_dt']         = train['dropoff_datetime'].apply( lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
# Parse the pickup timestamp string into a datetime column.
# `errors='coerce'` turns unparseable values into NaT. The former `errors='ignore'`
# is deprecated in pandas and returned the raw input unchanged on any parse failure.
train['pickup_dt']  = pd.to_datetime(train['pickup_datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
# train['dropoff_dt'] = pd.to_datetime(train['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
# train['delta_time'] = (train['dropoff_dt'] - train['pickup_dt']).dt.total_seconds()

In [None]:
# Parsed timestamp next to its source string, to eyeball the conversion.
train.loc[:, ['pickup_dt', 'pickup_datetime']]

In [None]:
# train['avg_speed']           = train['dist_manhattan_meter'] / train['delta_time']
# train['dist_per_passenger']  = train['dist_manhattan_meter'] / train['passenger_count']
# train['speed_per_passenger'] = train['avg_speed'] / train['passenger_count']

# Data Cleaning

In [None]:
# Split the column names by dtype: object columns form the categorical list;
# every column that is neither object nor datetime64[ns] goes to the numeric list.
_dtypes       = train.dtypes
_not_object   = _dtypes != np.dtype('object')
_not_datetime = _dtypes != np.dtype('<M8[ns]')

_FILTER_NUM = _dtypes[_not_object & _not_datetime].index.to_list()
_FILTER_CAT = _dtypes[_dtypes == np.dtype('object')].index.to_list()

# Summary statistics of the numeric list, formatted with three decimals.
_num_stats = train[_FILTER_NUM].describe()
_num_stats.apply(lambda column: column.map(lambda value: format(value, '.3f')))

In [None]:
# train['dist_per_passenger']  = train['dist_per_passenger'].replace([np.inf, -np.inf], np.nan)
# train['speed_per_passenger'] = train['speed_per_passenger'].replace([np.inf, -np.inf], np.nan)

In [None]:
# Show the numeric and the categorical column lists.
display(_FILTER_NUM, _FILTER_CAT)

In [None]:
# imputer = SimpleImputer(strategy='median')
# imputer.fit(train[(_FILTER_OBJECT)])
# X = imputer.transform(train[_FILTER_OBJECT])
# train_tr = pd.DataFrame(X, columns=train[(_FILTER_OBJECT)].columns)
# Numeric preprocessing: median imputation followed by standard scaling.
_numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=_numeric_steps)

In [None]:
# Numeric model inputs: every numeric column except the `*_outlier` helper flags.
# Built once, as an ordered list, so the ColumnTransformer and `train_x` always agree
# and the output column order is reproducible (the earlier `list(set(...))` was
# written out twice and had an arbitrary order).
_NUM_FEATURES = [_name for _name in _FILTER_NUM if not _name.endswith('_outlier')]

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, _NUM_FEATURES),
    # `handle_unknown='ignore'` encodes categories that were not seen during fit as
    # all-zero rows instead of raising, so the fitted pipeline can be applied to
    # other data with `.transform`.
    # NOTE(review): `_FILTER_CAT` holds every object-dtype column, which appears to
    # include the raw `pickup_datetime` strings - confirm that one-hot encoding
    # them is intended.
    ('cat', OneHotEncoder(handle_unknown='ignore'), _FILTER_CAT),
])

train_x = train[_FILTER_CAT + _NUM_FEATURES]
train_y = df_train[_TARGET]
train_prepared = full_pipeline.fit_transform(train_x)

In [None]:
# Show the feature matrix returned by full_pipeline.fit_transform(train_x).
train_prepared

# Model

Fit:

In [None]:
# Linear regression on the prepared training matrix against the target column.
linreg = LinearRegression()
linreg.fit(X=train_prepared, y=train_y)

Prepare the test set (same feature engineering as for the training set):

In [None]:
# `test` is not defined by any earlier cell of this notebook, so it is built here:
# an independent copy of the feature columns of `df_test` (a sorted list is used
# because pandas does not accept a set as a column indexer).
test = df_test[sorted(_FEATURES)].copy()

# Add a boolean `<column>_outlier` flag per non-object column: True where the value
# lies more than 1.5 standard deviations away from the column mean. The deviation is
# measured as |x - mean|; comparing |x| with |mean + 1.5*std| mislabels columns
# with negative values (e.g. longitudes).
_FILTER_OBJECT = test.dtypes[test.dtypes != object].index
for _col in _FILTER_OBJECT:
    _std  = test[_col].std()
    _mean = test[_col].mean()
    test['{}_outlier'.format(_col)] = (test[_col] - _mean).abs() > 1.5 * _std

In [None]:
# Engineered features for the test table, using the already fitted clustering models.
# `.to_numpy()` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
test['db_predict']           = dbscan_predict(dbscan, test[['pickup_longitude', 'pickup_latitude']].to_numpy())
test['kmeans_predict']       = kmeans.predict(test[['pickup_longitude', 'pickup_latitude']])
test['lon_lat_manhattan']    = abs(test['dropoff_longitude']-test['pickup_longitude']) + abs(test['dropoff_latitude']-test['pickup_latitude'])
test['dist_manhattan_meter'] = test.apply( lambda x: lat_lon_converter(x['pickup_latitude'], 
                                                                         x['pickup_longitude'],
                                                                         x['dropoff_latitude'], 
                                                                         x['dropoff_longitude'],
                                                                         'm'), axis=1 )
# `errors='coerce'` turns unparseable timestamps into NaT; the former `errors='ignore'`
# is deprecated in pandas and returned the raw input unchanged on any parse failure.
test['pickup_dt']  = pd.to_datetime(test['pickup_datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [None]:
# Apply the pipeline that was fitted on the training data. Calling `fit_transform`
# here would re-learn the imputation medians, the scaling and the one-hot categories
# from the test set, producing features that do not line up with the ones `linreg`
# was trained on. Only the columns the pipeline was fitted on are passed in.
# NOTE: `transform` needs an encoder that tolerates categories unseen during fit
# (e.g. OneHotEncoder(handle_unknown='ignore')).
test_prepared = full_pipeline.transform(test[train_x.columns])

In [None]:
# Columns present in only one of the two engineered tables; an empty set means train
# and test expose the same columns. (The earlier expression subtracted
# `test.columns` from itself and was therefore always empty.)
set(train.columns) ^ set(test.columns)

In [None]:
# Show the prepared training matrix, for comparison with the test matrix in the next cell.
train_prepared

In [None]:
# Show the prepared test matrix.
test_prepared

Predict:

In [None]:
# Predict the target (`trip_duration`) for every row of the prepared test matrix.
pred = linreg.predict(test_prepared)

In [None]:
# Display the sample submission file; the result is only shown, not stored.
# NOTE(review): `pred` is never written to a submission file in the visible notebook.
pd.read_csv('/kaggle/input/nyc-taxi-trip-duration/sample_submission.csv')

In [None]:
# train_set, holdout_set = train_test_split(train, 
#                                           test_size    = 0.2, 
#                                           random_state = 13,
#                                           stratify     = train['trip_duration_cat'])
# holdout_set['trip_duration_cat'].value_counts(normalize=True).sort_index()