# Imports

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load (or reload, if already loaded) the IPython autoreload extension ...
%reload_ext autoreload
# ... and use mode 2: re-import all modules before each cell execution.
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import feather
import os
import matplotlib.pyplot as plt

In [3]:
# Single seed shared by every randomised step that accepts one
# (row sampling and the cross-validation splitter below).
seed = 101
# Base data directory path.
PATH = 'data/nyc-taxi/'

In [4]:
# Feather files to load, in (train, test) order.
feather_paths = ('tmp/taxi-train-v10-Airport', 'tmp/taxi-test-v10-Airport')
train_df, test_df = map(pd.read_feather, feather_paths)

In [5]:
# (rows, columns) of each frame as loaded, train first.
(train_df.shape, test_df.shape)

((54075311, 24), (9914, 23))

In [None]:
%%time
train_df = train_df.sample(n=len(test_df), random_state=seed)

In [None]:
# Remove the 'fare_amount' column from the training frame before it is stacked
# with the test frame. Rebinding instead of inplace=True, together with
# errors='ignore', keeps the cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
train_df = train_df.drop(columns=['fare_amount'], errors='ignore')

In [None]:
# (rows, columns) of each frame at this point, train first.
(train_df.shape, test_df.shape)

In [None]:
# Add a column identifying whether a row comes from train or test.
# The flag follows its name: is_test == 1 for test rows, 0 for training rows
# (the original assignment had these two values swapped).
test_df['is_test'] = 1
train_df['is_test'] = 0

In [None]:
# Stack train on top of test with a fresh 0..n-1 index (row-wise concat is the default axis).
joined_df = pd.concat([train_df, test_df], ignore_index=True)

# Everything except the origin flag is a model input; compute that frame once.
features_df = joined_df.drop(columns='is_test')

y = joined_df['is_test'].values  # labels
x = features_df.values           # covariates / independent variables
cols = features_df.columns       # feature names, in the same order as x's columns

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
# Depth-limited random forest used to tell train rows from test rows.
# random_state ties the forest to the notebook-wide seed (already used for the
# row sampling and the CV splitter) so that scores and importances are
# reproducible from run to run.
rf = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=seed)

# One slot per row for its out-of-fold probability. np.zeros yields float64,
# which is required here: an integer buffer would truncate the probabilities.
predictions = np.zeros(y.shape)

In [None]:
# 5-fold stratified split, shuffled with the shared seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Out-of-fold scoring: each row's prediction comes from a fit that never saw it.
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):
    X_train, y_train = x[train_idx], y[train_idx]
    X_test, y_test = x[test_idx], y[test_idx]

    # fit() refits the same estimator object on this fold's training rows.
    rf.fit(X_train, y_train)
    # Second column of predict_proba = probability of the class labelled 1.
    probs = rf.predict_proba(X_test)[:, 1]
    predictions[test_idx] = probs

In [None]:
# metrics.auc(x, y) only integrates an already-computed curve with the
# trapezoidal rule; feeding it labels and scores does not yield a ROC-AUC.
# roc_auc_score takes (true labels, predicted scores) and builds the ROC curve itself.
print('ROC-AUC for train and test distributions:', metrics.roc_auc_score(y, predictions))

In [None]:
# Rank features by how much the forest relied on them, most important first.
# NOTE: `rf` holds only its most recent fit -- in this notebook that is the final
# cross-validation fold -- so these importances describe that single fit.
feature_importances = (
    pd.Series(rf.feature_importances_, index=cols, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importances