In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw training data from the mounted Google Drive project folder.
dataframe = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/ML Project Taxi Fair/train.csv",
)

In [None]:
# Impute missing values: each remaining column gets its NaNs replaced by that
# column's own mean. The label, the trip id and the two time columns are left
# untouched.
excluded = ['label' ,'tripid', 'drop_time' ,'pickup_time']
columns = dataframe.drop(columns=excluded).columns.tolist()
for name in columns:
  column_mean = dataframe[name].mean()
  dataframe[name] = dataframe[name].fillna(column_mean)

In [None]:
# Persist the NaN-imputed frame in the same Drive folder as the raw file.
# index=False keeps the row index out of the CSV.
dataframe.to_csv(
    path_or_buf="/content/drive/My Drive/ML Project Taxi Fair/Train_without_nan.csv",
    index=False,
)

In [None]:
# Sanity check: number of missing values left in each column (summed down the rows).
dataframe.isna().sum(axis=0)

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

In [None]:
#Try to fit the data without any transformation

#Convert the labels to 1 and 0 before describing/modelling the data
#(1 = correct, 0 = incorrect)
def encoding_label(label):
  """Encode a trip label as an integer class.

  Args:
    label: the raw label value. The string 'correct' is the positive class.
      Values that are already encoded (0 or 1) are passed through unchanged.

  Returns:
    1 for 'correct' (or an already-encoded 1), otherwise 0.
  """
  if label == 'correct':
    return 1
  # Already-encoded labels pass through, so applying the encoding a second
  # time (e.g. re-running the cell) does not collapse every label to 0.
  if label in (0, 1):
    return int(label)
  return 0

# Replace the textual label column with its integer encoding.
encoded_labels = dataframe['label'].apply(encoding_label)
dataframe['label'] = encoded_labels.values

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


In [None]:
# Model inputs: X holds every column except the label, the trip id and the two
# time columns; Y holds the encoded label. 20% of the rows are held out for
# validation, with a fixed seed so the split is repeatable.
X = dataframe.drop(['label', 'tripid', 'drop_time', 'pickup_time'], axis=1).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Baseline: random forest with default hyper-parameters on the untransformed features.
# A fixed random_state makes the forest's random draws repeatable, so the scores
# can be reproduced and compared fairly with the later experiments.
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Mean accuracy of the fitted model on the held-out validation rows.
validation_accuracy = model.score(X_validation, Y_validation)
print("Accuracy score : {0}".format(validation_accuracy))

Accuracy score : 0.9420838183934808


In [None]:
# Predict the validation labels and report the F1 score of the positive class.
predictions = model.predict(X_validation)

# f1_score expects (y_true, y_pred) in that order.
print("F1 score {0}".format(f1_score(Y_validation, predictions)))

F1 score 0.9687941038105692


In [None]:
#So the raw data without any transformation gives roughly 0.94 accuracy and 0.97 F1 score on the validation split
#Let's transform the data with np.log because of the high skewness

#The label column is kept here; it is dropped again when the feature matrix is built
dataframe_tr = dataframe.drop(columns=['tripid', 'drop_time', 'pickup_time'])

In [None]:
#log(0) is undefined, so the transform applied to every cell is log(1 + x).
#np.log1p computes exactly that in one step and is more accurate for small x
#than adding 1 and then calling np.log.
#NOTE(review): a value <= -1 still produces NaN/-inf - confirm no column can go that low.
#NOTE(review): this reassigns dataframe_tr, so re-running the cell applies the transform again.
dataframe_tr = np.log1p(dataframe_tr)

In [None]:
#Experiment: fit and evaluate a random forest on the log-transformed features
X = dataframe_tr.drop(columns=['label']).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20, random_state=1)

#Seed the forest so the scores are reproducible and comparable between experiments
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)

predictions = model.predict(X_validation)
print("Accuracy score : {0}".format(model.score(X_validation, Y_validation)))
#f1_score expects (y_true, y_pred) in that order
print("F1 score {0}".format(f1_score(Y_validation, predictions)))

Accuracy score : 0.9415017462165308
F1 score 0.9684507926542144


In [None]:
#Experiment: np.log transform followed by min-max scaling

from sklearn.preprocessing import MinMaxScaler

#Split BEFORE scaling so the scaler never sees the validation rows
X_log = dataframe_tr.drop(columns=['label']).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X_log, Y, test_size=0.20, random_state=1)

#Fit the scaler on the training rows only, then apply the same scaling to both splits.
#Fitting on all rows would leak validation statistics into the preprocessing.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

#Keep X as the scaled version of the full feature matrix
X = scaler.transform(X_log)

#Seed the forest so the scores are reproducible and comparable between experiments
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)

predictions = model.predict(X_validation)
print("Accuracy score : {0}".format(model.score(X_validation, Y_validation)))
#f1_score expects (y_true, y_pred) in that order
print("F1 score {0}".format(f1_score(Y_validation, predictions)))

Accuracy score : 0.9420838183934808
F1 score 0.9688234372552091


In [None]:
#Experiment: np.log transform followed by standard scaling

from sklearn.preprocessing import StandardScaler

#Split BEFORE scaling so the scaler never sees the validation rows
X_log = dataframe_tr.drop(columns=['label']).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X_log, Y, test_size=0.20, random_state=1)

#Fit the scaler on the training rows only, then apply the same scaling to both splits.
#Fitting on all rows would leak validation statistics into the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

#Keep X as the scaled version of the full feature matrix
X = scaler.transform(X_log)

#Seed the forest so the scores are reproducible and comparable between experiments
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)

predictions = model.predict(X_validation)
print("Accuracy score : {0}".format(model.score(X_validation, Y_validation)))
#f1_score expects (y_true, y_pred) in that order
print("F1 score {0}".format(f1_score(Y_validation, predictions)))

Accuracy score : 0.9423748544819558
F1 score 0.9689265536723163


In [None]:
#Experiment: np.log transform followed by robust scaling

from sklearn.preprocessing import RobustScaler

#Split BEFORE scaling so the scaler never sees the validation rows
X_log = dataframe_tr.drop(columns=['label']).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X_log, Y, test_size=0.20, random_state=1)

#Fit the scaler on the training rows only, then apply the same scaling to both splits.
#Fitting on all rows would leak validation statistics into the preprocessing.
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

#Keep X as the scaled version of the full feature matrix
X = scaler.transform(X_log)

#Seed the forest so the scores are reproducible and comparable between experiments
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)

predictions = model.predict(X_validation)
print("Accuracy score : {0}".format(model.score(X_validation, Y_validation)))
#f1_score expects (y_true, y_pred) in that order
print("F1 score {0}".format(f1_score(Y_validation, predictions)))

Accuracy score : 0.9447031431897556
F1 score 0.9701820464532328


In [None]:
#Grid search over random-forest hyper-parameters on the np.log + min-max scaled features

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler


#'auto' is left out of max_features: for classifiers it is an alias of 'sqrt'
#(so it only duplicated work) and newer scikit-learn releases no longer accept it.
parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [18,20,25],
    'max_features' : ['sqrt','log2']
    }
####Min-max and log transformation start

#Split BEFORE scaling so the scaler never sees the validation rows
X_log = dataframe_tr.drop(columns=['label']).values
Y = dataframe['label'].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X_log, Y, test_size=0.20, random_state=1)

#Fit the scaler on the training rows only, then apply the same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

#Keep X as the scaled version of the full feature matrix
X = scaler.transform(X_log)

####end of transformation

#Seed the forest so every grid point is evaluated reproducibly
model = RandomForestClassifier(verbose=0, n_jobs=5, random_state=1)

clf = GridSearchCV(model, parameters)

# Fit the grid search (cross-validated on the training rows only)
clf.fit(X_train, Y_train)

# View the best parameters found for every searched hyper-parameter
print('Best Criterion:', clf.best_estimator_.get_params()['criterion'])
print('Best max_depth:', clf.best_estimator_.get_params()['max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['max_features'])
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best score : ", clf.best_score_)

# Evaluate the refitted best estimator on the held-out validation rows
model = clf.best_estimator_
predictions = model.predict(X_validation)
print("F1 Score : ", f1_score(Y_validation, predictions))
clf.best_estimator_


Best Criterion: gini
Best max_features: auto
Best score :  0.9406113537117904
F1 Score :  0.9698965192850423


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=5,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)