In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation notices (e.g. for estimator
# arguments used further down) — consider narrowing the filter to specific
# categories/modules once the notebook is stable.
warnings.filterwarnings("ignore")



# Location of the training CSV (presumably a Colab-mounted Google Drive folder —
# adjust when running elsewhere).
TRAIN_CSV_PATH = '/content/drive/My Drive/ML Project Taxi Fair/train.csv'

# Read the raw trips table and preview the first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [None]:
# Parse the pickup timestamp once, store it back, and derive the hour of day
# from the parsed values as an extra feature column.
pickup_timestamps = pd.to_datetime(df['pickup_time'])
df['pickup_time'] = pickup_timestamps
df['pickup_hour'] = pickup_timestamps.dt.hour

In [None]:
# Percentage of missing values per column: NaN count divided by the row count.
missing_counts = df.isna().sum()
row_count = df.shape[0]
missing_counts / row_count * 100

tripid                       0.000000
additional_fare              1.176060
duration                     1.176060
meter_waiting                1.176060
meter_waiting_fare           1.176060
meter_waiting_till_pickup    1.176060
pickup_time                  0.000000
drop_time                    0.000000
pick_lat                     0.000000
pick_lon                     0.000000
drop_lat                     0.000000
drop_lon                     0.000000
fare                         0.797625
label                        0.000000
pickup_hour                  0.000000
dtype: float64

In [None]:
# Remove every row that has at least one missing value, then drop the raw
# timestamp and identifier columns, which are not used as model inputs below.
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes the column drop a no-op when the columns are already
# gone, so re-running this cell no longer raises KeyError.
df = (
    df
    .dropna()
    .drop(columns=['pickup_time', 'drop_time', 'tripid'], errors='ignore')
)

In [None]:
feature_columns = df.columns

# Apply log(1 + x) to every column except the target. np.log1p is the
# dedicated NumPy routine for this and stays accurate for values close to 0;
# operating on the whole sub-frame replaces the per-column Python loop.
numeric_columns = [column for column in feature_columns if column != "label"]

# NOTE: this overwrites the columns of `df`, so running the cell a second time
# applies the transform again. Re-run from the loading cell if in doubt.
df[numeric_columns] = np.log1p(df[numeric_columns])

df.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,pickup_hour
0,2.442347,6.727432,4.043051,0.0,4.174387,2.062107,4.393205,2.06728,4.392946,5.603299,correct,0.0
1,2.442347,6.674561,3.871201,0.0,4.905275,2.065075,4.393194,2.068599,4.393119,5.292551,correct,0.0
2,2.442347,6.992096,4.394449,0.0,4.127134,2.067924,4.392782,2.071496,4.393394,5.712544,correct,0.693147
3,2.442347,6.395262,5.605802,2.813239,4.234107,2.070111,4.393084,2.070335,4.393178,4.422449,correct,1.098612
5,2.442347,8.133881,5.209486,0.0,4.727388,2.096055,4.393176,2.069221,4.39278,6.971687,correct,1.791759


In [None]:
def transform_label(x):
  """Encode a trip label as a binary integer target.

  Args:
    x: A raw label value such as the string "correct", or a value that has
      already been encoded by this function (1 or 0).

  Returns:
    1 if ``x`` is "correct" (or is already the encoded value 1), otherwise 0.
  """
  # Accepting an already-encoded 1 keeps the mapping stable when the encoding
  # cell is executed more than once; previously a second run turned every
  # label into 0 because 1 != "correct".
  if x == "correct" or x == 1:
    return 1
  return 0

# Replace the textual label column with its integer encoding, element by element.
encoded_labels = df['label'].apply(transform_label)
df['label'] = encoded_labels

In [None]:
# Preview the first five rows to confirm the label column is now numeric.
df.head(n=5)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,pickup_hour
0,2.442347,6.727432,4.043051,0.0,4.174387,2.062107,4.393205,2.06728,4.392946,5.603299,1,0.0
1,2.442347,6.674561,3.871201,0.0,4.905275,2.065075,4.393194,2.068599,4.393119,5.292551,1,0.0
2,2.442347,6.992096,4.394449,0.0,4.127134,2.067924,4.392782,2.071496,4.393394,5.712544,1,0.693147
3,2.442347,6.395262,5.605802,2.813239,4.234107,2.070111,4.393084,2.070335,4.393178,4.422449,1,1.098612
5,2.442347,8.133881,5.209486,0.0,4.727388,2.096055,4.393176,2.069221,4.39278,6.971687,1,1.791759


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# One seed for both the split and the forest so the printed score is
# reproducible from a fresh kernel.
RANDOM_STATE = 1

# Feature matrix (every column except the target) and target vector.
X = df.drop(columns=['label']).values
Y = df['label'].values

# Split first, then fit the scaler on the training part only, so that no
# statistics from the validation rows leak into preprocessing.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.20, random_state=RANDOM_STATE
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)

# max_features='sqrt' is the explicit spelling of what 'auto' meant for
# classifiers; 'auto' is rejected by recent scikit-learn releases.
model = RandomForestClassifier(
    criterion='entropy',
    max_features='sqrt',
    n_estimators=200,
    max_depth=18,
    random_state=RANDOM_STATE,
)
model.fit(X_train, Y_train)

# f1_score expects (y_true, y_pred) in that order.
Y_predicted = model.predict(X_validation)
print("F1 Score : {0}".format(f1_score(Y_validation, Y_predicted)))

F1 Score : 0.9674976333228148


In [None]:
# NOTE(review): bare literal with no computation — presumably an F1 score
# copied from an earlier run of the training cell for comparison; confirm, and
# consider moving it into a markdown note instead of a code cell.
0.9674359785014228