In [56]:
# The classification task is done in two parts
# 1st - train.csv is split into 80-20 train-test proportions and then trained and tested for accuracy
# 2nd - train.csv is used to train and prediction is done for test.csv

In [57]:
## 1st PART

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [59]:
data_train = pd.read_csv("data/train.csv")
print(data_train.shape)
data_train.columns

(17176, 14)


Index(['tripid', 'additional_fare', 'duration', 'meter_waiting',
       'meter_waiting_fare', 'meter_waiting_till_pickup', 'pickup_time',
       'drop_time', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'fare',
       'label'],
      dtype='object')

In [60]:
data_train["pickup_time"] = pd.to_datetime(data_train["pickup_time"], format="%m/%d/%Y %H:%M")
data_train["drop_time"] = pd.to_datetime(data_train["drop_time"], format="%m/%d/%Y %H:%M")

In [61]:
print(data_train[data_train["additional_fare"].isna()].shape)
print(data_train[data_train["duration"].isna()].shape)
print(data_train[data_train["meter_waiting"].isna()].shape)
print(data_train[data_train["meter_waiting_fare"].isna()].shape)
print(data_train[data_train["meter_waiting_till_pickup"].isna()].shape)
print(data_train[data_train["fare"].isna()].shape)

(202, 14)
(202, 14)
(202, 14)
(202, 14)
(202, 14)
(137, 14)


In [62]:
data_train.dropna(inplace=True)

In [63]:
print(data_train[data_train["additional_fare"].isna()].shape)
print(data_train[data_train["duration"].isna()].shape)
print(data_train[data_train["meter_waiting"].isna()].shape)
print(data_train[data_train["meter_waiting_fare"].isna()].shape)
print(data_train[data_train["meter_waiting_till_pickup"].isna()].shape)
print(data_train[data_train["fare"].isna()].shape)

(0, 14)
(0, 14)
(0, 14)
(0, 14)
(0, 14)
(0, 14)


In [64]:
data_train.sort_values(by=["tripid"], ascending=["True"], inplace=True)

In [65]:
data_train.drop(["pickup_time", "drop_time"], axis=1, inplace=True)

In [66]:
x = data_train.drop("label", axis=1)
y = data_train["label"]

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [68]:
svm_1 = SVC(gamma="scale")
svm_1 = svm_1.fit(x_train, y_train)

In [69]:
y_pred = svm_1.predict(x_test)

In [70]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[3074    0]
 [ 320    0]]
0.90571596935769


In [71]:
y_pred_df = pd.DataFrame(y_pred, columns=["prediction"])
y_pred_df["prediction"].value_counts()

correct    3394
Name: prediction, dtype: int64

In [72]:
pred = pd.DataFrame(y_pred_df["prediction"].replace({"correct": "1", "incorrect": "0"}))

In [73]:
pred["prediction"] = pred["prediction"].astype(int)

In [74]:
pred.set_index(x_test["tripid"], inplace=True)

In [75]:
pred.to_csv("data\pred_2_1 - train_only - svm.csv")

In [76]:
## 2nd PART

In [77]:
data_test = pd.read_csv("data/test.csv")

In [78]:
data_test.drop(["pickup_time", "drop_time"], axis=1, inplace=True)

In [79]:
print(data_test[data_test["additional_fare"].isna()].shape)
print(data_test[data_test["duration"].isna()].shape)
print(data_test[data_test["meter_waiting"].isna()].shape)
print(data_test[data_test["meter_waiting_fare"].isna()].shape)
print(data_test[data_test["meter_waiting_till_pickup"].isna()].shape)
print(data_test[data_test["fare"].isna()].shape)

(0, 11)
(0, 11)
(0, 11)
(0, 11)
(0, 11)
(0, 11)


In [80]:
data_test.sort_values(by=["tripid"], ascending=["True"], inplace=True)

In [81]:
X_TRAIN = data_train.drop("label", axis=1)
Y_TRAIN = data_train["label"]
X_PRED = data_test

In [82]:
svm_2 = SVC(gamma="scale")
svm_2 = svm_2.fit(X_TRAIN, Y_TRAIN)

In [83]:
Y_PRED = svm_2.predict(X_PRED)

In [84]:
Y_PRED_DF = pd.DataFrame(Y_PRED, columns=["prediction"])
Y_PRED_DF["prediction"].value_counts()

correct    8576
Name: prediction, dtype: int64

In [85]:
PRED = pd.DataFrame(Y_PRED_DF["prediction"].replace({"correct": "1", "incorrect": "0"}))

In [86]:
PRED.set_index(X_PRED["tripid"], inplace=True)

In [87]:
PRED.to_csv("data\pred_2_2 - train_test - svm.csv")