In [60]:
# The classification task is done in two parts
# 1st - train.csv is split into 80-20 train-test proportions and then trained and tested for accuracy
# 2nd - train.csv is used to train and prediction is done for test.csv

In [61]:
## 1st PART

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [63]:
data_train = pd.read_csv("data/train.csv")
print(data_train.shape)
data_train.columns

(17176, 14)


Index(['tripid', 'additional_fare', 'duration', 'meter_waiting',
       'meter_waiting_fare', 'meter_waiting_till_pickup', 'pickup_time',
       'drop_time', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'fare',
       'label'],
      dtype='object')

In [64]:
data_train["pickup_time"] = pd.to_datetime(data_train["pickup_time"], format="%m/%d/%Y %H:%M")
data_train["drop_time"] = pd.to_datetime(data_train["drop_time"], format="%m/%d/%Y %H:%M")

In [65]:
print(data_train[data_train["additional_fare"].isna()].shape)
print(data_train[data_train["duration"].isna()].shape)
print(data_train[data_train["meter_waiting"].isna()].shape)
print(data_train[data_train["meter_waiting_fare"].isna()].shape)
print(data_train[data_train["meter_waiting_till_pickup"].isna()].shape)
print(data_train[data_train["fare"].isna()].shape)

(202, 14)
(202, 14)
(202, 14)
(202, 14)
(202, 14)
(137, 14)


In [66]:
data_train.dropna(inplace=True)

In [67]:
print(data_train[data_train["additional_fare"].isna()].shape)
print(data_train[data_train["duration"].isna()].shape)
print(data_train[data_train["meter_waiting"].isna()].shape)
print(data_train[data_train["meter_waiting_fare"].isna()].shape)
print(data_train[data_train["meter_waiting_till_pickup"].isna()].shape)
print(data_train[data_train["fare"].isna()].shape)

(0, 14)
(0, 14)
(0, 14)
(0, 14)
(0, 14)
(0, 14)


In [68]:
data_train.sort_values(by=["tripid"], ascending=["True"], inplace=True)

In [69]:
data_train.drop(["pickup_time", "drop_time"], axis=1, inplace=True)

In [70]:
x = data_train.drop("label", axis=1)
y = data_train["label"]

In [71]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [96]:
logreg_1 = LogisticRegression(solver="saga", C=0.01, penalty="l2")
logreg_1 = logreg_1.fit(x_train, y_train)



In [97]:
y_pred = logreg_1.predict(x_test)

In [98]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average="macro"))

[[3095    0]
 [ 299    0]]
0.9119033588685916
0.4769610109415935


  'precision', 'predicted', average, warn_for)


In [99]:
y_pred_df = pd.DataFrame(y_pred, columns=["prediction"])
y_pred_df["prediction"].value_counts()

correct    3394
Name: prediction, dtype: int64

In [76]:
pred = pd.DataFrame(y_pred_df["prediction"].replace({"correct": "1", "incorrect": "0"}))

In [77]:
pred["prediction"] = pred["prediction"].astype(int)

In [78]:
pred.set_index(x_test["tripid"], inplace=True)

In [79]:
pred.to_csv("data\pred_3_1 - train_only - logistic_regression.csv")

In [80]:
## 2nd PART

In [81]:
data_test = pd.read_csv("data/test.csv")

In [82]:
data_test.drop(["pickup_time", "drop_time"], axis=1, inplace=True)

In [83]:
print(data_test[data_test["additional_fare"].isna()].shape)
print(data_test[data_test["duration"].isna()].shape)
print(data_test[data_test["meter_waiting"].isna()].shape)
print(data_test[data_test["meter_waiting_fare"].isna()].shape)
print(data_test[data_test["meter_waiting_till_pickup"].isna()].shape)
print(data_test[data_test["fare"].isna()].shape)

(0, 11)
(0, 11)
(0, 11)
(0, 11)
(0, 11)
(0, 11)


In [84]:
data_test.sort_values(by=["tripid"], ascending=["True"], inplace=True)

In [85]:
X_TRAIN = data_train.drop("label", axis=1)
Y_TRAIN = data_train["label"]
X_PRED = data_test

In [86]:
logreg_2 = LogisticRegression(solver="lbfgs", C=0.01, penalty="l2")
logreg_2 = logreg_2.fit(X_TRAIN, Y_TRAIN)

In [87]:
Y_PRED = logreg_2.predict(X_PRED)

In [88]:
Y_PRED_DF = pd.DataFrame(Y_PRED, columns=["prediction"])
Y_PRED_DF["prediction"].value_counts()

correct    8576
Name: prediction, dtype: int64

In [89]:
PRED = pd.DataFrame(Y_PRED_DF["prediction"].replace({"correct": "1", "incorrect": "0"}))

In [90]:
PRED.set_index(X_PRED["tripid"], inplace=True)

In [91]:
PRED.to_csv("data\pred_3_2 - train_test - logistic_regression.csv")