In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training table from disk and preview its first five rows.
# NOTE(review): if train.csv was written together with its row index, pandas will
# surface that index as an extra "Unnamed: 0" column which would then be used as
# a feature further down -- confirm against the raw file.
train_csv_path = 'train.csv'
train = pd.read_csv(train_csv_path)
train.head(5)

Unnamed: 0,Feat-1,Feat-2,Feat-3,Feat-4,Feat-5,Feat-6,Feat-7,Feat-8,Feat-9,Feat-10,...,Feat-22,Feat-23,Feat-24,Feat-25,Feat-26,Feat-27,Feat-28,Feat-29,Feat-30,Target
0,0.412963,0.5772,-0.953418,0.79884,0.658006,0.768405,-0.405588,-0.442541,-0.001558,1.049287,...,0.009059,0.646514,0.860895,-2.411875,0.994851,1.025006,0.579802,0.803897,0.904436,0.0
1,0.591946,1.21028,1.718345,3.05379,-1.179496,1.10955,0.775757,0.153515,-0.001558,0.555115,...,0.009059,1.478546,1.046527,0.503741,1.002365,0.842098,0.71303,0.777919,1.009964,0.0
2,1.907598,0.455454,-1.252488,0.215091,0.001179,0.489095,2.519551,0.292663,-0.001558,0.460814,...,3.11102,0.886526,0.740503,1.110159,0.998776,0.704871,0.493281,0.594564,0.580279,0.0
3,1.891311,-1.178866,1.03531,1.221389,1.481984,0.80459,1.561999,-0.699774,-0.001558,0.649795,...,3.11102,1.082386,0.92854,-0.060958,0.988557,0.699047,0.974086,1.213914,1.480364,0.0
4,1.550914,-0.886675,1.014226,0.666486,-0.244051,0.763458,-0.730383,-1.467584,2.171518,0.957251,...,0.009059,0.915975,0.941412,0.527787,1.308168,1.02223,0.592647,0.897614,0.92763,0.0


In [4]:
# Separate the target vector from the feature matrix; both become NumPy arrays.
# NOTE: `train` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run on its own without first re-running the cell that loads the CSV.
target_column = 'Target'
labels = train[target_column].to_numpy()
train = train.drop(columns=[target_column]).to_numpy()

In [5]:
# standard normalize dataset using sklearn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train = scaler.fit_transform(train)

In [6]:
# split the dataset into train and val
from sklearn.model_selection import train_test_split

train, val, labels_train, labels_val = train_test_split(train, labels, test_size=0.3)

In [7]:
# Fit a baseline logistic-regression classifier (library defaults) on the
# training split.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=train, y=labels_train)

In [8]:
# Mean accuracy of the baseline model on the training and validation splits
train_accuracy = model.score(train, labels_train)
val_accuracy = model.score(val, labels_val)

print("Train accuracy: ", train_accuracy)
print("Validation accuracy: ", val_accuracy)

Train accuracy:  0.64105
Validation accuracy:  0.6421583333333334


In [9]:
# Log loss of the baseline model, computed from predicted class probabilities
# on the training and validation splits
from sklearn.metrics import log_loss

train_proba = model.predict_proba(train)
val_proba = model.predict_proba(val)

print("Train loss: ", log_loss(labels_train, train_proba))
print("Validation loss: ", log_loss(labels_val, val_proba))

Train loss:  0.6382058086749699
Validation loss:  0.6373053267686201


In [11]:
# precision, recall, f1-score, confusion matrix on train and val dataset using sklearn
from sklearn.metrics import classification_report, confusion_matrix

# Run inference once per split and reuse the predictions for both the report
# and the confusion matrix instead of calling model.predict for every metric.
train_pred = model.predict(train)
val_pred = model.predict(val)

print("Train classification report: \n", classification_report(labels_train, train_pred))
print("Validation classification report: \n", classification_report(labels_val, val_pred))

print("Train confusion matrix: \n", confusion_matrix(labels_train, train_pred))
print("Validation confusion matrix: \n", confusion_matrix(labels_val, val_pred))


Train classification report: 
               precision    recall  f1-score   support

         0.0       0.64      0.53      0.58    131725
         1.0       0.64      0.74      0.69    148275

    accuracy                           0.64    280000
   macro avg       0.64      0.63      0.63    280000
weighted avg       0.64      0.64      0.64    280000

Validation classification report: 
               precision    recall  f1-score   support

         0.0       0.65      0.53      0.58     56449
         1.0       0.64      0.74      0.69     63551

    accuracy                           0.64    120000
   macro avg       0.64      0.64      0.63    120000
weighted avg       0.64      0.64      0.64    120000

Train confusion matrix: 
 [[ 69449  62276]
 [ 38230 110045]]
Validation confusion matrix: 
 [[29714 26735]
 [16206 47345]]


In [46]:
# Reduce dimensionality with PCA: the projection is learned on the training
# split and then applied unchanged to the validation split.
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
pca = PCA(n_components=0.8)
train_pca, val_pca = pca.fit_transform(train), pca.transform(val)

In [47]:
# Fit logistic regression on the PCA-reduced training features.
# NOTE: this rebinds `model`, so the classifier fitted earlier on the
# non-reduced features is no longer reachable under that name.
model = LogisticRegression()
model.fit(X=train_pca, y=labels_train)

In [48]:
# Accuracy and log loss of the PCA-based model on the training and validation splits
train_pca_accuracy = model.score(train_pca, labels_train)
val_pca_accuracy = model.score(val_pca, labels_val)
print("Train accuracy: ", train_pca_accuracy)
print("Validation accuracy: ", val_pca_accuracy)

train_pca_proba = model.predict_proba(train_pca)
val_pca_proba = model.predict_proba(val_pca)
print("Train loss: ", log_loss(labels_train, train_pca_proba))
print("Validation loss: ", log_loss(labels_val, val_pca_proba))

Train accuracy:  0.602675
Validation accuracy:  0.602875
Train loss:  0.6589504242333526
Validation loss:  0.6583924435210055


In [49]:
# Per-class precision, recall and f1-score of the PCA-based model on the
# training and validation splits
train_pca_pred = model.predict(train_pca)
val_pca_pred = model.predict(val_pca)

print("Train classification report: \n", classification_report(labels_train, train_pca_pred))
print("Validation classification report: \n", classification_report(labels_val, val_pca_pred))

Train classification report: 
               precision    recall  f1-score   support

         0.0       0.59      0.49      0.54    131725
         1.0       0.61      0.70      0.65    148275

    accuracy                           0.60    280000
   macro avg       0.60      0.60      0.59    280000
weighted avg       0.60      0.60      0.60    280000

Validation classification report: 
               precision    recall  f1-score   support

         0.0       0.59      0.49      0.54     56449
         1.0       0.61      0.70      0.65     63551

    accuracy                           0.60    120000
   macro avg       0.60      0.60      0.59    120000
weighted avg       0.60      0.60      0.60    120000

