In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, recall_score, f1_score, log_loss
import matplotlib.pyplot as plt

# Load the transactions table from a CSV given by a relative path
DATA_FILE = 'card_transdata.csv'
df = pd.read_csv(DATA_FILE)

# Preview the frame, then report its (rows, columns) shape
display(df)
print(df.shape)

In [None]:
# Flag every row holding at least one missing value, then show those rows;
# an empty result means no row contains a NaN.
has_missing = df.isna().any(axis=1)
df.loc[has_missing]

In [None]:
# Shuffle the rows with a fixed seed. The train/test split further down is a
# plain positional cut (shuffle=False), so this shuffle alone decides which
# rows end up in each split; seeding it makes a fresh "Restart & Run All"
# reproduce the same split and the same metrics.
# NOTE: re-running this cell without reloading df permutes the already
# shuffled frame again, so reproducibility holds for top-to-bottom runs.
df = df.sample(frac=1, random_state=42)

# Features are the first seven columns (positions 0-6); the target is the
# column at position 7. The list indexer keeps y as a one-column DataFrame.
# NOTE(review): positional selection assumes the CSV column order is stable.
X = df.iloc[:, [0, 1, 2, 3, 4, 5, 6]]
y = df.iloc[:, [7]]
display(X)
display(y)

In [None]:
# Split into 80% training / 20% test.
# shuffle=False makes this a positional cut: the first 80% of rows become the
# training set and the remaining 20% the test set. random_state is not passed
# because it only seeds the shuffle and has no effect when shuffling is off;
# any randomness in the split comes from the row order of X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

# Check programmatically that every feature row is still paired with its own
# label (same index, same order) instead of relying on eyeballing the heads.
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test and y_test are misaligned"

print(X_train.head()) # check if x and y align to each data instance
print(y_train.head())

In [None]:
# Define an SGDClassifier using log loss (a linear model trained with
# stochastic gradient descent that exposes predict_proba).
# random_state fixes the data shuffling done during training so refitting on
# the same data yields the same coefficients.
# NOTE(review): SGD is sensitive to feature scale and the features are fed in
# unscaled here; consider standardising them if convergence is poor.
model = SGDClassifier(loss='log_loss', random_state=42)

# Train the model. The target is flattened to a 1-D array because the
# estimator expects shape (n_samples,); passing a one-column frame makes
# scikit-learn emit a DataConversionWarning. to_numpy().ravel() works whether
# y_train is a one-column DataFrame or a Series.
model.fit(X_train, y_train.to_numpy().ravel())



In [None]:
# Collect the fitted parameters in one flat list: the intercept first,
# followed by the coefficients from the first row of coef_.
parameters = [model.intercept_[0], *model.coef_[0]]
print("Parameters: %s" % parameters)

# For each split, compute the hard class labels and the per-class
# probabilities; the probabilities are needed later for the log loss.
train_pred = model.predict(X_train)
train_pred_proba = model.predict_proba(X_train)

test_pred = model.predict(X_test)
test_pred_proba = model.predict_proba(X_test)

In [None]:
# Evaluation metrics

def _score_split(y_true, y_pred, y_proba):
    """Return (accuracy, sensitivity, specificity, F1, log loss) for one split.

    Sensitivity is the recall of class 1 and specificity the recall of
    class 0. F1 uses binary averaging, and log loss is computed from the
    predicted class probabilities rather than from the hard labels.
    """
    return (
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred, average='binary', pos_label=1),
        recall_score(y_true, y_pred, average='binary', pos_label=0),
        f1_score(y_true, y_pred, average='binary'),
        log_loss(y_true, y_proba),
    )


# Training-split scores
(train_accuracy, train_sensitivity, train_specificity,
 train_f1, train_logloss) = _score_split(y_train, train_pred, train_pred_proba)

# Test-split scores
(test_accuracy, test_sensitivity, test_specificity,
 test_f1, test_logloss) = _score_split(y_test, test_pred, test_pred_proba)

In [None]:
# Summary table of the evaluation metrics: one row per split, one column per
# metric. The frame is built directly in its final orientation so the values
# keep a numeric dtype; going through a transposed frame with a text row mixes
# strings and floats and leaves every column as dtype `object`.
metric_names = ['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score', 'Log Loss']
metric = pd.DataFrame(
    [
        [train_accuracy, train_sensitivity, train_specificity, train_f1, train_logloss],
        [test_accuracy, test_sensitivity, test_specificity, test_f1, test_logloss],
    ],
    index=['Training', 'Test'],
    # Name the column axis so the table header still carries the 'Metrics' label.
    columns=pd.Index(metric_names, name='Metrics'),
)
display(metric)

In [None]:
# ROC curve and AUC on the test split
# Second column of predict_proba, used as the positive-class score.
# NOTE(review): columns follow model.classes_; presumably index 1 is label 1.
test_class_1_prob = test_pred_proba[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, test_class_1_prob)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve on an explicit Axes, with the diagonal as a reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
ax.legend()
plt.show()