In [None]:
# Import libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve

In [None]:
# Load data

# Raw string literal: the Windows path contains backslashes followed by letters
# (\L, \M, \P, \d, \c) which a normal string literal would try to interpret as
# escape sequences. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific location; point it at
# your own copy of the CSV before running on another machine.
DATA_PATH = r'D:\Lecture Notes\Machine Learning\Project\data\creditcard.csv'

df = pd.read_csv(DATA_PATH)

print("Data - rows: ", df.shape[0], "columns: ", df.shape[1])

# Preview the first rows (displayed as the cell's output)
df.head()

In [None]:
# Count the missing entries in each column

missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Check for data imbalance: how many rows fall into each 'Class' value?

class_distribution = df['Class'].value_counts()
print(class_distribution)

# Bar chart of the same counts, saved to disk and then displayed
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Class', data=df, ax=ax)
ax.set_title("Class Distribution")
fig.savefig('class_distribution.png', dpi=300)
plt.show()


# Observation: the data is highly unbalanced

In [None]:
# Box plot of every column of df, drawn side by side on one set of axes

fig, ax = plt.subplots(figsize=(12, 10))
df.boxplot(ax=ax)
plt.xticks(rotation=45)  # tilt the column labels so they do not overlap
fig.tight_layout()
plt.show()

# Author's observation: all numerical features are standardized (their mean is 0)
# NOTE(review): confirm this holds for every column, not only the V* ones.

In [None]:
# Pairwise correlation of all columns, shown as an annotated heatmap

corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Observation: there is almost no correlation between V1-V28

In [None]:
# Look for outliers in 'Amount', split by 'Class'

fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
ax.set_title("Outliers Detection")
plt.show()

# Work on a copy holding only the two columns of interest, then pull out the
# 'Amount' values belonging to each class label.
tmp_df = df[['Amount', 'Class']].copy()
class_0 = tmp_df.loc[tmp_df['Class'] == 0, 'Amount']
class_1 = tmp_df.loc[tmp_df['Class'] == 1, 'Amount']

# Summary statistics for Class == 0 (displayed as the cell's output)
class_0.describe()

In [None]:
# Summary statistics of 'Amount' for the rows with Class == 1, to be read
# side by side with the class_0 summary.
class_1.describe()

# Author's reading of the two summaries (Class 0 = real, Class 1 = fraudulent):
# real transactions have a larger mean value, larger Q1, smaller Q3 and Q4 and larger outliers;
# fraudulent transactions have a smaller Q1 and mean, larger Q4 and smaller outliers.
# NOTE(review): "Q4" is ambiguous here (presumably the maximum, which would
# conflict with the statement about outliers) - re-check these claims against
# the describe() outputs.

In [None]:
# Split into train/test sets, scale, and tune an SVM with a small grid search

# Target is the 'Class' column; every other column is a predictor.
y = df['Class']
x = df.drop(columns='Class')

# 80/20 split, stratified on the target so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Scale the features: statistics are learned on the training part only and
# then applied unchanged to the test part.
scaler = StandardScaler().fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Grid search: two kernels at a single C / gamma setting, ranked by recall.
param_grid = [
    {
        'C': [1],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale'],
    },
]

svm_clf = SVC(random_state=42)

grid_search = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grid,
    scoring='recall',
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the winning estimator and predict on the held-out test set.
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)


In [None]:
# Text report of the test-set predictions, followed by the confusion matrix
print("Best Model - Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap, save it, then display it
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.savefig('confusion_matrix.png', dpi=300)
plt.show()


In [1]:
# plot feature importance

# took some help from https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn

# Labels for the 30 predictors, in column order: 'Time', then 'V1'..'V28', then 'Amount'.
feature_names = ['Time'] + ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

def plot_importances(coef, names):
    """Draw, save and show a horizontal bar chart of per-feature coefficients.

    Bars are ordered by ascending coefficient value (equal values are ordered
    by name). The chart is written to 'feature_importance.png' and then
    displayed.

    Parameters
    ----------
    coef : one-dimensional sequence of numbers
        One coefficient per feature.
    names : sequence of str
        Feature labels; must have the same length as ``coef``.

    Raises
    ------
    ValueError
        If ``coef`` and ``names`` have different lengths, or if they are empty.
    """
    importances = list(coef)
    names = list(names)

    # zip() would silently drop the surplus items of the longer sequence and
    # produce a partial chart, so reject mismatched input explicitly.
    if len(importances) != len(names):
        raise ValueError(
            "coef and names must have the same length "
            "(got {} and {})".format(len(importances), len(names))
        )
    if not importances:
        raise ValueError("coef and names must not be empty")

    # Sort the (value, name) pairs together so labels stay attached to bars.
    importances, names = zip(*sorted(zip(importances, names)))
    plt.barh(range(len(names)), importances, align='center')
    plt.yticks(range(len(names)), names)
    plt.xlabel('Feature Importance')
    plt.ylabel('Features')
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
    

# SVC only exposes coef_ when it was fitted with kernel='linear'. The grid
# search also tries 'rbf', so check which kernel won before reading coef_.
if best_svm.kernel == 'linear':
    plot_importances(best_svm.coef_.ravel(), feature_names)
else:
    print(
        "Skipping feature-importance plot: the best model uses kernel='{}', "
        "and coef_ is only available for kernel='linear'.".format(best_svm.kernel)
    )

In [2]:
# plot the precision recall curve

# Continuous decision scores for the test set (not hard 0/1 predictions)
y_scores = best_svm.decision_function(X_test_scaled)

precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Recall on the x-axis, precision on the y-axis; save the figure, then show it
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.')
ax.set_xlabel('Recall (Sensitivity)')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.grid(False)
fig.savefig('precision_recall_curve.png', dpi=300)
plt.show()