In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Packages related to general operating system & warnings
import os
import warnings
warnings.filterwarnings('ignore')
#Packages related to data importing, manipulation, exploratory data #analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from termcolor import colored as cl # text customization
#Packages related to data visualizaiton
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Setting plot sizes and type of plot
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
plt.figure(figsize=(6,3))
plt.gray()
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<Figure size 600x300 with 0 Axes>

In [3]:
data=pd.read_csv("/content/drive/MyDrive/creditcard[1].csv")

In [4]:
Total_transactions = len(data)
normal = len(data[data.Class == 0])
fraudulent = len(data[data.Class == 1])
fraud_percentage = round(fraudulent/normal*100, 2)
print(cl('Total number of Trnsactions are {}'.format(Total_transactions), attrs = ['bold']))
print(cl('Number of Normal Transactions are {}'.format(normal), attrs = ['bold']))
print(cl('Number of fraudulent Transactions are {}'.format(fraudulent), attrs = ['bold']))
print(cl('Percentage of fraud Transactions is {}'.format(fraud_percentage), attrs = ['bold']))

Total number of Trnsactions are 284807
Number of Normal Transactions are 284315
Number of fraudulent Transactions are 492
Percentage of fraud Transactions is 0.17


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
min(data.Amount),max(data.Amount)

(0.0, 25691.16)

In [7]:
sc = StandardScaler()
amount = data['Amount'].values
data['Amount'] = sc.fit_transform(amount.reshape(-1, 1))

In [8]:
data.drop(['Time'], axis=1, inplace=True)

In [9]:
data.shape

(284807, 30)

In [10]:
data.drop_duplicates(inplace=True)

In [11]:
data.shape

(275663, 30)

In [12]:
X = data.drop('Class', axis = 1).values
y = data['Class'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [14]:
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
DT.fit(X_train, y_train)
dt_yhat = DT.predict(X_test)

In [15]:
dt_ac = accuracy_score(y_test, dt_yhat)
print('Accuracy score of the Decision Tree model is {}'.format(dt_ac))

Accuracy score of the Decision Tree model is 0.9991583957281328


In [16]:
dt_f1=f1_score(y_test, dt_yhat)
print('F1 score of the Decision Tree model is {}'.format(dt_f1))

F1 score of the Decision Tree model is 0.7521367521367521


In [17]:
confusion_matrix(y_test, dt_yhat, labels = [0, 1])
#confusion matrix : ( [[True trans - true predict, true trans - fraud predict],
#                     [Farud trans - true predict ,fraud trans -Fraud predict]])

array([[68770,    18],
       [   40,    88]])

In [18]:
fraud_indices = [i for i, label in enumerate(dt_yhat) if label == 1]

fraudulent_transactions = X_test[fraud_indices]
print("Fraudulent Transactions:")
print(fraud_indices)

print(len(fraudulent_transactions))

Fraudulent Transactions:
[215, 373, 768, 1170, 1366, 3619, 3975, 4503, 4808, 5296, 5318, 5673, 7833, 8656, 8941, 9147, 10119, 10246, 10290, 12299, 12597, 14194, 14413, 14453, 14742, 15333, 16341, 16713, 18875, 19724, 20633, 20694, 22670, 23000, 24511, 25302, 25913, 25962, 26217, 27304, 27921, 28364, 29105, 29852, 30084, 30263, 30352, 31294, 31331, 32473, 32533, 34168, 34547, 35096, 35123, 35265, 37457, 37900, 38412, 38998, 39233, 39292, 40318, 42505, 44229, 44382, 44976, 45985, 47349, 47964, 49131, 49817, 50390, 50880, 50994, 50995, 51215, 51219, 51351, 51983, 52134, 52221, 53283, 54756, 54958, 55021, 55122, 55309, 55446, 56056, 56141, 56722, 57190, 57751, 58131, 59001, 59665, 60047, 61730, 62531, 63363, 63852, 63945, 65747, 66270, 68606]
106


In [19]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score
def metrics(actuals, predictions):
    print("Accuracy: {:.5f}".format(accuracy_score(actuals, predictions)))
    print("Precision: {:.5f}".format(precision_score(actuals, predictions)))
    print("Recall: {:.5f}".format(recall_score(actuals, predictions)))
    print("F1-score: {:.5f}".format(f1_score(actuals, predictions)))

In [None]:
n = 7
KNN = KNeighborsClassifier(n_neighbors = n)
KNN.fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

In [None]:
knn_ac = accuracy_score(y_test, knn_yhat)
print('Accuracy score of the K-Nearest Neighbors model is {}'.format(knn_ac))

Accuracy score of the K-Nearest Neighbors model is 0.999288989494457


In [None]:
knn_f1=f1_score(y_test, knn_yhat)
print('F1 score of the K-Nearest Neighbors model is {}'.format(knn_f1))

F1 score of the K-Nearest Neighbors model is 0.7949790794979079


In [None]:
fraud_indices = [i for i, label in enumerate(knn_yhat) if label == 1]

fraudulent_transactions = X_test[fraud_indices]
print("Fraudulent Transactions:")
print(fraud_indices)

print(len(fraudulent_transactions))

Fraudulent Transactions:
[215, 1170, 1366, 3135, 3619, 3975, 4503, 4808, 5296, 5318, 5673, 7833, 8656, 8941, 9147, 9167, 10119, 10246, 10290, 11691, 12299, 12597, 14194, 14413, 14453, 14742, 15333, 16341, 16713, 18470, 18875, 19724, 20633, 20694, 22670, 23000, 24201, 24511, 25302, 25913, 25962, 26217, 27304, 27921, 28364, 29105, 29852, 30084, 30263, 30352, 31294, 31331, 32473, 34168, 34547, 35034, 35096, 35123, 35265, 37457, 38247, 38412, 38998, 39233, 39292, 39900, 40318, 42505, 44229, 44382, 44976, 45985, 46129, 47349, 47964, 49131, 49817, 50390, 50880, 50994, 51215, 51219, 51351, 51983, 52134, 52221, 53283, 54756, 54889, 54958, 55021, 55122, 55309, 55446, 56056, 56141, 56722, 57190, 57751, 58131, 59001, 62468, 62531, 62689, 63363, 63852, 63945, 65747, 66270, 67733, 68606]
111


In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

In [None]:
lr_ac = accuracy_score(y_test, lr_yhat)
print('Accuracy score of the Logistic Regression model is {}'.format(lr_ac))

Accuracy score of the Logistic Regression model is 0.9989552498694062


In [None]:
lr_f1 = f1_score(y_test, lr_yhat)
print('F1 score of the Logistic Regression model is {}'.format(lr_f1))

F1 score of the Logistic Regression model is 0.6666666666666666


In [None]:
fraud_indices = [i for i, label in enumerate(lr_yhat) if label == 1]

fraudulent_transactions = X_test[fraud_indices]
print("Fraudulent Transactions:")
print(fraud_indices)

print(len(fraudulent_transactions))

Fraudulent Transactions:
[215, 373, 1366, 3135, 3619, 4808, 5296, 5673, 8656, 9147, 10119, 10246, 11691, 12597, 14413, 14453, 14742, 15333, 16341, 16713, 18470, 18875, 19724, 20633, 20694, 23000, 24132, 24511, 25302, 25962, 28364, 29095, 29105, 29852, 30084, 30263, 31294, 31331, 31771, 32473, 34168, 34547, 35096, 35123, 37457, 37900, 38412, 39233, 39292, 39900, 40318, 42505, 44229, 44976, 45985, 46129, 46747, 47349, 47964, 48183, 49131, 50880, 50994, 50995, 51215, 51219, 51351, 52134, 52221, 53283, 54756, 54889, 54958, 55013, 55021, 55122, 55309, 55446, 56056, 57751, 58131, 60047, 60960, 62531, 62689, 63363, 65747, 68606]
88


In [None]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score
def metrics(actuals, predictions):
    print("Accuracy: {:.5f}".format(accuracy_score(actuals, predictions)))
    print("Precision: {:.5f}".format(precision_score(actuals, predictions)))
    print("Recall: {:.5f}".format(recall_score(actuals, predictions)))
    print("F1-score: {:.5f}".format(f1_score(actuals, predictions)))

In [None]:
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

In [None]:
svm_ac = accuracy_score(y_test, svm_yhat)
print('Accuracy score of the Support Vector Machines model is {}'.format(svm_ac))

In [None]:
svm_f1 = f1_score(y_test, svm_yhat)
print('F1 score of the Support Vector Machines model is {}'.format(svm_f1))

In [None]:
fraud_indices = [i for i, label in enumerate(svm_yhat) if label == 1]

fraudulent_transactions = X_test[fraud_indices]
print("Fraudulent Transactions:")
print(fraud_indices)

print(len(fraudulent_transactions))

In [None]:
rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

In [None]:
rf_ac = accuracy_score(y_test, rf_yhat)
print('Accuracy score of the Random Forest model is {}'.format(rf_ac))

In [None]:
rf_f1 = f1_score(y_test, rf_yhat)
print('F1 score of the Random Forest model is {}'.format(rf_f1))

In [None]:
fraud_indices = [i for i, label in enumerate(rf_yhat) if label == 1]

fraudulent_transactions = X_test[fraud_indices]
print("Fraudulent Transactions:")
print(fraud_indices)

print(len(fraudulent_transactions))

In [None]:
import itertools
def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
    title = 'Confusion Matrix of {}'.format(title)
    if normalize:
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools . product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix for the models

dt_matrix = confusion_matrix(y_test, dt_yhat, labels = [0, 1]) # Decision Tree
knn_matrix = confusion_matrix(y_test, knn_yhat, labels = [0, 1]) # K-Nearest Neighbors
lr_matrix = confusion_matrix(y_test, lr_yhat, labels = [0, 1]) # Logistic Regression
svm_matrix = confusion_matrix(y_test, svm_yhat, labels = [0, 1]) # Support Vector Machine
rf_matrix = confusion_matrix(y_test, rf_yhat, labels = [0, 1]) # Random Forest Tree

# Plot the confusion matrix

plt.rcParams['figure.figsize'] = (6, 6)

# 1. Decision tree

dt_cm_plot = plot_confusion_matrix(dt_matrix,
                                classes = ['Non-Default(0)','Default(1)'],
                                normalize = False, title = 'DT')
plt.savefig('dt_cm_plot.png')
plt.show()

# 2. K-Nearest Neighbors

knn_cm_plot = plot_confusion_matrix(knn_matrix,
                                classes = ['Non-Default(0)','Default(1)'],
                                normalize = False, title = 'KNN')
plt.savefig('knn_cm_plot.png')
plt.show()

# 3. Logistic regression

lr_cm_plot = plot_confusion_matrix(lr_matrix,
                                classes = ['Non-Default(0)','Default(1)'],
                                normalize = False, title = 'LR')
plt.savefig('lr_cm_plot.png')
plt.show()

# 4. Support Vector Machine

svm_cm_plot = plot_confusion_matrix(svm_matrix,
                                classes = ['Non-Default(0)','Default(1)'],
                                normalize = False, title = 'SVM')
plt.savefig('svm_cm_plot.png')
plt.show()

# 5. Random forest tree

rf_cm_plot = plot_confusion_matrix(rf_matrix,
                                classes = ['Non-Default(0)','Default(1)'],
                                normalize = False, title = 'RF')
plt.savefig('rf_cm_plot.png')
plt.show()



In [None]:
import numpy as np

# Accuracy scores and F1 scores of the five algorithms
accuracy_scores = {
   "Random Forest": rf_ac,
    "Decision Tree": dt_ac,
    "Logistic Regression": lr_ac,
    "KNN": knn_ac,
    "SVM": svm_ac
}

f1_scores = {
    "Random Forest": rf_f1,
    "Decision Tree": dt_f1,
    "Logistic Regression": lr_f1,
    "KNN": knn_f1,
    "SVM": svm_f1
}


# Create scatter plot
plt.figure(figsize=(10, 6))
for algorithm, accuracy in accuracy_scores.items():
    f1 = f1_scores[algorithm]
    plt.scatter(accuracy, f1, label=algorithm)

# Add labels, title, and legend
plt.xlabel('Accuracy')
plt.ylabel('F1 Score')
plt.title('Accuracy vs F1 Score of Different Machine Learning Algorithms')
plt.legend()

# Add grid for better visualization
plt.grid(True)

plt.show()
# Find the algorithm with the highest accuracy
best_accuracy_algorithm = max(accuracy_scores, key=accuracy_scores.get)
best_accuracy_score = accuracy_scores[best_accuracy_algorithm]

# Find the algorithm with the highest F1 score
best_f1_algorithm = max(f1_scores, key=f1_scores.get)
best_f1_score = f1_scores[best_f1_algorithm]

# Print the algorithm with the highest accuracy
print(f"The algorithm with the highest accuracy is {best_accuracy_algorithm} with an accuracy of {best_accuracy_score:.5f}")

# Print the algorithm with the highest F1 score
print(f"The algorithm with the highest F1 score is {best_f1_algorithm} with an F1 score of {best_f1_score:.5f}")

