In [165]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [184]:
# Load the dataset
df = pd.read_csv('../../datasets/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [170]:
# Drop the 'Time' column
df.drop(['Time'], axis=1, inplace=True)

# Create the feature matrix and target vector
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [171]:
X_train.columns, X_test.columns

(Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
        'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
        'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
       dtype='object'),
 Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
        'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
        'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
       dtype='object'))

In [172]:
# Create the scaler for standard scaling
sc = StandardScaler()

# Fit and transform the scaler on the training data
X_train_prep = sc.fit_transform(X_train)

# Transform the scaler on the testing data
X_test_prep = sc.transform(X_test)


In [173]:
# Create and train the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9995552538564172


In [174]:
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Get the percentages for each category
cm_percentages = cm / cm.sum(axis=1).reshape(-1, 1)

# Create the heatmap
fig = go.Figure(data=go.Heatmap(z=cm_percentages,
                                 x=['Predicted Normal', 'Predicted Fraud'],
                                 y=['Actual Normal', 'Actual Fraud'],
                                 colorscale='Blues'))

# Add the values to the heatmap
for i in range(len(cm_percentages)):
    for j in range(len(cm_percentages)):
        fig.add_annotation(x=j, y=i,
                           text=str(round(cm_percentages[i][j]*100, 2))+'%',
                           font=dict(color='white', size=12),
                           showarrow=False)

# Set the title and axis labels
fig.update_layout(title='Confusion Matrix',
                  xaxis_title='Predicted label',
                  yaxis_title='True label')

# Show the plot
fig.show()

In [175]:
df['Class'].value_counts()

0.0    284314
1.0       492
Name: Class, dtype: int64

In [192]:
from imblearn.over_sampling import SMOTE

# Create the SMOTE object
smote = SMOTE(sampling_strategy='minority')

# Apply the SMOTE technique on the training data
#X_train_resampled, y_train_resampled = smote.fit_resample(X_train[:int(len(X_train)*0.5)], y_train[:int(len(y_train)*0.5)])
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the number of samples in each class after resampling
print('Class distribution after resampling:')
print(y_train_resampled.value_counts())


Class distribution after resampling:
0.0    199011
1.0    199011
Name: Class, dtype: int64


In [177]:
# Create and train the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train_resampled, y_train_resampled)

# Predict on the testing data
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9995435500105334


In [201]:
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Get the percentages for each category
cm_percentages = cm / cm.sum(axis=1).reshape(-1, 1)

# Create the heatmap
fig = go.Figure(data=go.Heatmap(z=cm_percentages,
                                 x=['Predicted Normal', 'Predicted Fraud'],
                                 y=['Actual Normal', 'Actual Fraud'],
                                 colorscale='Blues'))

# Add the values to the heatmap
for i in range(len(cm_percentages)):
    for j in range(len(cm_percentages)):
        fig.add_annotation(x=j, y=i,
                           text=str(round(cm_percentages[i][j]*100, 2))+'%',
                           font=dict(color='white', size=12),
                           showarrow=False)

# Set the title and axis labels
fig.update_layout(title='Confusion Matrix',
                  xaxis_title='Predicted label',
                  yaxis_title='True label')

# Show the plot
fig.show()

In [188]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Instantiate the XGBClassifier with default parameters
xgb = XGBClassifier()

# Fit the model on the resampled training data
xgb.fit(X_train_resampled, y_train_resampled)

# Make predictions on the testing data
y_pred = xgb.predict(X_test)


# Evaluate the model
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9994148077058121


In [190]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85303
         1.0       0.80      0.86      0.83       139

    accuracy                           1.00     85442
   macro avg       0.90      0.93      0.91     85442
weighted avg       1.00      1.00      1.00     85442



In [200]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create the logistic regression model
lr = LogisticRegression(C=10, max_iter=10000)

# Fit the model on the training data
lr.fit(X_train_resampled, y_train_resampled)

# Make predictions on the testing data
y_pred = lr.predict(X_test_prep)

# Evaluate the model
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy}')

# Print the classification report
print(classification_report(y_test, y_pred_lr))


Accuracy: 0.9911284848201118
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     85303
         1.0       0.14      0.89      0.25       139

    accuracy                           0.99     85442
   macro avg       0.57      0.94      0.62     85442
weighted avg       1.00      0.99      0.99     85442



In [180]:
# Predict on new data
new_data = [[0.919136507326981,4.19963266902077,-7.53560659365729,7.42694037455047,1.11821533001319,-2.88672235145027,-1.34103599607144,0.363933268353337,-2.20322443100109,-4.13784019622865,4.57011280759332,-7.62916958856017,1.73391746350254,-9.44037499459843,-0.0233533498971062,-1.23395828198259,1.63200885764087,1.3157345644727,-0.287188855297045,0.535434663150233,0.316093599204254,0.0551792270194195,0.21069215028398,-0.417917881438145,-0.91118827562932,0.466523924588163,0.627393270472548,0.157851282541588,1]]
#new_data = ct.transform(new_data)
new_data = sc.transform(new_data)
new_prediction = rf.predict(new_data)

# Predict the probabilities for each class on the testing data
y_pred_proba = rf.predict_proba(new_data)
y_pred_proba = y_pred_proba[0][np.argmax(y_pred_proba)]

#Print prediction and probability
print(f'New prediction class: {new_prediction[0]} \nIt`s a {"fraud" if new_prediction == 1 else "normal"} transaction \nProbability: {y_pred_proba}')


New prediction class: 1.0 
It`s a fraud transaction 
Probability: 0.8


In [181]:
# Predict on new data
new_data = [[-1.07748253596416,3.22728520031616,-3.16812191714818,1.03393361325717,1.25923621386835,-0.850410022329757,0.564159506201422,0.130628988317531,2.16284863337447,0.945572814797711,2.32895860640553,-2.65341150645149,1.91848221202065,-2.51513817360048,-0.172250593538398,1.23262768383207,2.7623599845856,2.5750722485644,-0.0269610642051941,1.03199517012554,-0.606652552403105,-0.564416792547745,-0.0121783638965162,-1.46104727958803,0.00593034105538404,-0.422634382935921,0.22015699264168,-0.38768873033541,2.99]]
#new_data = ct.transform(new_data)
new_data = sc.transform(new_data)
new_prediction = rf.predict(new_data)

# Predict the probabilities for each class on the testing data
y_pred_proba = rf.predict_proba(new_data)
y_pred_proba = y_pred_proba[0][np.argmax(y_pred_proba)]

#Print prediction and probability
print(f'New prediction class: {new_prediction[0]} \nIt`s a {"fraud" if new_prediction == 1 else "normal"} transaction \nProbability: {y_pred_proba}')


New prediction class: 0.0 
It`s a normal transaction 
Probability: 0.99


In [182]:
# Predict on new data
new_data = [[-2.0642397521399,2.62973923034009,-0.748406253034646,0.694992040498764,0.418177985610222,1.39251973416781,-1.69780121912626,-6.33306461223925,1.72418428025944,-0.887241636246378,-1.59425779202416,-0.338775118165825,-0.978064513351916,-3.68882599540221,-1.48708342266887,0.526946020771924,2.34702289955703,1.69122000470716,-0.736110693106424,-1.42448619962422,6.21551399149032,-1.27690859567394,0.459861127330875,-1.05168547489323,0.209178360382973,-0.319859444506001,0.0154338685560936,-0.0501165001659193,8]]
#new_data = ct.transform(new_data)
new_data = sc.transform(new_data)
new_prediction = rf.predict(new_data)

# Predict the probabilities for each class on the testing data
y_pred_proba = rf.predict_proba(new_data)
y_pred_proba = y_pred_proba[0][np.argmax(y_pred_proba)]

#Print prediction and probability
print(f'New prediction class: {new_prediction[0]} \nIt`s a {"fraud" if new_prediction == 1 else "normal"} transaction \nProbability: {y_pred_proba}')


New prediction class: 0.0 
It`s a normal transaction 
Probability: 0.57
