In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Column names previously hard-coded here (presumably the CSV header -- verify):
# cols = ['typeofaction', 'sourceid', 'destinationid', 'amountofmoney', 'date', 'isfraud', 'typeoffraud']
# Load the transactions file, taking the first row as the header, and keep the
# column index so later cells can select columns by position.
df = pd.read_csv('banktransaction.csv', header=0)
cols = df.columns

FileNotFoundError: [Errno 2] No such file or directory: 'banktransaction.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the column labels.
df.columns

In [None]:
# Print the frame summary, then display the number of missing values per column.
df.info()
df.isnull().sum()

In [None]:
# (rows, columns) of the dataset.
df.shape


In [None]:
# Distinct values found in the 'typeofaction' column.
df['typeofaction'].unique()

In [None]:
# Number of rows for each value of 'typeofaction'.
type_of_action = df['typeofaction'].value_counts()

In [None]:
# Labels of the counted transaction types (the index of the counts).
transaction = type_of_action.index

In [None]:
# The counts themselves, as a plain array aligned with `transaction`.
quantity = type_of_action.values

In [None]:
import plotly.express as px

In [None]:
# Donut chart of how many transactions fall into each type.
# NOTE(review): `values` and `names` are the arrays derived from value_counts(),
# not columns of `df` -- confirm whether passing `df` here is needed at all.
px.pie(df,values=quantity,names=transaction, hole=0.4, title='Type of transaction')

In [None]:
# df = df.dropna()

In [None]:
# Display the current state of the frame.
df

In [None]:
# Count the rows of each class. A boolean mask has to be summed to count its
# True entries; calling len() on it returns the total number of rows instead,
# which previously made the "No Fraud" bar show the size of the whole dataset.
fraud_count = (df['isfraud'] == 1).sum()
no_fraud_count = (df['isfraud'] == 0).sum()

print(fraud_count)
print(no_fraud_count)

# Create a bar plot to display the class imbalance
plt.bar(['Fraud', 'No Fraud'], [fraud_count, no_fraud_count], color=['red', 'blue'])
plt.xlabel('Class')
plt.ylabel('Number of Samples')
plt.title('Class Imbalance in the Dataset')
plt.show()


In [None]:
# Show the per-type transaction counts computed earlier.
type_of_action

In [None]:
# replace trasnfer and cash-in values with numbers
# Encode the transaction types as numbers: 'transfer' -> 1, 'cash-in' -> 2.
# The replacement is applied in place to every matching value in the frame.
action_codes = {'transfer': 1, 'cash-in': 2}
df.replace(to_replace=action_codes, inplace=True)

In [None]:
# Display the frame after the transaction types were replaced with numeric codes.
df

In [None]:
# For every column except the last two, overlay the normalised histograms of
# the fraud (isfraud == 1) and non-fraud (isfraud == 0) rows.
class_styles = ((1, 'red', 'fraud'), (0, 'blue', 'not fraud'))

for label in cols[:-2]:
  for flag, colour, legend_name in class_styles:
    column_values = df.loc[df['isfraud'] == flag, label]
    plt.hist(column_values, color=colour, label=legend_name, alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel('Probability')
  plt.xlabel('Label')
  plt.legend()
  plt.show()

In [None]:
# NOTE: this cell rewrites df['date'] and df['isfraud'] in place, so it is meant
# to be run once per kernel session (re-running it would re-encode the already
# encoded columns).

# Parse the date column and encode every date as a YYYYMMDD integer so it can be
# used as a numeric feature.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y%m%d').astype(int)

# Use readable class names as the target labels.
df['isfraud'] = df['isfraud'].map({0: 'no fraud', 1: 'fraud'})

# Shuffle with a fixed seed so the 60/20/20 train/valid/test split is
# reproducible, then slice positionally (np.split on a DataFrame is deprecated
# in recent pandas releases).
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]


In [None]:
def scale_dataset(dataframe, oversample=False):
  x = dataframe[dataframe.columns[:-2]].values
  y = dataframe[dataframe.columns[-2]].values

  scaler = StandardScaler()
  x = scaler.fit_transform(x)

  if oversample:
    ros = RandomOverSampler()
    x, y = ros.fit_resample(x, y)

  data =  np.hstack((x, np.reshape(y, (-1, 1))))


  return data, x, y

In [None]:
# if the difference is too big, whe need to take more of the less class to increase the size of the dataset
print(len(train[train['isfraud']=='fraud']))
print(len(train[train['isfraud']=='no fraud']))

In [None]:
train, x_train, y_train = scale_dataset(train, oversample=True)
valid, x_valid, y_valid = scale_dataset(valid, oversample=False)
test, x_test, y_test = scale_dataset(test, oversample=False)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline model: k-nearest neighbours with k = 5, trained on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
classifier

In [None]:
# Predict labels for the test features with the fitted classifier.
y_pred = classifier.predict(x_test)

In [None]:
# Per-class precision/recall/F1 followed by the raw confusion matrix,
# each printed on its own line(s).
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

In [None]:
# Try k = 1..29 and record the accuracy of each model on the validation split.
# The test split is deliberately kept out of model selection so that the final
# test metrics remain an unbiased estimate.
n = list(range(1,30))
accuracy = []
for i in n:
    classifier = KNeighborsClassifier(n_neighbors=i)
    classifier.fit(x_train, y_train)
    acc = accuracy_score(y_valid, classifier.predict(x_valid))
    accuracy.append(acc)
    

In [None]:
# Accuracy as a function of the number of neighbours; labelled so the figure
# can be read on its own.
plt.plot(n, accuracy)
plt.xlabel('Number of neighbours (k)')
plt.ylabel('Accuracy')
plt.title('KNN accuracy by number of neighbours')
plt.grid(True)

In [None]:
# Refit with the best-scoring neighbour count. `accuracy` is ordered by k
# starting at 1, so the position of the maximum plus one is the chosen k.
classifier_best = KNeighborsClassifier(
    n_neighbors=1 + accuracy.index(max(accuracy))
).fit(x_train, y_train)
y_pred = classifier_best.predict(x_test)

In [None]:
# Report the accuracy of the current test predictions together with the
# neighbour count that achieved the highest recorded score.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}, neighbours: {accuracy.index(max(accuracy))+1}')

In [None]:
# Per-class metrics and the raw confusion matrix for the current test predictions.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

In [None]:
import seaborn as sns

In [None]:
def plot_normalized_confusion_matrix(y_true, y_pred, classes):
    """Plot a row-normalized confusion matrix as an annotated heatmap.

    Each row is divided by its total, so a cell shows the fraction of the
    samples of that true class that received the given prediction.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        classes: Tick labels used on both axes. They are only used for
            display: scikit-learn orders the matrix rows/columns by sorted
            label value, so ``classes`` must be listed in that same order.
    """
    # Let scikit-learn do the row normalization: unlike a manual division, a
    # class with no true samples gives a row of zeros instead of NaN.
    cm_normalized = confusion_matrix(y_true, y_pred, normalize='true')

    plt.figure(figsize=(8, 6))
    # Note: sns.set changes seaborn's global settings, not just this figure.
    sns.set(font_scale=1.2)
    sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Normalized Confusion Matrix')
    plt.grid(True)
    plt.show()

plot_normalized_confusion_matrix(y_test, y_pred, classes=["fraud", "no fraud"])