In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

from plot_confusion_matrix import plot_confusion_matrix


ModuleNotFoundError: No module named 'plot_confusion_matrix'

In [None]:
# Load the transactions table. The path is relative, so 'data.csv' must be in the
# notebook's working directory. Later cells read the label from its 'Class' column.
data = pd.read_csv('data.csv')

In [None]:
# Show the first rows to sanity-check that the file parsed as expected.
data.head()

In [None]:
  data.describe().round(decimals=2)  # per-column summary statistics, rounded to 2 decimals

In [None]:
# Column labels of the loaded frame, followed by how many there are.
print('Columns :', data.columns.tolist())
print('Number of columns : ', data.shape[1])


In [None]:
# Split the rows by label: 'Class' == 0 is treated as genuine, 'Class' == 1 as fraud.
# Indexing a DataFrame with a boolean Series keeps only the rows where it is True.
# (Despite the `n_` prefix these are DataFrames, not counts; the names are kept so
# any other cell that refers to them keeps working.)
n_genuine = data[data['Class'] == 0]
n_fraud = data[data['Class'] == 1]

print('Number of Genuine Transactions :', len(n_genuine))
print('Number of Fraud Transactions:', len(n_fraud))

# Show the class balance as a pie chart.
plt.pie([len(n_genuine), len(n_fraud)], labels=['Genuine', 'Fraud'], radius=1)
# plt.show has to be *called*; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
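# The classes are highly imbalanced (see the counts printed above), so plain accuracy
# would be misleading; the model below is scored on recall instead.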


In [None]:
# Separate the features from the label. The label is selected by name ('Class'),
# as in the other cells, rather than by assuming it is the last column.
X, y = data.drop('Class', axis=1), data['Class']
X.head()


In [None]:
# Number of features to keep.
k = 10

# SelectKBest scores every column of X against y with the given score function and keeps
# the k highest-scoring ones. f_classif is the score function used here; which one suits
# best depends on the problem (mutual_info_classif is imported as an alternative).
# fit() returns the fitted selector, so it is chained straight onto the constructor.
# NOTE(review): the selector is fitted on all rows, before the train/test split further
# down, so the held-out rows influence feature choice. Confirm this is acceptable.
k_best = SelectKBest(score_func=f_classif, k=k).fit(X, y)
k_best

In [None]:
# Column names of X as an array, so they can be indexed with boolean masks.
all_features = np.array(X.columns.tolist())

# get_support() is True at the positions of the selected columns;
# its negation marks the rejected ones.
mask = k_best.get_support()
not_mask = ~mask

# Boolean-mask indexing picks out the names at the True positions.
best_features, bad_features = all_features[mask], all_features[not_mask]

print('Best Features :', best_features)
print('Bad Features :', bad_features)

In [None]:
# Drop the columns the selector rejected (X is a DataFrame; axis=1 means columns).
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
X = X.drop(bad_features, axis=1, errors='ignore')
X.head()

In [None]:
def plot_fraud_genuine(features, data, max_genuine=10000, genuine_step=100):
    """Overlay the fraud and genuine distributions of each feature, one subplot per feature.

    Parameters
    ----------
    features : iterable of str
        Column names of `data` to plot.
    data : pandas.DataFrame
        Must contain every column in `features` plus a 'Class' column
        (1 selects the fraud rows, 0 the genuine rows).
    max_genuine : int, default 10000
        The genuine rows are thinned only when there are more than this many.
    genuine_step : int, default 100
        When thinning, every `genuine_step`-th genuine row is kept.

    Returns
    -------
    None
        Draws onto a new 10x10-inch matplotlib figure.
    """
    features = list(features)
    n_cols = 5
    # Keep the 5x5 layout, but add rows instead of failing when more than 25 features are passed.
    n_rows = max(5, -(-len(features) // n_cols))

    # The row masks do not depend on the feature, so compute them once.
    is_fraud = data['Class'] == 1
    is_genuine = data['Class'] == 0

    plt.figure(figsize=(10, 10))
    plt.subplots_adjust(top=0.99, bottom=0.01, hspace=1.5, wspace=0.4)
    for plt_index, feature in enumerate(features, start=1):
        fraud = data.loc[is_fraud, feature]
        genuine = data.loc[is_genuine, feature]
        # Compare the row COUNT with the limit. `len(genuine > 10000)` measured a boolean
        # frame instead, which is truthy for any non-empty data, so thinning always happened.
        if len(genuine) > max_genuine:
            genuine = genuine[::genuine_step]
        plt.subplot(n_rows, n_cols, plt_index)
        # NOTE(review): distplot is deprecated in recent seaborn releases. Consider
        # histplot/kdeplot once the installed seaborn version is confirmed.
        sns.distplot(fraud)
        sns.distplot(genuine)
        plt.title(feature)
        

In [None]:
# Compare the fraud and genuine distributions for each of the selected features.
plot_fraud_genuine(best_features,data)

In [None]:
# Hold out 20% of the rows as the test set.
# stratify=y keeps the class ratio the same in both splits, which matters because the
# classes are highly imbalanced. random_state fixes the shuffle so a
# Restart & Run All reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)



In [None]:
# Gaussian naive Bayes, scored on recall with 10-fold cross-validation over the training
# split. Training scores and the fitted per-fold estimators are kept so that the following
# cells can inspect them.
nb = GaussianNB()
cv_results = cross_validate(
    nb,
    x_train,
    y_train,
    scoring='recall',
    cv=10,
    return_estimator=True,
    return_train_score=True,
)
print(cv_results)

In [None]:
# Per-fold recall: 'train_score' is measured on the data each estimator was fitted on,
# 'test_score' on the fold that was held out from it.
print('Training scores from each fold:', cv_results['train_score'])
print('Validation scores from each fold:', cv_results['test_score'])
# Choose the estimator by held-out recall. Ranking by training recall would reward
# the fold that fits its own training data best, not the one that generalises best.
max_score_index = np.argmax(cv_results['test_score'])
print(max_score_index)
best_estimator = cv_results['estimator'][max_score_index]

In [None]:
def display_results(estimator, x, y):
    """Print a classification report and plot the confusion matrix of `estimator` on (x, y).

    Parameters
    ----------
    estimator : object with a ``predict(x)`` method
        The fitted model to evaluate.
    x : features passed to ``estimator.predict``.
    y : true labels compared against the predictions.

    Returns
    -------
    None
        The report is printed and the matrix is drawn by ``plot_confusion_matrix``.
    """
    y_pred = estimator.predict(x)
    print(classification_report(y, y_pred))
    plot_confusion_matrix(
        confusion_matrix(y, y_pred),
        classes=['Genuine', 'Fraud'],
        title='Fraud Detection',
    )

In [None]:
# Evaluate the chosen estimator on the held-out test split:
# pass the test features first, then the matching test labels.
display_results(best_estimator,x_test,y_test)

In [None]:
# Optional: the same report on the training split, to compare against the test results above.
display_results(best_estimator, x_train , y_train)