In [None]:
import sys
import numpy
import pandas
import matplotlib
import seaborn
import scipy
import sklearn

# Report the interpreter and library versions this notebook runs against
# (each library is listed once).
print('Python: {}'.format(sys.version))
print('Numpy: {}'.format(numpy.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('matplotlib: {}'.format(matplotlib.__version__))
print('seaborn: {}'.format(seaborn.__version__))
print('Scipy: {}'.format(scipy.__version__))
print('Sklearn: {}'.format(sklearn.__version__))


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the credit card CSV file into a pandas DataFrame
csv_path = "C:/Data Science/creditcard.csv"
data = pd.read_csv(csv_path)

In [None]:
# Dataset exploration: list the column labels
column_labels = data.columns
print(column_labels)

In [None]:
# Dimensions of the loaded frame as reported by DataFrame.shape
frame_shape = data.shape
print(frame_shape)

In [None]:
# Summary statistics as produced by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)

In [None]:
#Optionally work on a 10% sample of the data; random_state=1 keeps the sample the same on every run (uncomment the lines below to enable)
#data = data.sample(frac=.10,random_state=1)
#print(data.shape)

In [None]:
# Histogram of each variable, drawn on a 20x20 figure and then rendered
hist_size = (20, 20)
data.hist(figsize=hist_size)
plt.show()

In [None]:
# Check the number of fraudulent and normal transactions
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]

# This value is passed to the detectors as `contamination`, which is the
# proportion of outliers in the whole data set. The denominator is therefore
# the total number of rows, not only the valid ones.
outlier_fraction = len(Fraud) / float(len(data))
print(outlier_fraction)
print('Fraud_Cases: {}'.format(len(Fraud)))
print('Valid_Cases: {}'.format(len(Valid)))

In [None]:
# Correlation heatmap: shows how the variables relate to each other and to Class (Fraud/Valid)
fig = plt.figure(figsize=(12, 9))
corrmat = data.corr()
sns.heatmap(data=corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# Name of the label column we will be predicting on
target = "Class"

# The detectors are unsupervised, so the label column is kept out of the
# feature list; every other column of the dataframe is used as an input.
columns = [name for name in data.columns.tolist() if name != target]

X = data[columns]
Y = data[target]

# Show the dimensions of the feature matrix and of the label vector
print(X.shape, Y.shape, sep="\n")

In [None]:
#We are now getting into the ML models. Since the fraud data is negligible, we are going to use the outlier-detection methods below
#IsolationForest returns an anomaly score (based on random trees - anomalies have shorter path lengths)
#LocalOutlierFactor is an unsupervised outlier detection method and gives us an anomaly score (based on neighbours)

from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


# Random seed, passed to IsolationForest
state = 1

# Outlier detection methods, keyed by display name. Both receive
# outlier_fraction as their contamination setting.
isolation_forest = IsolationForest(
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
)
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    contamination=outlier_fraction,
)
classifiers = {
    "Isolation Forest": isolation_forest,
    "Local Outlier Factor": local_outlier_factor,
}

In [None]:
# Fit each detector on X and report how well its outlier flags match Y.
# No figure is opened here: nothing in this cell draws a plot.
n_outliers = len(Fraud)


for clf_name, clf in classifiers.items():

    # fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        # LOF is fitted and queried in a single call on the same data
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    # The detectors label inliers 1 and outliers -1. Map that to the data
    # set's encoding (0 for valid, 1 for fraud) in one step, building a new
    # array instead of rewriting the predictions in place.
    y_pred = np.where(y_pred == -1, 1, 0)

    # Number of rows where the predicted label differs from the true one
    n_errors = (y_pred != Y).sum()

    # Run classification metrics
    print('{}: {}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))
            