In [None]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
#import tensorflow as t
#from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
#Load the customer records from the CSV file into a DataFrame
df = pd.read_csv('Customers_File.csv', sep=',')
#Preview the first ten rows to sanity-check the load
df.head(10)

In [None]:
#Data Pre-processing
#Normalise the name columns: trim surrounding whitespace and lower-case them
#so that the same name always yields the same token later on.
for name_col in ('First_Name', 'Last_Name'):
    df[name_col] = df[name_col].str.strip().str.lower()

#Converting columns into Lists for the tokenizer
first_name = df['First_Name'].tolist()
last_name = df['Last_Name'].tolist()

#DOB is split on "-" into DD, MM and YYYY integer columns for the ML algo
df[['DD', 'MM', 'YYYY']] = df['DOB'].str.split('-', expand=True).astype(int)

#The raw date columns are not used as features, so drop them
df = df.drop(columns=['DOB', 'Date_of_joining', 'Date_of_exit'])

#Encode the target label as an integer (1 = fraud, 0 = not fraud)
fraud = {
    'Fraud': 1,
    'Not Fraud': 0,
}

#Map explicitly instead of relying on DataFrame.replace to downcast the
#object column to integers (that implicit downcast is deprecated in recent
#pandas). Values that are not keys of `fraud` are passed through unchanged and
#pd.to_numeric then raises if anything non-numeric is left.
df['Fraud'] = pd.to_numeric(df['Fraud'].map(lambda label: fraud.get(label, label)))
df.head(10)

In [None]:
def _fit_word_index(names):
    """Fit a Keras Tokenizer on `names` and return (tokenizer, word_index).

    NOTE(review): per the Keras documentation `num_words` only limits the
    output of texts_to_sequences; `word_index` still appears to contain every
    word seen -- confirm that num_words=2 is intentional.
    """
    tokenizer = Tokenizer(num_words=2, lower=False, oov_token=None)
    tokenizer.fit_on_texts(names)
    return tokenizer, tokenizer.word_index


#Tokenizing First Name
tk_fn, wi_fn = _fit_word_index(first_name)

#Tokenizing Last Name
tk_ln, wi_ln = _fit_word_index(last_name)

#Reflecting changes to the Dataframe
#Each name is swapped for its integer id; names missing from the index are
#left untouched (same outcome as DataFrame.replace, without depending on its
#deprecated implicit downcasting).
df['First_Name'] = df['First_Name'].map(lambda name: wi_fn.get(name, name))
df['Last_Name'] = df['Last_Name'].map(lambda name: wi_ln.get(name, name))

#Customer_ID is not used as a feature
df = df.drop(columns='Customer_ID')

In [None]:
#Preview the first ten rows of the current dataframe
df.head(10)

In [None]:
#Shuffle the rows of the dataframe.
#A fixed random_state makes the row order (and therefore everything computed
#from it further down) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=4)
#Show only a preview instead of dumping the whole frame
df.head(10)

In [None]:
#Machine Learning Starts
#Imports
import itertools
import matplotlib.pyplot as plt 
import seaborn as sns 
from matplotlib import gridspec
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
#%matplotlib inline

In [None]:
#Separate the feature matrix (X) from the target vector (y)
feature_columns = ['First_Name', 'Last_Name', 'DD', 'MM', 'YYYY']
X = df.loc[:, feature_columns].values
y = df.loc[:, 'Fraud'].values

In [None]:
#Correlation Matrix
#Restrict to numeric/bool columns first: DataFrame.corr() raises on
#non-numeric columns in pandas >= 2.0, whereas older versions silently
#skipped them. Selecting explicitly behaves the same on both.
corrmat = df.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
#Train Test Split
from sklearn.model_selection import train_test_split

#Hold out 20% of the rows for testing; random_state pins the split for a
#given X and y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = split_parts

#Report the shapes of both partitions
for set_label, features, targets in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(set_label, features.shape, targets.shape)

In [None]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

#Entropy criterion with depth capped at 4. random_state is fixed because the
#tree builder permutes features at each split, so without a seed the fitted
#tree can differ between runs.
FDTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=4)

#fit() returns the estimator, so the cell output shows its parameters
FDTree.fit(X_train, y_train)

In [None]:
#Prediction
predTree = FDTree.predict(X_test)

#Print the first five predictions, then the first five true labels
for preview in (predTree, y_test):
    print(preview[:5])

In [None]:
#Evaluation
#Score the decision-tree predictions against the held-out labels and print
#each metric.
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

print("The model used is Decision Tree classifier")

acc = accuracy_score(y_test, predTree)
print("The accuracy is {}".format(acc))

#NOTE(review): precision was disabled in the original notebook; the reason is
#not visible here, so it is left disabled.
#prec = precision_score(y_test, predTree)
#print("The precision is {}".format(prec))

rec = recall_score(y_test, predTree)
print("The recall is {}".format(rec))

f1 = f1_score(y_test, predTree)
print("The F1-Score is {}".format(f1))

MCC = matthews_corrcoef(y_test, predTree)
#Fixed: a space was missing between "is" and the value
print("The Matthews correlation coefficient is {}".format(MCC))

In [None]:
# printing the confusion matrix
LABELS = ['Normal', 'Fraud']
#Pin the class order to [0, 1] (0 = not fraud, 1 = fraud) so the matrix is
#always 2x2 and lines up with LABELS, even if one class is missing from the
#test split or from the predictions.
conf_matrix = confusion_matrix(y_test, predTree, labels=[0, 1])
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS,
            yticklabels=LABELS, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion matrix")
ax.set_ylabel('True class')
ax.set_xlabel('Predicted class')
plt.show()

In [None]:
import pickle
#sklearn.externals.joblib was removed from scikit-learn (0.23+); use the
#standalone joblib package and fall back to the old location only where the
#standalone package is unavailable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#Persist the fitted tree and both name->id dictionaries for later reuse.
#joblib files are pickle-based: only load them back from trusted locations.
joblib.dump(FDTree, './FDTree.joblib', compress=True)
joblib.dump(wi_fn, './wi_fn.joblib', compress=True)
joblib.dump(wi_ln, './wi_ln.joblib', compress=True)