In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,accuracy_score,mean_absolute_error,mean_squared_error, confusion_matrix,roc_auc_score,precision_recall_fscore_support
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import numpy as np
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
   
    BatchNormalization,
)
from tensorflow.python.keras.layers import Dense,Conv1D, Flatten,Conv2D,Dropout,MaxPool2D,MaxPool1D
from tensorflow.keras import Sequential
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
# percentage of the test size xomparing to the whole dataset
test_set_size = 0.1 
#set to 1 to over-sample the minority class
oversampling_flag = 0 
#percentage of the minority class after the oversampling comparing to majority class
oversampling_percentage = 0.2

In [3]:
#Definition of fxns

data = pd.read_csv("./Data_clean/min_max_both_inter_scaled.csv")

In [4]:
y = data.FLAG

In [5]:
X = data.drop(["FLAG","CONS_NO"],axis=1)

In [6]:
y[y == 0]

3579     0
3580     0
3581     0
3582     0
3583     0
        ..
40251    0
40252    0
40253    0
40254    0
40255    0
Name: FLAG, Length: 36677, dtype: int64

In [7]:
print(f"Normal Consumers: {len(y[y == 0])}")

Normal Consumers: 36677


In [8]:
print(f"Consumers with Fraud: {len(y[y == 1])}")

Consumers with Fraud: 3579


In [9]:
print(f"Total Consumers: {len(y)}")

Total Consumers: 40256


In [10]:
print(f"Classification assuming no fraud: {len(y[y == 0])/len(y)*100:.2f}")

Classification assuming no fraud: 91.11


In [11]:
X.columns = pd.to_datetime(X.columns)
X = X.reindex(X.columns,axis=1)

In [12]:
X

Unnamed: 0,2014-01-01,2014-01-02,2014-01-03,2014-01-04,2014-01-05,2014-01-06,2014-01-07,2014-01-08,2014-01-09,2014-01-10,...,2016-10-22,2016-10-23,2016-10-24,2016-10-25,2016-10-26,2016-10-27,2016-10-28,2016-10-29,2016-10-30,2016-10-31
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.264158,0.296902,0.297637,0.350616,0.201613,0.321919,0.342154,0.277402,0.337003,0.247970
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333740,0.547089,0.543432,0.527583,0.653459,0.581835,0.565681,0.495276,0.432795,0.416336
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.242798,0.286581,0.327142,0.282600,0.371304,0.394049,0.340220,0.365049,0.274071,0.222138
3,0.048145,0.112494,0.144199,0.058008,0.064819,0.105683,0.091123,0.066463,0.062940,0.060357,...,0.313762,0.220056,0.178957,0.123532,0.123297,0.138093,0.156646,0.235791,0.211602,0.184829
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.147734,0.174809,0.172454,0.043555,0.024132,0.024720,0.112419,0.024720,0.022366,0.035903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40251,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.288360,0.273369,0.242504,0.265432,0.263668,0.249559,0.223986,0.299824,0.316578,0.223986
40252,0.164334,0.000000,0.000000,0.348144,0.368229,0.353621,0.186853,0.245892,0.345709,0.267194,...,0.233719,0.402921,0.189897,0.314060,0.220329,0.282410,0.225806,0.378576,0.368229,0.290323
40253,0.048013,0.096026,0.076159,0.081126,0.127483,0.114238,0.073675,0.057947,0.101821,0.069536,...,0.081954,0.050497,0.053808,0.045530,0.040563,0.042219,0.065397,0.054636,0.032285,0.053808
40254,0.349472,0.313470,0.398924,0.355680,0.347610,0.361680,0.369543,0.496379,0.255328,0.286365,...,0.323609,0.340989,0.269812,0.214980,0.248293,0.230706,0.252845,0.272295,0.275812,0.214980


In [13]:
X = np.array(X)
y = np.array(y)

In [14]:
#oversampling of minority class (imbalanced learning)
over = SMOTE(sampling_strategy=0.2,random_state=0)
over_x_train,over_y_train = over.fit_resample(X,y)

In [15]:
len(over_y_train[over_y_train==1])

7335

In [16]:
print(f"Normal Consumers: {len(over_y_train[over_y_train == 0])}")
print(f"Consumers with Fraud: {len(over_y_train[over_y_train == 1])}")
print(f"Total Consumers: {len(over_y_train)}")
print(f"Classification assuming no fraud: {len(over_y_train[over_y_train == 0])/len(over_y_train)*100:.2f}")


Normal Consumers: 36677
Consumers with Fraud: 7335
Total Consumers: 44012
Classification assuming no fraud: 83.33


In [17]:
def results(y_test,prediction):
    print("Accuracy: ",100*accuracy_score(y_test,prediction))
    print("RMSE: ",mean_squared_error(y_test,prediction))
    print("MAE: ", mean_absolute_error(y_test,prediction))
    print("F1: ",precision_recall_fscore_support(y_test,prediction))
    print("Classification report", classification_report(y_test,prediction))
    print("AUC: ", 100*roc_auc_score(y_test,prediction))
    print(confusion_matrix(y_test,prediction), "\n")

In [18]:
def SVM(X_train,X_test,y_train,y_test):
    print("SVM:: ")
    model = SVC(random_state=0)
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    results(y_test,prediction)

In [19]:
def RF(x_train,x_test,y_train,y_test):
    print("Random Forest: ")
    
    model = RandomForestClassifier(n_estimators=100,min_samples_leaf=1, max_features= "auto", random_state=0,n_jobs=-1,max_depth=10)
    model.fit(x_train,y_train)
    prediction = model.predict(x_test)
    results(y_test,prediction)

In [20]:
def DT(x_train,x_test,y_train,y_test):
    print("Descision Tree:")
    model = DecisionTreeClassifier(random_state=0)
    model.fit(x_train,y_train)
    prediction = model.predict(x_test)
    results(y_test,prediction)

In [21]:
def LR(X_train, X_test, y_train, y_test):
    print('Logistic Regression:')
    '''
    # Parameters selection 
    param_grid = {'C': [0.1,1,10,100],'solver': ['newton-cg', 'lbfgs']}
    grid = GridSearchCV(LogisticRegression(max_iter=1000,random_state=0), param_grid=param_grid, n_jobs=-1)
    grid.fit(X_train, y_train)
    df = pd.DataFrame(grid.cv_results_)
    print(df[['param_C', 'param_solver', 'mean_test_score', 'rank_test_score']])
    '''
    model = LogisticRegression(C=1000, max_iter=1000, n_jobs=-1, solver='newton-cg')
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    results(y_test, prediction)


In [22]:
def CNN1D(X_train, X_test, y_train, y_test):
    print('1D - Convolutional Neural Network:')

    # Transforming the dataset into tensors
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    y_test = keras.utils.to_categorical(y_test,num_classes=2)
    y_train = keras.utils.to_categorical(y_train,num_classes=2)

    # Model creation
    model = Sequential()
    model.add(Conv1D(100, kernel_size=7, input_shape=(1034, 1), activation='relu'))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])

    # model.fit(X_train, y_train, epochs=1, validation_split=0.1, shuffle=False, verbose=1)
    model.fit(X_train, y_train, epochs=20, validation_split=0, shuffle=False, verbose=1)
    prediction = model.predict(X_test)
    classes_x=np.argmax(prediction,axis=1)
    model.summary()
    results(y_test, classes_x)


In [30]:
def ANN(X_train, X_test, y_train, y_test):
    print('Artificial Neural Network:')
    # for i in range(4,100,3):
    #     print("Epoch:",i)
        # Transforming the dataset into tensors
  
    # Model creation
    model = Sequential()
    model.add(Dense(1000, input_dim=1034, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])

    # model.fit(X_train, y_train, validation_split=0, epochs=i, shuffle=True, verbose=0)
    model.fit(X_train, y_train, validation_split=0, epochs=20, shuffle=True, verbose=1)
    prediction = model.predict(X_test)
    classes_x=np.argmax(prediction,axis=1)
    model.summary()
    results(y_test, classes_x)


In [26]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
ANN(X_train, X_test, y_train, y_test)

Artificial Neural Network:
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper_12 (ModuleWr  (None, 1000)             1035000   
 apper)                                                          
                                                                 
 module_wrapper_13 (ModuleWr  (None, 100)              100100    
 apper)                                                          
                                                                 
 module_wrapper_14 (ModuleWr  (None, 100)              10100     
 apper)                                                          
                                                            

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
CNN1D(X_train, X_test, y_train, y_test)

1D - Convolutional Neural Network:
Epoch 1/20


In [None]:
RF(X_train, X_test, y_train, y_test)


In [None]:
LR(X_train, X_test, y_train, y_test)

In [None]:
DT(X_train, X_test, y_train, y_test)

In [30]:
SVM(X_train, X_test, y_train, y_test)

Artificial Neural Network:
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(over_x_train, over_y_train, test_size=0.2, random_state=0)

In [None]:
ANN(X_train, X_test, y_train, y_test)
CNN1D(X_train, X_test, y_train, y_test)
RF(X_train, X_test, y_train, y_test)
LR(X_train, X_test, y_train, y_test)
DT(X_train, X_test, y_train, y_test)
SVM(X_train, X_test, y_train, y_test)
