In [1]:
# Environment notice for readers: the message recommends running the notebook in Colab instead of Kaggle.
# NOTE(review): the message does not say which library is incompatible — presumably pycaret, whose import fails below; confirm.
print("This library is not compatible with Kaggle notebook, you try to download and run them in Colab")

This library is not compatible with Kaggle notebook, you try to download and run them in Colab


## Problem Statement:

The challenging part of the problem is that the data contains very few fraud instances in comparison to the overall population. To give more edge to the solution they have also collected data regarding location [geo_scores] of the transactions, their own proprietary index [Lambda_wts], on network turn around times [Qset_tats] and vulnerability qualification score [instance_scores]. As of now you don't need to understand what they mean.
The training data contains masked variables pertaining to each transaction id. Your prediction target here is 'Target'.

1: Fraudulent transactions


0: Clean transactions


In [2]:
# !pip install pandas_profiling
# !pip install pycaret

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

from imblearn.over_sampling import RandomOverSampler
from numba import jit, cuda
from pandas_profiling import ProfileReport
from pycaret.classification import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols

sns.set()
%matplotlib inline

ModuleNotFoundError: No module named 'pycaret'

In [None]:
# Reading all the given files as dataframes
# Geo_scores.csv is loaded only for inspection; location_df is not referenced again in this notebook.
# NOTE(review): the path is anchored at the filesystem root ('/') — adjust it to wherever the CSV actually lives.

location_df = pd.read_csv('/Geo_scores.csv')
location_df.head()

In [None]:
# Lambda_wts.csy is the proprietary-index file from the problem statement; loaded for inspection only
# (propindex_df is not referenced again in this notebook).
propindex_df = pd.read_csv('/Lambda_wts.csv')
propindex_df.head()

In [None]:
# Qset_tats.csv holds the network turn-around times from the problem statement; loaded for inspection only
# (nettat_df is not referenced again in this notebook).
nettat_df = pd.read_csv('/Qset_tats.csv')
nettat_df.head()

In [None]:
# instance_scores.csv holds the vulnerability qualification scores from the problem statement; loaded for
# inspection only (vqs_df is not referenced again in this notebook).
vqs_df = pd.read_csv('/instance_scores.csv')
vqs_df.head()

In [None]:
# Reading the training file
# dataset_df is the labelled data (it carries the 'Target' column) that is later split into train and validation parts.

dataset_df = pd.read_csv('/train.csv')
dataset_df.head()

In [None]:
# Reading the testing file
# test_df is the hold-out file that the final model is asked to predict at the end of the notebook.

test_df = pd.read_csv('/test_share.csv')
test_df.head()

### Observation: the data in the additional CSV files is not present in the test dataset, so it is not combined with the training dataset.

In [None]:
# Checking for missing values in the training dataset (info() lists the non-null count of every column)

dataset_df.info()

In [None]:
# Checking for missing values in the test dataset (info() lists the non-null count of every column)

test_df.info()

#### There are no missing values in training and test datasets

In [None]:
# Checking for duplicated rows in training dataset

dataset_df.duplicated().sum()

In [None]:
# Checking for duplicated rows in test dataset

test_df.duplicated().sum()

#### There are no duplicate entries in training and test datasets

In [None]:
# Splitting the labelled data into train (70%) and validation (30%) parts.
# stratify keeps the share of each 'Target' class the same in both parts; random_state makes the split repeatable.

train_df, validation_df = train_test_split(dataset_df, train_size = 0.7, random_state = 1, stratify = dataset_df['Target'])

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
validation_df.shape

## EDA

In [None]:
# Visualising the data
# Plotting every row takes a long time, and because of the class imbalance the fraud points are hard to see

def viz1():
    """Scatter-plot every numeric column of ``train_df`` against the row index.

    Points are coloured by the ``Target`` label (0 -> blue, 1 -> yellow).
    The grid has three panels per row and as many rows as the number of
    numeric columns requires; unused panels are hidden.

    No Numba decorator is applied: the body is pandas/matplotlib calls,
    which Numba cannot compile.
    """
    numeric_cols = train_df.select_dtypes(include=np.number)
    column_names = list(numeric_cols.columns)

    plot_cols = 3
    # Ceiling division so the grid always fits the columns that actually exist.
    plot_rows = max(1, -(-len(column_names) // plot_cols))

    # The label column is 'Target' (the same column the split was stratified on).
    point_colours = np.array(['b', 'y'])[train_df['Target'].to_numpy()]

    fig, ax = plt.subplots(nrows=plot_rows, ncols=plot_cols, figsize=(20, 20), squeeze=False)
    panels = ax.ravel()

    for panel, column_name in zip(panels, column_names):
        panel.scatter(x=train_df.index, y=train_df[column_name], c=point_colours)
        panel.set_ylabel(column_name)

    # Hide the panels left over when the column count is not a multiple of plot_cols.
    for panel in panels[len(column_names):]:
        panel.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
viz1()

In [None]:
# Visualising the data
# Plotting all fraud rows against a small sample of genuine rows so the significant variables stand out

numeric_cols = train_df.select_dtypes(include = np.number)

genuine_trans = numeric_cols.loc[numeric_cols['Target'] == 0, :]
fraud_trans = numeric_cols.loc[numeric_cols['Target'] == 1, :]

# Seeded so the figure is reproducible; n is capped because sample(replace=False)
# raises when asked for more rows than exist.
genuine_trans_sub = genuine_trans.sample(n = min(1000, len(genuine_trans)), replace = False, random_state = 1)

# Mix both classes and shuffle so neither class is drawn on top of the other.
new_df = pd.concat([fraud_trans, genuine_trans_sub], axis = 0)
new_df = new_df.sample(frac = 1, random_state = 1)
new_df.reset_index(drop = True, inplace = True)

column_names = list(numeric_cols.columns)

plot_cols = 3
# Ceiling division: the grid size follows the number of numeric columns instead of a hard-coded row count.
plot_rows = max(1, -(-len(column_names) // plot_cols))

fig, ax = plt.subplots(nrows = plot_rows, ncols = plot_cols, figsize = (20,20), squeeze = False)

colormap = np.array(['b','y'])
point_colours = colormap[new_df['Target'].to_numpy()]

panels = ax.ravel()
for panel, column_name in zip(panels, column_names):
    panel.scatter(y = new_df[column_name], x = new_df.index, c = point_colours)
    panel.set_ylabel(column_name)

# Hide panels that have no column to show.
for panel in panels[len(column_names):]:
    panel.set_visible(False)

### From the above plots we can see that
* Columns Per1, Per2, Per3, Per4, Per5, Per6, Per7, Per9, Der1, Der4, Der5 and Cred5 seem to be significant
* We also see a few outliers throughout the dataset
* There seems to be skewness throughout


### Advanced EDA - Using Pandas Profiling

In [None]:
# Build a pandas-profiling report for the training split and write it out as a standalone HTML file.
# NOTE(review): the output path is anchored at the filesystem root ('/') — confirm that location is writable.
prof = ProfileReport(train_df)
prof.to_file(output_file = "/EDA - Fraudulant ATM Transactions.html")

In [None]:
def pp():
  """Draw a seaborn pairplot of ``train_df``.

  No Numba decorator is applied: the body is a single seaborn call,
  which Numba cannot compile or move to a GPU.
  """
  sns.pairplot(train_df)

pp()

#### Pandas Profiling and pairplot confirms the above observations
#### Additionally we can observe the below
* There is no correlation greater than 75% in the dataset
* There is no high autocorrelation

In [None]:
# Encoding
# 'Group' is the only object-typed column, so it is the only one that needs encoding

# Cast to a categorical and keep just the integer code of each category
group_as_category = train_df['Group'].astype('category')
train_df['Group'] = group_as_category.cat.codes

In [None]:
train_df.info()

In [None]:
# Checking for unique values

train_df['Group'].nunique()


In [None]:
train_df['Group'].iloc[0]

In [None]:
train_df['Group'].iloc[1]

In [None]:
# Checking whether Group is a significant variable or not
# check p-value
# Fits an OLS model of Target on Group and prints its type-2 ANOVA table.
# NOTE(review): Group holds integer category codes at this point and the formula has no C(...) wrapper,
# so it enters the model as a single numeric regressor rather than as a categorical factor — confirm this is intended.

model = ols('Target ~ Group', data = train_df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
print(anova_result)

#### Since the p-value is greater than 0.05, the Group feature is not significant

#### We can also see that the column 'id' is insignificant

In [None]:
# Dropping the insignificant features
# Keep a copy of the first two columns before they are dropped from train_df.
# NOTE(review): the slice is positional — it assumes 'id' and 'Group' are the first two columns; confirm.

insignificant_train_df = train_df.iloc[:,0:2]
insignificant_train_df.head()

In [None]:
# Remove the identifier and group columns from the training split (mutates train_df in place)
columns_to_remove = ['id','Group']
train_df.drop(labels = columns_to_remove, axis = 1, inplace = True)
train_df.head()

In [None]:
# Keep a copy of the first two columns of the validation split before they are dropped.
# NOTE(review): positional slice — assumes 'id' and 'Group' are the first two columns; confirm.
insignificant_val_df = validation_df.iloc[:,0:2]
insignificant_val_df.head()

In [None]:
# Remove the identifier and group columns from the validation split (mutates validation_df in place)
columns_to_remove = ['id','Group']
validation_df.drop(labels = columns_to_remove, axis = 1, inplace = True)
validation_df.head()

In [None]:
# Keep a copy of the first two columns of the test data before they are dropped.
# NOTE(review): positional slice — assumes 'id' and 'Group' are the first two columns; confirm.
insignificant_test_df = test_df.iloc[:,0:2]
insignificant_test_df.head()

In [None]:
# Remove the identifier and group columns from the test data (mutates test_df in place)
columns_to_remove = ['id','Group']
test_df.drop(labels = columns_to_remove, axis = 1, inplace = True)
test_df.head()

In [None]:
# Checking for data imbalance in training dataset

train_df['Target'].value_counts()

In [None]:
# Checking percentage of minority class
# Computed from the data rather than from hand-copied counts, so it stays correct if the split changes.
# The mean of the boolean mask (Target == 1) is the fraction of fraud rows.

train_minority_pct = train_df['Target'].eq(1).mean() * 100
print(train_minority_pct,"%")

In [None]:
# Checking for data imbalance in validation dataset

validation_df['Target'].value_counts()

In [None]:
# Checking percentage of minority class
# Computed from the data rather than from hand-copied counts, so it stays correct if the split changes.
# The mean of the boolean mask (Target == 1) is the fraction of fraud rows.

validation_minority_pct = validation_df['Target'].eq(1).mean() * 100
print(validation_minority_pct,"%")

#### We can see a clear data imbalance
* Minority class is only 0.17% in train and validation datasets (stratified)

In [None]:
# Balancing the dataset by oversampling (Random Oversampler technique)
# Splitting x and y (DV and IDV) first
# The target is selected by name, so the split does not depend on 'Target' being the last column.

train_features_df = train_df.drop(columns = 'Target')
train_target_df = train_df['Target']

validation_features_df = validation_df.drop(columns = 'Target')
validation_target_df = validation_df['Target']

In [None]:
train_features_df.columns

In [None]:
train_target_df.head()

In [None]:
validation_features_df.columns

In [None]:
validation_target_df.head()

In [None]:
print(train_features_df.shape, validation_features_df.shape, train_target_df.shape, validation_target_df.shape)

#### The minority class is 0.17% both before and after the train/validation split

In [None]:
# Oversampling of train data using Random Over Sampling Technique
# random_state is fixed so the duplicated minority rows (and every downstream score) are reproducible.

over = RandomOverSampler(random_state = 1)
ovrsam_train_features, ovrsam_train_target = over.fit_resample(train_features_df, train_target_df)
# NOTE(review): the validation split is resampled here as well, but later validation cells use
# validation_features_df / validation_target_df directly; the ovrsam_val_* frames are only printed.
ovrsam_val_features, ovrsam_val_target = over.fit_resample(validation_features_df, validation_target_df)

In [None]:
# After oversampling

print("Shape : ", ovrsam_train_features.shape, ovrsam_train_target.shape, ovrsam_val_features.shape, ovrsam_val_target.shape)
print("Value counts : ", ovrsam_train_target.value_counts(), ovrsam_val_target.value_counts())

#### Data imbalance is cleared

### Using pycaret to find the best model (without any data preprocessing)

In [None]:
# Initialise a PyCaret classification experiment on the oversampled (otherwise untreated) training data.
# PCA is requested with pca_components = 0.95 and session_id pins the experiment's seed.
# NOTE(review): passing the target as a separate Series (not a column name) depends on the installed PyCaret version — confirm.
raw_clf = setup(data = ovrsam_train_features, target = ovrsam_train_target, use_gpu = True, pca = True, pca_components = 0.95, session_id = 111)

In [None]:
compare_models()

### Without any data preprocessing, the Decision Tree and Random Forest models seem to have the highest train accuracy after cross-validation.

### Iterating after outlier treatment to see if we get a better result

In [None]:
# Checking for outliers in train data
# Creating a new dataframe to hold the outlier treated dataset
# .copy() is required: a plain assignment only creates a second name for the same object, so the
# capping done on outltrtd_train_features would silently modify ovrsam_train_features too and the
# later "with outliers" experiment would run on capped data.

outltrtd_train_features = ovrsam_train_features.copy()
outltrtd_train_target = ovrsam_train_target.copy()

In [None]:
outltrtd_train_features.head()

In [None]:
outltrtd_train_features.info()

In [None]:
# One horizontal boxplot per feature. The series is passed as x= explicitly because the meaning of
# the first positional argument of sns.boxplot differs between seaborn versions.
for i in outltrtd_train_features.columns:
  sns.boxplot(x = outltrtd_train_features[i])
  plt.show()

In [None]:
# One distribution plot per feature. sns.distplot is deprecated; histplot with a KDE overlay on a
# density scale is its documented replacement for this use.
for i in outltrtd_train_features.columns:
  sns.histplot(outltrtd_train_features[i], kde = True, stat = "density")
  plt.show()

### We see outliers in almost all columns
* This creates skewness of data
* Treating the outliers to overcome skewness

In [None]:
# Using IQR capping method: every value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is pulled back to the nearest fence

for feature_name in outltrtd_train_features.columns:
  feature_values = outltrtd_train_features[feature_name]
  first_quartile = feature_values.quantile(0.25)
  third_quartile = feature_values.quantile(0.75)
  fence_width = 1.5 * (third_quartile - first_quartile)

  # Clip on the raw array so the column is replaced by a plain ndarray
  outltrtd_train_features[feature_name] = np.clip(feature_values.to_numpy(), first_quartile - fence_width, third_quartile + fence_width)

In [None]:
# One horizontal boxplot per feature after capping. The series is passed as x= explicitly because the
# meaning of the first positional argument of sns.boxplot differs between seaborn versions.
for i in outltrtd_train_features.columns:
  sns.boxplot(x = outltrtd_train_features[i])
  plt.show()

### We can observe that all outliers are treated and within 1.5 times the IQR

In [None]:
# Iterating with the outlier treated data to see if there is any improvement
# Same setup arguments as the first experiment; only the feature frame differs.

outltrtd_clf = setup(data = outltrtd_train_features, target = outltrtd_train_target, use_gpu = True, pca = True, pca_components = 0.95, session_id = 111)

In [None]:
compare_models()

### There is slight change in accuracy and speed throughout different models

In [None]:
# Iterating after feature scalling - Trying with outliers and outlier treated dataset

# Feature scalling with outliers

ovrsam_train_features.describe()

In [None]:
# Standardise the oversampled training features.
# Wrapping the returned array in a new DataFrame gives default 0..n-1 row and column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(ovrsam_train_features)
ovrsamscl_train_features = pd.DataFrame(data = scaled_values)
ovrsamscl_train_features.head()

In [None]:
# Experiment on the standardised, oversampled training features; same setup arguments as the earlier experiments.
ovrsamscld_clf = setup(data = ovrsamscl_train_features, target = ovrsam_train_target, use_gpu = True, pca = True, pca_components = 0.95, session_id = 111)

In [None]:
compare_models()

In [None]:
# Iterating after feature scalling - Trying with outliers and outlier treated dataset

# Feature scalling after outlier treatment

outltrtd_train_features.describe()

In [None]:
# Re-fits the shared `scaler` object on the outlier-treated training features (overwriting its earlier fit)
# and wraps the result in a new DataFrame, which gives default 0..n-1 row and column labels.
outltrtdscl_train_features = pd.DataFrame(scaler.fit_transform(outltrtd_train_features))
outltrtdscl_train_features.head()

In [None]:
# Experiment on the outlier-treated and standardised training features; this is the last setup() call in the notebook.
outltrtdscl_clf = setup(data = outltrtdscl_train_features, target = ovrsam_train_target, use_gpu = True, pca = True, pca_components = 0.95, session_id = 111)

In [None]:
compare_models()

### After all the comparisons we can observe the below

* We get better results after feature scaling
* Outlier treatment doesn't seem to affect the accuracy significantly
* After oversampling and 10-fold cross-validation we get very good accuracies (more than 99%) in the below models (with or without outlier treatment)
  * Gradient Boosting Classifier - 99.36%
  * KNN Classifier - 99.94%
  * Decision Tree Classifier - 99.96%
  * Light GBM - 99.98%
  * XG Boosting - 99.99%
  * Random Forest Classifier - 99.99%
  * Extra Trees Classifier - 100%

### Using the Random Forest model (to avoid overfitting problem of 100% with Extra Trees Classifier)

In [None]:
# Creating the model
# 'rf' is PyCaret's identifier for the Random Forest classifier.
# NOTE(review): presumably the model is trained on the most recent setup() experiment
# (outlier-treated + scaled features) — confirm against the PyCaret version in use.

rf = create_model('rf')

In [None]:
# Hyperparameter tuning

def tune():
  """Tune the hyperparameters of ``rf`` with PyCaret's ``tune_model``.

  Returns:
    The object returned by ``tune_model(rf)``, so the tuned model can be
    compared with (or used instead of) the default one.

  No Numba decorator is applied: the body is a single PyCaret call,
  which Numba cannot compile or move to a GPU.
  """
  return tune_model(rf)

# Keep the tuned model; previously it was a local variable that was lost when the function returned.
tuned_rf = tune()

### We see that the default model is better than the tuned model.

### Evaluating Random Forest Classifier model

In [None]:
evaluate_model(rf)

### Defining the predictive model as Random Forest Classifier

In [None]:
predict_model(rf)

### We can see very good results

## Validation of the predictive model

### Doing the same preprocessing as training dataset

In [None]:
validation_df.head()

In [None]:
validation_features_df.head()

In [None]:
validation_target_df.head()

In [None]:
# Work on copies so the capping applied to outltrtd_val_features does not also modify
# validation_features_df (a plain assignment would only create a second name for the same object).
outltrtd_val_features = validation_features_df.copy()
outltrtd_val_target = validation_target_df.copy()

outltrtd_val_features.head()

In [None]:
# Using IQR capping method
# The fences are taken from the TRAINING features, so validation rows receive exactly the transformation
# the model was trained with. Recomputing quartiles on the validation split would apply different limits
# and let validation data shape its own preprocessing.

for i in outltrtd_val_features.columns:
  col_q1 = outltrtd_train_features[i].quantile(0.25)
  col_q3 = outltrtd_train_features[i].quantile(0.75)
  col_iqr = col_q3 - col_q1

  upper_limit = col_q3 + (1.5 * col_iqr)
  lower_limit = col_q1 - (1.5 * col_iqr)

  outltrtd_val_features[i] = outltrtd_val_features[i].clip(lower = lower_limit, upper = upper_limit)

In [None]:
# Feature scaling
# The scaler is fitted on the outlier-treated TRAINING features and only applied to the validation
# features. Calling fit_transform on the validation data would standardise it with its own mean and
# variance, which is not the transformation the model was trained on.

train_fitted_scaler = StandardScaler().fit(outltrtd_train_features)
outltrtdscl_val_features = pd.DataFrame(train_fitted_scaler.transform(outltrtd_val_features))
outltrtdscl_val_features.head()

In [None]:
# Concatenating with the target to validate the model
# The scaled features carry a fresh 0..n-1 index while the target still carries the row labels of the
# original split, so an index-based inner join would drop or mis-pair rows. Both sides are re-indexed
# positionally first; the row order is the same because the features were derived from the same split.

validation_df2 = pd.concat(
    [outltrtdscl_val_features.reset_index(drop = True), validation_target_df.reset_index(drop = True)],
    axis = 1,
    join = 'inner',
)
validation_df2.head()

In [None]:
# Running the predictive model
# Predictions are requested for the scaled validation features only; the frame passed in has no target column.
# NOTE(review): validation_df2 (features + Target) is built just before this cell but is not used here — confirm
# whether it was meant to be scored instead.

validation_predictions2 = predict_model(rf, data = outltrtdscl_val_features)
validation_predictions2

## Predicting the results of the test data

### Doing the same preprocessing as train dataset

In [None]:
test_df.head()

In [None]:
# Using IQR capping method
# The fences are taken from the TRAINING features, so test rows receive exactly the transformation the
# model was trained with. Recomputing quartiles on the test data would apply different limits.
# A KeyError here means test_df has a column the training features do not have.

for i in test_df.columns:
  col_q1 = outltrtd_train_features[i].quantile(0.25)
  col_q3 = outltrtd_train_features[i].quantile(0.75)
  col_iqr = col_q3 - col_q1

  upper_limit = col_q3 + (1.5 * col_iqr)
  lower_limit = col_q1 - (1.5 * col_iqr)

  test_df[i] = test_df[i].clip(lower = lower_limit, upper = upper_limit)

In [None]:
# Feature scaling
# The scaler is fitted on the outlier-treated TRAINING features and only applied to the test data.
# Calling fit_transform on the test data would standardise it with its own mean and variance, which is
# not the transformation the model was trained on.

train_fitted_scaler = StandardScaler().fit(outltrtd_train_features)
scld_test_df = pd.DataFrame(train_fitted_scaler.transform(test_df))
scld_test_df.head()

In [None]:
# Running the predictive model

test_predictions = predict_model(rf, data = scld_test_df)
test_predictions

In [None]:
# Extracting the final results to excel

test_predictions.to_excel('TestResults.xlsx')

In [None]:
# Saving model for deployment

save_model(rf, "fraudulent_detection_rf")