In [2]:
import pandas as pd
import numpy as np
import pickle

from scipy.stats import chi2_contingency, f_oneway, pointbiserialr

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.metrics import precision_score, classification_report, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

In [3]:
# Load the raw transaction data from a CSV file located in the current working directory.
df = pd.read_csv('onlinefraud.csv')

In [4]:
# Summarise the loaded frame: row count, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
# Count rows per value of the target column to see how imbalanced the classes are.
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [7]:
# Balance the classes by random undersampling: keep every fraud row and draw an
# equally sized random subset (without replacement) of the non-fraud rows.
# NOTE(review): the balancing is done before the train/test split, so the
# held-out set is balanced as well and its metrics will not reflect the class
# ratio of the full data. Consider splitting first and undersampling only the
# training portion.
df_minority = df.loc[df['isFraud'] == 1]  # fraud transactions
df_majority = df.loc[df['isFraud'] == 0]  # non-fraud transactions

df_majority_undersampled = resample(
    df_majority,
    replace=False,                # each non-fraud row can be picked at most once
    n_samples=len(df_minority),   # same number of rows as the fraud class
    random_state=42,              # fixed seed for reproducibility
)

# Stack both classes, shuffle the rows and rebuild a clean 0..n-1 index.
df_undersampled = (
    pd.concat([df_majority_undersampled, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Confirm that both classes now have the same number of rows.
df_undersampled['isFraud'].value_counts()


1    8213
0    8213
Name: isFraud, dtype: int64

# Split Data

In [10]:
# Separate the target from the feature columns of the balanced frame.
y = df_undersampled['isFraud']
X = df_undersampled.drop('isFraud', axis=1)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
# NOTE(review): df_undersampled is class-balanced, so this test set is balanced
# too and does not reflect the fraud rate of the original data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=26
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (13140, 10)
Testing set size: (3286, 10)


In [11]:
# Candidate features, grouped by the statistical test used to screen them.

# Float-valued amount/balance columns (screened with point-biserial correlation).
num_cols = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# Columns treated as categories (screened with the chi-squared test).
# NOTE(review): 'step' is an int64 column and is later listed in
# select_num_cols, i.e. it is screened as categorical but scaled as numeric.
cat_cols = [
    'step',
    'type',
    'nameOrig',
    'nameDest',
]

In [12]:
# Screen every categorical candidate against the target with a chi-squared
# test of independence, computed on the training split only.
# NOTE(review): nameOrig / nameDest look like per-account identifiers; if they
# are (nearly) unique per row the chi-squared approximation is unreliable for
# them -- verify before reading much into their p-values.
p_values = []

for feature in cat_cols:
    # Observed counts of feature value x target class.
    contingency_table = pd.crosstab(X_train[feature], y_train)

    # Only the p-value is used; statistic, dof and expected counts are discarded.
    _, p_value, _, _ = chi2_contingency(contingency_table)
    p_values.append(p_value)

# Label each feature using a 5% significance threshold.
results = [
    f'{feature} is correlated with isFraud'
    if p < 0.05
    else f'{feature} is not correlated with isFraud'
    for feature, p in zip(cat_cols, p_values)
]

# Collect everything in one table for display.
correlation_results = pd.DataFrame({
    'Feature': cat_cols,
    'P-Value': p_values,
    'Interpretation': results
})

correlation_results

Unnamed: 0,Feature,P-Value,Interpretation
0,step,0.0,step is correlated with isFraud
1,type,0.0,type is correlated with isFraud
2,nameOrig,0.495898,nameOrig is not correlated with isFraud
3,nameDest,0.492579,nameDest is not correlated with isFraud


In [13]:
# Screen every numeric candidate against the binary target with the
# point-biserial correlation, computed on the training split only.
p_values = []
results = []

for feature in num_cols:
    # pointbiserialr yields (correlation, p-value); only the p-value is kept.
    corr, p_value = pointbiserialr(X_train[feature], y_train)
    p_values.append(p_value)

    # Label the feature using a 5% significance threshold.
    results.append(
        f'{feature} is correlated with isFraud'
        if p_value < 0.05
        else f'{feature} is not correlated with isFraud'
    )

# Collect everything in one table for display.
correlation_results = pd.DataFrame(
    {
        'Feature': num_cols,
        'P-Value': p_values,
        'Interpretation': results,
    }
)

correlation_results

Unnamed: 0,Feature,P-Value,Interpretation
0,amount,0.0,amount is correlated with isFraud
1,oldbalanceOrg,4.883443999999999e-50,oldbalanceOrg is correlated with isFraud
2,newbalanceOrig,2.0371699999999998e-51,newbalanceOrig is correlated with isFraud
3,oldbalanceDest,1.5565499999999998e-19,oldbalanceDest is correlated with isFraud
4,newbalanceDest,0.1987584,newbalanceDest is not correlated with isFraud


### Feature Selection

In [14]:
# Features passed to the model.
# Numeric inputs (scaled in the preprocessing step). newbalanceDest is left
# out; 'step' is included here as a numeric column.
select_num_cols = [
    'step',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
]

# Categorical inputs (one-hot encoded in the preprocessing step).
select_cat_cols = [
    'type',
]

# Pipeline Creation

## Preprocessing

In [None]:
# Transformers used by the preprocessing step.
# handle_unknown='ignore' makes a category that was not present during fit
# encode as an all-zero vector instead of raising an error, so a rare 'type'
# value missing from a CV training fold (or appearing only in new data) does
# not break scoring or prediction.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
robust_scaler = RobustScaler()

# Column-wise preprocessing: each transformer is applied to its own column list.
preprocessing = ColumnTransformer(
    transformers=[
        # one-hot encode the low-cardinality categorical feature(s)
        ('onehot', onehot_encoder, select_cat_cols),
        # scale the numeric features with the robust scaler
        ('num', robust_scaler, select_num_cols)
    ],
    # columns that were not selected are dropped
    remainder='drop'
)

In [17]:
# Sanity check that the preprocessing runs end to end: fit it on the training
# split only, then apply the fitted transformer to both splits.
# The transformed arrays are for inspection; the pipeline cells take the raw
# X_train / X_test frames and run the preprocessing themselves.
X_train_preprocess = preprocessing.fit_transform(X_train)
X_test_preprocess = preprocessing.transform(X_test)

In [18]:
# Base classifier for the pipeline: a random forest with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    class_weight="balanced",  # weight classes inversely to their frequency in the training labels
    random_state=26  # fixed seed for reproducible trees
)

In [19]:
# Chain the column preprocessing and the random forest into a single estimator,
# so that fit/predict take the raw feature frame and the preprocessing is
# fitted together with the classifier.
pipeline_RandFor = Pipeline(steps=[
    ('preprocessor', preprocessing),
    ('classifier', rf_model)
])

In [20]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
pipeline_RandFor.fit(X_train, y_train)

In [21]:
# Scorer: macro-averaged precision, i.e. the unweighted mean of the precision
# of each class (here the two classes of isFraud).
custom_precision_scorer = make_scorer(precision_score, average='macro')

# 5-fold cross-validation of the full pipeline on the training split.
cv_scores = cross_val_score(
    pipeline_RandFor,
    X_train,
    y_train,
    cv=5,
    scoring=custom_precision_scorer,
)

# Summary statistics over the folds, computed once and reused below.
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

# Report per-fold scores, their mean/std and the mean +/- one std band.
print('Precision Score - All - Cross Validation  : ', cv_scores)
print('Precision Score - Mean - Cross Validation : ', cv_mean)
print('Precision Score - Std - Cross Validation  : ', cv_std)
print('Precision Score - Range of Test-Set       : ',
      (cv_mean - cv_std), '-', (cv_mean + cv_std))


Precision Score - All - Cross Validation  :  [0.98914898 0.99391742 0.98937662 0.99430081 0.99394315]
Precision Score - Mean - Cross Validation :  0.9921373951016115
Precision Score - Std - Cross Validation  :  0.0023521109826631554
Precision Score - Range of Test-Set       :  0.9897852841189483 - 0.9944895060842747


In [22]:
# Hyper-parameter grid for the 'classifier' step of the pipeline
# (1 x 3 x 3 x 3 = 27 candidate combinations).
parameters = {
    'classifier__criterion': ['gini'],             # split quality measure
    'classifier__max_depth': [15, 20, 25],         # maximum tree depth
    'classifier__min_samples_split': [2, 5, 10],   # min samples needed to split a node
    'classifier__min_samples_leaf': [1, 2, 4],     # min samples required in a leaf
}

# Exhaustive search over the grid, scored with macro precision.
grid_search = GridSearchCV(
    pipeline_RandFor,
    parameters,
    scoring=custom_precision_scorer,  # macro-averaged precision
    cv=20,                            # 20-fold cross-validation per candidate
    n_jobs=-1,                        # use all available cores
    verbose=2,                        # print progress while fitting
)

grid_search.fit(X_train, y_train)

Fitting 20 folds for each of 27 candidates, totalling 540 fits


In [23]:
# Report the winning hyper-parameters and their mean cross-validated score.
# The grid search is scored with custom_precision_scorer (macro-averaged
# precision), so the score is labelled as precision, not recall.
print("Best Parameters:", grid_search.best_params_)
print("Best Macro Precision (CV):", grid_search.best_score_)

Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5}
Best Recall: 0.9924634622207572


In [31]:
# Pipeline with the best hyper-parameter combination found by the grid search.
best_model = grid_search.best_estimator_

# Score it on the held-out split.
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 plus the overall averages.
report = classification_report(y_test, y_pred, target_names=['0','1'])
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1648
           1       0.99      1.00      0.99      1638

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286



In [32]:
# Persist the tuned pipeline (preprocessing + classifier) to 'model.pkl' in the
# current working directory, overwriting any existing file of that name.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [36]:
# Build a small demo batch by drawing rows from the full data frame.
# NOTE(review): the rows come from df itself, so some of them may also be part
# of the data the model was trained on -- this is a smoke test, not an
# evaluation on unseen data.
num_samples = 10  # how many rows to draw
sampled_data = df.sample(n=num_samples, random_state=42)  # fixed seed for reproducibility

# Column -> list of values, for every column except the two label columns
# (isFraud, isFlaggedFraud).
new_data = {
    column: sampled_data[column].tolist()
    for column in (
        'step',
        'type',
        'amount',
        'nameOrig',
        'oldbalanceOrg',
        'newbalanceOrig',
        'nameDest',
        'oldbalanceDest',
        'newbalanceDest',
    )
}


In [37]:
# Turn the column -> values mapping back into a frame the pipeline can consume,
# and display it.
new_data_df = pd.DataFrame(new_data)
new_data_df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,278,CASH_IN,330218.42,C632336343,20866.0,351084.42,C834976624,452419.57,122201.15
1,15,PAYMENT,11647.08,C1264712553,30370.0,18722.92,M215391829,0.0,0.0
2,10,CASH_IN,152264.21,C1746846248,106589.0,258853.21,C1607284477,201303.01,49038.8
3,403,TRANSFER,1551760.63,C333676753,0.0,0.0,C1564353608,3198359.45,4750120.08
4,206,CASH_IN,78172.3,C813403091,2921331.58,2999503.88,C1091768874,415821.9,337649.6
5,259,PAYMENT,915.13,C2002954533,0.0,0.0,M290849763,0.0,0.0
6,188,CASH_OUT,20603.87,C813757373,0.0,0.0,C823291717,558068.66,578672.53
7,139,CASH_OUT,58605.72,C1850864812,0.0,0.0,C618657299,585494.94,644100.66
8,230,PAYMENT,4865.11,C886849972,0.0,0.0,M623175144,0.0,0.0
9,544,CASH_OUT,118131.63,C390714641,0.0,0.0,C366360355,8131691.35,8476246.86


In [38]:
# Predict a class for every row of new_data_df in one call.
prediction = best_model.predict(new_data_df)

# Human-readable message per predicted class; any other value prints nothing.
messages = {
    0: 'This is not A Fraud',
    1: 'This is A Fraud',
}

# Print one line per predicted row.
for predicted_class in prediction:
    message = messages.get(predicted_class)
    if message is not None:
        print(message)

This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
