In [2]:
import numpy as np
import pandas as pd
import torch
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Tanh
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.optim import Adam
from torch.nn import BCELoss
import torch.nn as nn
from torch.optim import lr_scheduler
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
import streamlit as st
import time
import copy

In [3]:
url = "https://media.githubusercontent.com/media/syedshahlal/MDA_TextAnalysis/main/dataset/merged_compustat_and_labels.csv"

In [4]:
#Read merged data set
df = pd.read_csv(url)

In [5]:
df.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf,misstate
0,1003,1990-01-31,1989,INDL,C,D,STD,ANTQ,354100,A.A. IMPORTING CO INC,...,,,,1,-1.240403,,-0.403403,-0.087941,,0
1,1004,1990-05-31,1989,INDL,C,D,STD,AIR,361105,AAR CORP,...,,,,1,0.554652,5.380405,0.24052,0.123916,27.257486,0
2,1004,1991-05-31,1990,INDL,C,D,STD,AIR,361105,AAR CORP,...,0.32954,0.297848,-0.090196,0,0.863306,0.882711,0.262695,0.082704,-10.87404,0
3,1004,1992-05-31,1991,INDL,C,D,STD,AIR,361105,AAR CORP,...,-0.095484,-0.040947,-0.012672,1,0.961101,0.968053,0.259703,0.055586,-4.789317,0
4,1004,1993-05-31,1992,INDL,C,D,STD,AIR,361105,AAR CORP,...,-0.048105,-0.030084,-0.025104,0,0.881454,1.084524,0.261067,0.016952,-9.710404,0


In [6]:
#Preprocess dataset
df = df.fillna(0)

df = df[df.columns].replace([np.inf, -np.inf], 0)

columns_to_normalize = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib',
                       'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 'sstk',
                       'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets',
                       'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf', 'issue']

#Step 2: Apply Min-Max scaling for normalization
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# Step 3: Convert the normalized values to float type (should already be in float)
df[columns_to_normalize] = df[columns_to_normalize].astype('float32')

In [7]:
desired_columns_order = [
    'gvkey', 'fyear', 'tic', 'cik', 'Bank', 'act', 'ap', 'at', 'ceq', 'che',
    'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst',
    'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect', 'sale', 'sstk',
    'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
    'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
    'EBIT', 'ch_fcf', 'misstate']

# Reorder the columns
df = df.reindex(columns=desired_columns_order)

In [8]:
df.shape

(268113, 49)

In [9]:
import matplotlib.pyplot as plt

# Filter the DataFrame for years from 1990 to 2019
filtered_df = df[(df['fyear'] >= 1990) & (df['fyear'] <= 2019)]
training_df = df[(df['fyear'] >= 1990) & (df['fyear'] <= 2008)]
testing_df = df[(df['fyear'] >= 2008) & (df['fyear'] <= 2019)]

# Group by fiscal year and sum misstatements
misstatements_by_fyear = filtered_df.groupby('fyear')['misstate'].sum()
misstatements_by_fyear_training = training_df.groupby('fyear')['misstate'].sum()
misstatements_by_fyear_testing = testing_df.groupby('fyear')['misstate'].sum()

# Calculate total misstatements
total_misstatements = misstatements_by_fyear.sum()
total_misstatements_training = misstatements_by_fyear_training.sum()
total_misstatements_testing = misstatements_by_fyear_testing.sum()

# Plotting
plt.figure(figsize=(8, 4))
misstatements_by_fyear.plot(kind='bar')
plt.xlabel('Fiscal Year')
plt.ylabel('Number of Misstatements')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print total misstatements
print("Total number of misstatements from 1990 to 2019:", total_misstatements)
print("Total number of misstatements in the training period from 1990 to 2002:", total_misstatements_training)
print("Total number of misstatements in the testing period from 2003 to 2019:", total_misstatements_testing)

Total number of misstatements from 1990 to 2019: 1371
Total number of misstatements in the training period from 1990 to 2002: 1145
Total number of misstatements in the testing period from 2003 to 2019: 255


  plt.show()


In [10]:
# Split features into the specified groups
raw_items_28_financial_ratios_14 = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib',
                       'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 'sstk',
                       'txp', 'txt', 'xint','prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets',
                       'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf', 'issue']


In [11]:
def data_resampling(df, resampling_strategy):
    """
    Splits the data based on a cutoff year and applies the selected resampling strategy to balance the training set.
    """
    features = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib',
                'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 'sstk',
                'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets',
                'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf', 'issue']
    
    # Assign Train, Val, and Test periods
    train_period, test_period = (1990, 2008), (2009, 2019)

    # loading data
    train_data = df[(df['fyear'] >= train_period[0]) & (df['fyear'] <= train_period[1])]
    test_data = df[(df['fyear'] >= test_period[0]) & (df['fyear'] <= test_period[1])]

    # Extract features (X) and target variable (y) for training and testing
    X_train = train_data[raw_items_28_financial_ratios_14]
    y_train = train_data['misstate']

    X_test = test_data[raw_items_28_financial_ratios_14]
    y_test = test_data['misstate']

    if resampling_strategy == 'Random Under Sampling (RUS)':
        sampler = RandomUnderSampler()
    elif resampling_strategy == 'Random Over Sampling (ROS)':
        sampler = RandomOverSampler()

    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

    return X_train_resampled, y_train_resampled, X_test, y_test


In [12]:
data_resampling(df, 'Random Under Sampling (RUS)')

(             act            ap            at       ceq       che      cogs  \
 227353  0.000034  3.005317e-08  1.028967e-07  0.216603  0.000008  0.048552   
 157664  0.002900  8.054329e-05  2.061744e-04  0.217475  0.000044  0.054209   
 3613    0.001071  1.517766e-05  1.169065e-04  0.217158  0.000023  0.049020   
 172982  0.018453  3.082480e-04  6.089256e-03  0.215037  0.001666  0.062045   
 129530  0.001022  9.363998e-06  3.315757e-04  0.217490  0.000126  0.049336   
 ...          ...           ...           ...       ...       ...       ...   
 263814  0.002573  6.822881e-06  3.772335e-04  0.218731  0.000151  0.050452   
 263815  0.007801  1.315841e-05  5.210566e-04  0.219490  0.001065  0.051321   
 265984  0.002320  9.183070e-05  1.855939e-04  0.217087  0.000454  0.050918   
 265985  0.002558  1.216650e-04  2.061391e-04  0.217182  0.000495  0.051730   
 265986  0.005615  1.923386e-04  3.864931e-04  0.217480  0.001038  0.054882   
 
             csho       dlc     dltis          dlt

In [13]:
data_resampling(df, 'Random Over Sampling (ROS)')

(             act            ap        at       ceq       che      cogs  \
 0       0.001152  1.426266e-05  0.000088  0.216909  0.000009  0.049322   
 1       0.001240  1.763227e-05  0.000092  0.216914  0.000012  0.049248   
 2       0.001138  1.320918e-05  0.000085  0.216902  0.000010  0.049198   
 3       0.001316  2.014334e-05  0.000097  0.216902  0.000025  0.049243   
 4       0.001374  2.087192e-05  0.000099  0.216914  0.000029  0.049316   
 ...          ...           ...       ...       ...       ...       ...   
 335867  0.000032  1.587156e-04  0.000197  0.216920  0.000188  0.048951   
 335868  0.000054  2.375825e-07  0.000001  0.216615  0.000008  0.048559   
 335869  0.085453  3.633504e-03  0.009033  0.227196  0.004519  0.118145   
 335870  0.000109  1.043738e-07  0.000005  0.216616  0.000009  0.048575   
 335871  0.081662  3.839959e-03  0.008892  0.227848  0.002907  0.110881   
 
             csho       dlc     dltis          dltt  ...  soft_assets  \
 0       0.000026  0.0061

In [14]:
# Assign Train, Val, and Test periods
train_period, test_period = (1990, 2002), (2003, 2019)

# loading data
train_data = df[(df['fyear'] >= train_period[0]) & (df['fyear'] <= train_period[1])]
test_data = df[(df['fyear'] >= test_period[0]) & (df['fyear'] <= test_period[1])]

# Extract features (X) and target variable (y) for training and testing
X_train = train_data[raw_items_28_financial_ratios_14]
y_train = train_data['misstate']

X_test = test_data[raw_items_28_financial_ratios_14]
y_test = test_data['misstate']

# Undersampling the data
rus = RandomUnderSampler()
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print("Training data Shape after under sampling:", X_train_resampled.shape, y_train_resampled.shape)

Training data Shape after under sampling: (1662, 42) (1662,)


In [15]:
# Count the occurrences of each class in the resampled training dataset
class_counts = pd.Series(y_train).value_counts()

# Determine if the dataset is balanced (assuming a threshold for "balanced" at 80/20 distribution)
is_balanced = class_counts.min() / class_counts.max() >= 0.8

In [16]:
is_balanced

False

In [17]:
financial_ratios_14 = ['dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm',
                          'dpi', 'reoa', 'EBIT', 'ch_fcf', 'issue']

raw_financial_items_28 = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib',
                       'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 'sstk',
                       'txp', 'txt', 'xint', 'prcc_f']

X_train_resampled_28 = X_train_resampled.loc[:, raw_financial_items_28]
y_train_resampled_28 = y_train_resampled

X_test_28 = X_test.loc[:, raw_financial_items_28]
y_test_28 = y_test

X_train_resampled_14 = X_train_resampled.loc[:, financial_ratios_14]
y_train_resampled_14 = y_train_resampled

X_test_14 = X_test.loc[:, financial_ratios_14]
y_test_14 = y_test

In [18]:
merged_train_data = pd.concat([X_train_resampled, y_train_resampled], axis = 1)
merged_test_data = pd.concat([X_test, y_test], axis = 1)

merged_train_data_28 = pd.concat([X_train_resampled_28, y_train_resampled_28], axis = 1)
merged_test_data_28 = pd.concat([X_test_28, y_test_28], axis = 1)

merged_train_data_14 = pd.concat([X_train_resampled_14, y_train_resampled_14], axis = 1)
merged_test_data_14 = pd.concat([X_test_14, y_test_14], axis = 1)

In [19]:
merged_test_data

Unnamed: 0,act,ap,at,ceq,che,cogs,csho,dlc,dltis,dltt,...,ch_cs,ch_cm,ch_roa,bm,dpi,reoa,EBIT,ch_fcf,issue,misstate
15,0.001835,0.000023,0.000165,0.217076,0.000046,0.049652,5.317380e-05,0.006079,0.000035,0.000059,...,0.729345,0.481703,0.499405,0.915293,0.001778,0.999358,0.904115,0.453615,1.0,0
16,0.002012,0.000031,0.000170,0.217096,0.000055,0.049809,5.373613e-05,0.006080,0.000019,0.000054,...,0.729345,0.481703,0.499405,0.915293,0.001776,0.999358,0.904116,0.453611,0.0,0
17,0.002637,0.000039,0.000227,0.217263,0.000123,0.050032,6.044448e-05,0.006077,0.000048,0.000076,...,0.729345,0.481703,0.499405,0.915293,0.001782,0.999358,0.904116,0.453618,1.0,0
18,0.002726,0.000045,0.000248,0.217374,0.000086,0.050312,6.221722e-05,0.006194,0.000025,0.000060,...,0.729345,0.481703,0.499405,0.915293,0.001779,0.999358,0.904116,0.453648,1.0,0
19,0.003300,0.000040,0.000316,0.217515,0.000114,0.050824,6.393883e-05,0.006111,0.000070,0.000121,...,0.729345,0.481703,0.499404,0.915293,0.001778,0.999358,0.904116,0.453616,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268074,0.005180,0.000069,0.000317,0.214382,0.000994,0.049462,1.709988e-04,0.006087,0.000019,0.000010,...,0.729345,0.481703,0.499412,0.915293,0.001785,0.999352,0.904115,0.453870,1.0,0
268080,0.000175,0.000002,0.000025,0.216677,0.000011,0.048589,1.649056e-09,0.006083,0.000022,0.000005,...,0.729345,0.481703,0.499468,0.915293,0.001772,0.999359,0.904117,0.455286,1.0,0
268081,0.000183,0.000001,0.000027,0.216696,0.000016,0.048590,1.649056e-09,0.006079,0.000019,0.000001,...,0.729345,0.481703,0.499409,0.915293,0.001778,0.999359,0.904117,0.453557,1.0,0
268100,0.048294,0.001390,0.004145,0.221263,0.000450,0.071066,0.000000e+00,0.015072,0.000019,0.000051,...,0.729345,0.481703,0.499451,0.915293,0.001820,0.999357,0.904115,0.453966,0.0,0


In [20]:
import os

In [83]:
def fin_ratio(X_train_resampled, y_train_resampled, X_test, y_test):
    financial_ratios_14 = ['dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm',
                          'dpi', 'reoa', 'EBIT', 'ch_fcf', 'issue']

    raw_financial_items_28 = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib',
                        'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 'sstk',
                        'txp', 'txt', 'xint', 'prcc_f']

    X_train_resampled_28 = X_train_resampled.loc[:, raw_financial_items_28]
    y_train_resampled_28 = y_train_resampled

    X_test_28 = X_test.loc[:, raw_financial_items_28]
    y_test_28 = y_test

    X_train_resampled_14 = X_train_resampled.loc[:, financial_ratios_14]
    y_train_resampled_14 = y_train_resampled

    X_test_14 = X_test.loc[:, financial_ratios_14]
    y_test_14 = y_test

    merged_train_data = pd.concat([X_train_resampled, y_train_resampled], axis=1)
    merged_test_data = pd.concat([X_test, y_test], axis=1)

    merged_train_data_28 = pd.concat([X_train_resampled_28, y_train_resampled_28], axis=1)
    merged_test_data_28 = pd.concat([X_test_28, y_test_28], axis=1)

    merged_train_data_14 = pd.concat([X_train_resampled_14, y_train_resampled_14], axis=1)
    merged_test_data_14 = pd.concat([X_test_14, y_test_14], axis=1)

    directory = "https://github.com/syedshahlal/MDA_TextAnalysis/tree/main/dataset"

    # Save CSV files
    merged_train_data.to_csv(os.path.join(directory, 'merged_train_data.csv'), index=False)
    merged_test_data.to_csv(os.path.join(directory, 'merged_test_data.csv'), index=False)
    merged_train_data_28.to_csv(os.path.join(directory, 'merged_train_data_28.csv'), index=False)
    merged_test_data_28.to_csv(os.path.join(directory, 'merged_test_data_28.csv'), index=False)
    merged_train_data_14.to_csv(os.path.join(directory, 'merged_train_data_14.csv'), index=False)
    merged_test_data_14.to_csv(os.path.join(directory, 'merged_test_data_14.csv'), index=False)

    return (merged_train_data, merged_test_data, merged_train_data_28, merged_test_data_28, 
            merged_train_data_14, merged_test_data_14, X_train_resampled, y_train_resampled, X_test, y_test)

In [95]:
pip install pydrive


Note: you may need to restart the kernel to use updated packages.


Traceback (most recent call last):
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\site-packages\pip\__main__.py", line 22, in <module>
    from pip._internal.cli.main import main as _main
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\site-packages\pip\_internal\cli\main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\site-packages\pip\_internal\cli\autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "d:\University\UB\Research_SEC\MDA_TextAnalysis\mda_venv\lib\site-packages\pip\_internal\cli\main

In [94]:
import base64
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

def authenticate():
    gauth = GoogleAuth()
    
    # Load the JSON file
    gauth.LoadCredentialsFile("\credentials-file.json")
    
    if gauth.credentials is None:
        # Authenticate if they're not there
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        # Refresh them if expired
        gauth.Refresh()
    else:
        # Initialize the saved creds
        gauth.Authorize()
        
    return GoogleDrive(gauth)

drive = authenticate()
print("Authentication successful.")


ModuleNotFoundError: No module named 'pydrive'

In [77]:
class CSVDataset(Dataset):
    #Constructor for initially loading
    def __init__(self, path):
        df = read_csv(path, header=0)
        self.X = df.values[0:, :-1]
        self.y = df.values[0:, -1]
        self.X = self.X.astype('float32')
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

        print(self.X.shape)
        print(self.y.shape)

    # Get the number of rows in the dataset
    def __len__(self):
        return len(self.X)
    # Get a row at an index
    def __getitem__(self,idx):
        return [self.X[idx], self.y[idx]]

In [78]:
def prepare_train_dataset(path):
    train = CSVDataset(path)
    train_dl = DataLoader(train, batch_size=1662, shuffle=True)
    return train_dl

def prepare_test_dataset(path):
    test = CSVDataset(path)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return test_dl


In [82]:
a = prepare_train_dataset('https://media.githubusercontent.com/media/syedshahlal/MDA_TextAnalysis/main/dataset/merged_train_data_14.csv')

(1672, 14)
(1672, 1)


In [49]:
train_dl_42 = prepare_train_dataset('/dataset/merged_train_data.csv')
test_dl_42 = prepare_test_dataset('/dataset/merged_test_data.csv')

train_dl_28 = prepare_train_dataset('/dataset/merged_train_data_28.csv')
test_dl_28 = prepare_test_dataset('/dataset/merged_test_data_28.csv')

train_dl_14 = prepare_train_dataset('/dataset/merged_train_data_14.csv')
test_dl_14 = prepare_test_dataset('/dataset/merged_test_data_14.csv')

(222262, 42)
(222262, 1)
(125314, 42)
(125314, 1)
(239976, 28)
(239976, 1)
(125314, 28)
(125314, 1)
(239976, 14)
(239976, 1)
(125314, 14)
(125314, 1)


In [30]:
test_dl_14.dataset.X.shape

(125314, 14)

In [31]:
#{'n_layers': 5, 'layer_0_size': 57, 'layer_1_size': 22, 'layer_2_size': 107, 'layer_3_size': 202, 'layer_4_size': 162,
#'activation_0': 'relu', 'activation_1': 'relu', 'activation_2': 'relu', 'activation_3': 'sigmoid', 'activation_4': 'sigmoid'}.

class FraudDetectionMLP(Module):
    def __init__(self, n_inputs):
        super(FraudDetectionMLP, self).__init__()
        # Input layer
        self.hidden1 = Linear(n_inputs, 57)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # Second (hidden) layer
        self.hidden2 = Linear(57, 22)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # Third (hidden) layer
        self.hidden3 = Linear(22, 107)
        kaiming_uniform_(self.hidden3.weight, nonlinearity='relu')
        self.act3 = ReLU()
        # Fourth (hidden) layer
        self.hidden4 = Linear(107, 202)
        kaiming_uniform_(self.hidden4.weight, nonlinearity='relu')
        self.act4 = Sigmoid()
        # Fifth (hidden) layer
        self.hidden5 = Linear(202, 162)
        kaiming_uniform_(self.hidden5.weight, nonlinearity='relu')
        self.act5 = Sigmoid()
        # Output layer
        self.hidden6 = Linear(162,1)
        xavier_uniform_(self.hidden6.weight)
        self.act6 = Sigmoid()

    def forward(self, X):
        # Input to the first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # Second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # Third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        # Fourth hidden layer
        X = self.hidden4(X)
        X = self.act4(X)
        # Fifth hidden layer
        X = self.hidden5(X)
        X = self.act5(X)
        # Output layer
        X = self.hidden6(X)
        X = self.act6(X)
        return X


In [32]:
def train_model(model, train_dl, num_epochs):

    # Define loss function and optimizer
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_dl:
            optimizer.zero_grad()
            outputs = model(inputs.float())
            labels = labels.float()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_dl.dataset)

In [33]:
def evaluate_model(model, test_dl):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_dl:
            outputs = model(inputs.float())
            y_true.extend(labels.numpy())
            y_pred.extend(outputs.numpy().flatten())
    auc = roc_auc_score(y_true, y_pred)
    print(f"AUC: {auc:.4f}")

In [36]:
# 42 features model
model = FraudDetectionMLP(42)

In [37]:
train_model(model, train_dl_42, num_epochs=150)

In [38]:
evaluate_model(model, test_dl_42)


AUC: 0.6503


In [39]:
auc_values = []

# Perform 10 training runs
for i in range(10):
    model_42 = FraudDetectionMLP(42)
    train_model(model_42, train_dl_42, num_epochs=150)

    # Evaluate the model
    model_42.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_dl_42:
            outputs = model_42(inputs.float())
            y_true.extend(labels.numpy())
            y_pred.extend(outputs.numpy().flatten())
    auc = roc_auc_score(y_true, y_pred)
    auc_values.append(auc)  # Append AUC to the list
    print(f"Run {i+1}: AUC = {auc:.4f}")

# Calculate the average AUC
average_auc = np.mean(auc_values)
print(f"\nAverage AUC: {average_auc:.4f}")

# Calculate the std AUC
auc_std_dev = np.std(auc_values)
print(f"Standard Deviation of AUC: {auc_std_dev:.4f}")

Run 1: AUC = 0.6459
Run 2: AUC = 0.6383
Run 3: AUC = 0.6438
Run 4: AUC = 0.6469
Run 5: AUC = 0.6550
Run 6: AUC = 0.6426
Run 7: AUC = 0.6434
Run 8: AUC = 0.6523
Run 9: AUC = 0.6404
Run 10: AUC = 0.6495

Average AUC: 0.6458
Standard Deviation of AUC: 0.0050


In [44]:
plt.hist(auc_values, bins=10, color='c', edgecolor='k', alpha=0.7)
plot_title = "Distribution of AUC Values"
plt.title(plot_title)
plt.xlabel("AUC")
plt.ylabel("Frequency")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

  plt.show()


In [None]:
# 28 raw financial items model
model_28 = FraudDetectionMLP(28)

In [None]:
train_model(model_28, train_dl_28, num_epochs=150)

In [None]:
evaluate_model(model_28, test_dl_28)

In [None]:
auc_values_28 = []

# Perform 10 training runs
for i in range(10):
    model_28 = FraudDetectionMLP(28)
    train_model(model_28, train_dl_28, num_epochs=150)

    # Evaluate the model
    model_28.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_dl_28:
            outputs = model_28(inputs.float())
            y_true.extend(labels.numpy())
            y_pred.extend(outputs.numpy().flatten())
    auc_28 = roc_auc_score(y_true, y_pred)
    auc_values_28.append(auc_28)  # Append AUC to the list
    print(f"Run {i+1}: AUC = {auc_28:.4f}")

# Calculate the average AUC
average_auc_28 = np.mean(auc_values_28)
print(f"\nAverage AUC: {average_auc_28:.4f}")

# Calculate the std AUC
auc_std_dev_28 = np.std(auc_values_28)
print(f"Standard Deviation of AUC: {auc_std_dev_28:.4f}")

Run 1: AUC = 0.6096
Run 2: AUC = 0.5723
Run 3: AUC = 0.5291
Run 4: AUC = 0.6051
Run 5: AUC = 0.5681
Run 6: AUC = 0.6039
Run 7: AUC = 0.5078
Run 8: AUC = 0.4255
Run 9: AUC = 0.5920
Run 10: AUC = 0.4869

Average AUC: 0.5500
Standard Deviation of AUC: 0.0581


In [None]:
# 14 financial ratios model
model_14 = FraudDetectionMLP(14)

In [None]:
train_model(model_14, train_dl_14, num_epochs=150)

In [None]:
evaluate_model(model_14, test_dl_14)

In [None]:
auc_values_14 = []

# Perform 10 training runs
for i in range(10):
    model_14 = FraudDetectionMLP(14)
    train_model(model_14, train_dl_14, num_epochs=150)

    # Evaluate the model
    model_14.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_dl_14:
            outputs = model_14(inputs.float())
            y_true.extend(labels.numpy())
            y_pred.extend(outputs.numpy().flatten())
    auc_14 = roc_auc_score(y_true, y_pred)
    auc_values_14.append(auc_14)  # Append AUC to the list
    print(f"Run {i+1}: AUC = {auc_14:.4f}")

# Calculate the average AUC
average_auc_14 = np.mean(auc_values_14)
print(f"\nAverage AUC: {average_auc_14:.4f}")

auc_std_dev_14 = np.std(auc_values_14)
print(f"Standard Deviation of AUC: {auc_std_dev_14:.4f}")

Run 1: AUC = 0.6425
Run 2: AUC = 0.6160
Run 3: AUC = 0.6258
Run 4: AUC = 0.6121
Run 5: AUC = 0.6359
Run 6: AUC = 0.6460
Run 7: AUC = 0.5875
Run 8: AUC = 0.6221
Run 9: AUC = 0.6248
Run 10: AUC = 0.6406

Average AUC: 0.6253
Standard Deviation of AUC: 0.0166
