In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Read the audit dataset from the Kaggle input directory
data_path='/kaggle/input/audit-data/audit_data.csv'
df=pd.read_csv(data_path)
# Display the loaded frame as the cell output
df

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23,4.18,0.6,2.508,2.50,0.2,0.500,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6,0.00,0.2,0.000,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3,3.89,6,0.00,0.2,0.000,10.80,0.6,6.480,10.80,6.0,0.6,3.6,11.75,0.6,7.050,2,0.2,0.4,0,0.2,0.0,4.4,17.530,0.4,0.5,3.5060,1
4,3.89,6,0.00,0.2,0.000,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,55.57,9,0.49,0.2,0.098,0.40,0.2,0.080,0.89,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.578,0.4,0.5,0.3156,0
772,55.57,16,0.47,0.2,0.094,0.37,0.2,0.074,0.84,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.568,0.4,0.5,0.3136,0
773,55.57,14,0.24,0.2,0.048,0.04,0.2,0.008,0.28,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.456,0.4,0.5,0.2912,0
774,55.57,18,0.20,0.2,0.040,0.00,0.2,0.000,0.20,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.440,0.4,0.5,0.2880,0


In [5]:
# Summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sector_score    776 non-null    float64
 1   LOCATION_ID     776 non-null    object 
 2   PARA_A          776 non-null    float64
 3   Score_A         776 non-null    float64
 4   Risk_A          776 non-null    float64
 5   PARA_B          776 non-null    float64
 6   Score_B         776 non-null    float64
 7   Risk_B          776 non-null    float64
 8   TOTAL           776 non-null    float64
 9   numbers         776 non-null    float64
 10  Score_B.1       776 non-null    float64
 11  Risk_C          776 non-null    float64
 12  Money_Value     775 non-null    float64
 13  Score_MV        776 non-null    float64
 14  Risk_D          776 non-null    float64
 15  District_Loss   776 non-null    int64  
 16  PROB            776 non-null    float64
 17  RiSk_E          776 non-null    flo

In [6]:
# Count the missing values in each column
df.isna().sum()

Sector_score      0
LOCATION_ID       0
PARA_A            0
Score_A           0
Risk_A            0
PARA_B            0
Score_B           0
Risk_B            0
TOTAL             0
numbers           0
Score_B.1         0
Risk_C            0
Money_Value       1
Score_MV          0
Risk_D            0
District_Loss     0
PROB              0
RiSk_E            0
History           0
Prob              0
Risk_F            0
Score             0
Inherent_Risk     0
CONTROL_RISK      0
Detection_Risk    0
Audit_Risk        0
Risk              0
dtype: int64

# Preprocessing

In [16]:
def preprocess_inputs(df,train_size=0.7,random_state=123):
    """Turn the raw audit frame into scaled train/test feature and target splits.

    Steps, in order: one-hot encode ``LOCATION_ID``, cast boolean columns to
    int, separate the ``Risk`` target, split into train/test, impute missing
    ``Money_Value`` entries and standardise every feature column.

    Both the imputation mean and the scaler are fitted on the training rows
    only and then applied to the test rows, so no test-set statistics are
    used while preparing the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain the ``LOCATION_ID``, ``Money_Value`` and
        ``Risk`` columns. The caller's frame is not modified.
    train_size : float, default 0.7
        Passed to ``train_test_split`` as ``train_size``.
    random_state : int, default 123
        Seed passed to ``train_test_split`` for the shuffled split.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Standardised features that keep the original row index and column names.
    y_train, y_test : pandas.Series
        The matching ``Risk`` values.

    Raises
    ------
    KeyError
        If ``LOCATION_ID``, ``Money_Value`` or ``Risk`` is missing from ``df``.
    """
    #onehot_encode location column; the indicator columns are appended after the remaining original columns
    location_dummies=pd.get_dummies(df['LOCATION_ID'])
    df=pd.concat([df.drop('LOCATION_ID',axis=1),location_dummies],axis=1)

    #get_dummies may return boolean indicators depending on the pandas version; cast every bool column to int in one call
    bool_columns=df.select_dtypes(include='bool').columns
    df=df.astype({column:int for column in bool_columns})

    #splitting between target and features
    # NOTE(review): columns such as Score, Inherent_Risk and Audit_Risk stay in the features;
    # if Risk is derived from them that would explain perfect test scores - TODO confirm.
    y=df['Risk']
    x=df.drop('Risk',axis=1)

    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(
        x,y,train_size=train_size,random_state=random_state,shuffle=True)

    #filling the missing value with the mean of the training rows only, then reusing it for the test rows
    money_value_mean=x_train['Money_Value'].mean()
    x_train=x_train.fillna({'Money_Value':money_value_mean})
    x_test=x_test.fillna({'Money_Value':money_value_mean})

    #scaling the data: fit on the training rows, apply the same transform to both splits
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)
    return x_train,x_test,y_train,y_test

In [17]:
# Build the train/test splits and print the shape of each piece in turn
x_train,x_test,y_train,y_test=preprocess_inputs(df)
for split in (x_train,x_test,y_train,y_test):
    print(split.shape)

(543, 70)
(233, 70)
(543,)
(233,)


In [18]:
# Inspect the scaled training features returned by preprocess_inputs
x_train

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,1,11,12,13,14,15,16,17,18,19,2,20,21,22,23,24,25,27,28,29,3,30,31,32,33,34,35,36,37,38,39,4,40,41,42,43,44,5,6,7,8,9,LOHARU,NUH,SAFIDON
40,-0.659642,-0.414776,-0.896122,-0.381534,-0.127582,-0.659049,-0.165665,-0.169591,-0.266739,-0.314612,-0.302425,0.182728,1.902322,0.187129,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.362005,0.015775,-0.403136,0.0,-0.086147,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,-0.285785,-0.042954,-0.143794,3.286997,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
102,-0.659642,-0.018335,1.372382,0.011594,-0.114116,-0.659049,-0.161179,-0.112968,-0.266739,-0.314612,-0.302425,-0.105720,0.662273,-0.134029,-0.430086,-0.150329,-0.411259,2.073033,2.881253,1.426380,0.102148,-0.186319,0.528365,0.0,-0.110251,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,-0.285785,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,23.280893,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
759,1.493946,-0.285200,-0.896122,-0.338703,-0.180410,-0.659049,-0.183263,-0.207114,-0.266739,-0.314612,-0.302425,-0.204526,-0.577776,-0.199854,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.826157,-0.275198,-0.403136,0.0,-0.167604,-0.122284,-0.174243,-0.258452,-0.224255,6.147009,-0.224255,-0.285785,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
294,-0.722983,-0.160252,0.238130,-0.213269,-0.191286,-0.659049,-0.186886,-0.204071,-0.266739,-0.314612,-0.302425,0.161293,1.902322,0.165710,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.129929,-0.001184,-0.403136,0.0,-0.090894,-0.122284,-0.174243,3.869184,-0.224255,-0.162681,-0.224255,-0.285785,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
525,0.078779,-0.414776,-0.896122,-0.381534,-0.191631,-0.659049,-0.187001,-0.232298,-0.266739,-0.314612,-0.302425,-0.204526,-0.577776,-0.199854,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.826157,-0.279956,-0.403136,0.0,-0.168936,-0.122284,-0.174243,-0.258452,-0.224255,6.147009,-0.224255,-0.285785,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,-0.659642,-0.414776,-0.896122,-0.381534,-0.162973,-0.659049,-0.177454,-0.204240,-0.266739,-0.314612,-0.302425,-0.194593,-0.577776,-0.196545,2.663115,-0.150329,2.170404,-0.205625,-0.235321,-0.183858,-0.362005,-0.259453,1.459866,0.0,-0.139200,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,3.499129,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
322,-0.722983,-0.164879,0.238130,-0.216329,-0.183172,-0.659049,-0.184183,-0.196634,-0.266739,-0.314612,-0.302425,-0.204526,-0.577776,-0.199854,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.594081,-0.268044,-0.403136,0.0,-0.165602,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,-0.285785,-0.042954,-0.143794,3.286997,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
382,-0.738818,-0.134028,0.238130,-0.195933,-0.191631,-0.659049,-0.187001,-0.201536,-0.266739,-0.314612,-0.302425,0.908093,1.902322,0.911988,1.116515,-0.150329,0.879572,-0.205625,-0.235321,-0.183858,0.102148,0.553679,0.528365,0.0,0.304073,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,3.499129,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,-0.241825,0.0,-0.042954,-0.042954
365,-0.738818,-0.255891,0.238130,-0.276496,-0.191631,-0.659049,-0.187001,-0.214889,-0.266739,-0.314612,-0.302425,0.152014,1.902322,0.156437,-0.430086,-0.150329,-0.411259,-0.205625,-0.235321,-0.183858,-0.129929,-0.012010,-0.403136,0.0,-0.093925,-0.122284,-0.174243,-0.258452,-0.224255,-0.162681,-0.224255,-0.285785,-0.042954,-0.143794,-0.304229,-0.233171,-0.060802,-0.105703,-0.174243,-0.042954,-0.042954,-0.096404,-0.086146,-0.122284,-0.136973,-0.060802,-0.086146,-0.129823,-0.200574,-0.042954,-0.042954,-0.060802,-0.074536,-0.122284,-0.074536,-0.096404,-0.246063,-0.074536,-0.042954,0.0,-0.114279,-0.042954,-0.246063,-0.190419,-0.074536,-0.325433,4.135215,0.0,-0.042954,-0.042954


# Model Construction and Training

In [19]:
 # Candidate classifiers keyed by a display name; all use library defaults except for the seed.
# Estimators that draw random numbers while fitting get a fixed random_state (the same seed
# as the train/test split) so that re-running the notebook reproduces the same scores.
# NOTE(review): the key 'LogisicRegression' is misspelled; it is kept unchanged in case
# other cells look models up by key - rename it together with any such users.
models={
    'LogisicRegression':LogisticRegression(),
    'Kneighbors':KNeighborsClassifier(),
    'Decision Tree':DecisionTreeClassifier(random_state=123),
    'Random Forest':RandomForestClassifier(random_state=123),
    'Gradient Boosting':GradientBoostingClassifier(random_state=123),
    'MLP Classifier':MLPClassifier(random_state=123),
}

In [20]:
# Fit every candidate on the training split, then print its score on the held-out test split
for estimator in models.values():
    estimator.fit(x_train,y_train)
    print(estimator,'Trained')
    test_score=estimator.score(x_test,y_test)
    print(test_score)

LogisticRegression() Trained
0.9699570815450643
KNeighborsClassifier() Trained
0.8841201716738197
DecisionTreeClassifier() Trained
1.0
RandomForestClassifier() Trained
1.0
GradientBoostingClassifier() Trained
1.0
MLPClassifier() Trained
0.9527896995708155


In [22]:
# Class balance of the training target: number of rows per Risk label
y_train.value_counts()

Risk
0    323
1    220
Name: count, dtype: int64