In [21]:
# package used in this project
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv("uscecchini28.csv")
data = data.drop(columns=['p_aaer', 'new_p_aaer']) # drop two columns
data.fillna(0, inplace=True)  # fill missing value with 0
data

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,misstate,act,ap,at,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,3460.0,0,0,0,0,10.047,3.736,32.335,...,0.312448,0.095082,0.082631,-0.019761,1,0.413170,0.873555,0.167620,0.161961,-0.042140
1,1990,1011,4841.0,0,0,0,0,1.247,0.803,7.784,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,3812.0,0,0,0,0,55.040,3.601,118.120,...,0.605342,0.097551,-0.105780,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348
3,1990,1021,3861.0,0,0,0,0,24.684,3.948,34.591,...,0.793068,-0.005725,-0.249704,0.017545,1,1.043582,1.026261,0.094822,0.088347,-0.017358
4,1990,1028,7385.0,0,0,0,0,17.325,3.520,27.542,...,0.869182,-0.231536,-1.674893,-0.466667,0,-1.602508,0.598443,-0.942379,-0.700821,0.130349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146040,2014,314866,8200.0,0,0,0,0,262.600,12.400,1234.800,...,0.751944,0.560406,0.127217,-0.050591,1,0.103693,0.829680,-0.327178,-0.008179,-0.261606
146041,2014,315318,2890.0,0,0,0,0,1578.400,106.700,4557.600,...,0.742781,-0.118178,0.031360,0.095355,1,0.581796,0.743084,-0.077826,0.000461,-0.296702
146042,2014,316056,3420.0,0,0,0,0,973.800,249.500,2015.900,...,0.751129,0.004207,-0.037925,0.072050,1,-0.000903,1.063878,-0.002877,0.153133,0.065569
146043,2014,317260,4412.0,0,0,0,0,51.743,1.555,322.421,...,0.018001,0.000000,0.000000,0.000000,1,1.109467,0.000000,0.000000,0.028804,0.000000


In [4]:
data.isnull().sum().sum()  # the number of missing value in the dataset

0

In [5]:
data.groupby('misstate')['misstate'].count()  # 0: accounting fraud. The number of fraud vs nonfraud

misstate
0    145081
1       964
Name: misstate, dtype: int64

In [6]:
features = data.iloc[:, 7:]   #7:35: raw numbers
labels = data.iloc[:, 6]

In [40]:
#np.random.seed(68)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2,shuffle=True)

In [8]:
# Logistic Regression
logi_L = LogisticRegression()
logi_L.fit(X_train, y_train)
y_pred = logi_L.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.9925365469547057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()  # confusion matrix of this prediction
print((tn, fp, fn, tp))
print(roc_auc_score(y_test, y_pred))

(28991, 23, 195, 0)
0.49960363962225135


In [11]:
#Naive Bayes
GNB = GaussianNB()
GNB.fit(X_train, y_train)
y_pred = GNB.predict(X_test)
print("Naive Bayes Accuracy:",metrics.accuracy_score(y_test, y_pred))

Naive Bayes Accuracy: 0.9658324489027355


In [12]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()  # confusion matrix of this prediction
print((tn, fp, fn, tp))
print(roc_auc_score(y_test, y_pred))

(28189, 825, 173, 22)
0.5421929819910105


In [13]:
# KNN
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print("KNN Accuracy:",metrics.accuracy_score(y_test, y_pred))

KNN Accuracy: 0.993358211510151


In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()  # confusion matrix of this prediction
print((tn, fp, fn, tp))
print(roc_auc_score(y_test, y_pred))

(29003, 11, 183, 12)
0.5305796671103076


In [15]:
# adaboost
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
model = abc.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Adaboost:",metrics.accuracy_score(y_test, y_pred))

Adaboost: 0.9933239754870075


In [16]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()  # confusion matrix of this prediction
print((tn, fp, fn, tp))
print(roc_auc_score(y_test, y_pred))

(29014, 0, 195, 0)
0.5


In [26]:
# Random forest
clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print("Random forest Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

Random forest Accuracy: 0.9933239754870075
0.5


In [41]:
# RUSBoost
clf = RUSBoostClassifier(random_state=0)
clf.fit(X_train, y_train)  
y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

0.6905688862401599

In [42]:
# SMOTEBoost
sm = SMOTE(random_state = 0) 
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel()) 
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

After OverSampling, the shape of train_X: (232148, 28)
After OverSampling, the shape of train_y: (232148,) 

After OverSampling, counts of label '1': 116074
After OverSampling, counts of label '0': 116074


In [43]:
abc = AdaBoostClassifier(learning_rate=1) # n_estimators=50,
model = abc.fit(X_train_res, y_train_res.ravel())
y_pred = model.predict(X_test)
roc_auc_score(y_test, y_pred)

0.6903312344886366