In [30]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,accuracy_score
from sklearn.model_selection import cross_val_score,GridSearchCV,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler,ADASYN,SMOTE,SVMSMOTE

In [16]:
from imblearn.over_sampling import ADASYN

In [2]:
comp_bankruptcy=pd.read_csv("data.csv")
comp_bankruptcy.head()
X=comp_bankruptcy.drop('Bankrupt?',axis=1)
y=comp_bankruptcy['Bankrupt?']

In [3]:
y.value_counts(normalize=True)*100

Bankrupt?
0    96.77372
1     3.22628
Name: proportion, dtype: float64

In [4]:
# Imbalance Data

## make data Balance

In [5]:
!pip install imblearn



## 1. Under Sampling

In [7]:
u_sampler=RandomUnderSampler(random_state=23)
X_us,y_us=u_sampler.fit_resample(X,y)

In [8]:
y_us.value_counts()

Bankrupt?
0    220
1    220
Name: count, dtype: int64

In [9]:
y_us.value_counts(normalize=True)*100

Bankrupt?
0    50.0
1    50.0
Name: proportion, dtype: float64

## 2. Over Sampling

In [11]:
o_sampler=RandomOverSampler(random_state=23)
X_os,y_os=o_sampler.fit_resample(X,y)

In [12]:
y_os.value_counts()

Bankrupt?
1    6599
0    6599
Name: count, dtype: int64

In [13]:
y_os.value_counts(normalize=True)*100

Bankrupt?
1    50.0
0    50.0
Name: proportion, dtype: float64

## 3. SMOTE

In [15]:
smote_sampler=SMOTE(random_state=23)
X_smote,y_smote=smote_sampler.fit_resample(X,y)

print(y_smote.value_counts())
print(y_smote.value_counts(normalize=True)*100)

## 4. Adasyn

In [17]:
adasyn=ADASYN(random_state=23)
X_adasyn,y_adasyn=adasyn.fit_resample(X,y)
print(y_adasyn.value_counts())
print(y_adasyn.value_counts(normalize=True)*100)

Bankrupt?
0    6599
1    6523
Name: count, dtype: int64
Bankrupt?
0    50.28959
1    49.71041
Name: proportion, dtype: float64


## Accuracy of Each Technique

In [18]:
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=23)
rf=RandomForestClassifier(random_state=23)

In [19]:
results_rus=cross_val_score(rf,X_us,y_us,cv=kfold,verbose=3)

[CV] END ................................ score: (test=0.932) total time=   0.4s
[CV] END ................................ score: (test=0.852) total time=   0.4s
[CV] END ................................ score: (test=0.841) total time=   0.4s
[CV] END ................................ score: (test=0.864) total time=   0.3s
[CV] END ................................ score: (test=0.898) total time=   0.3s


In [20]:
results_rus.mean()

0.8772727272727273

In [21]:
results_ros=cross_val_score(rf,X_os,y_os,cv=kfold,verbose=3)
print(results_ros.mean())

[CV] END ................................ score: (test=0.995) total time=   5.7s
[CV] END ................................ score: (test=0.997) total time=   5.6s
[CV] END ................................ score: (test=0.992) total time=   5.4s
[CV] END ................................ score: (test=0.994) total time=   5.6s
[CV] END ................................ score: (test=0.994) total time=   5.6s
0.9943172631965735


In [22]:
results_smote=cross_val_score(rf,X_smote,y_smote,cv=kfold,verbose=3)
print(results_smote.mean())

[CV] END ................................ score: (test=0.981) total time=  13.1s
[CV] END ................................ score: (test=0.986) total time=  13.0s
[CV] END ................................ score: (test=0.981) total time=  13.2s
[CV] END ................................ score: (test=0.974) total time=  13.7s
[CV] END ................................ score: (test=0.983) total time=  12.8s
0.9810573908849772


In [23]:
results_adasyn=cross_val_score(rf,X_adasyn,y_adasyn,cv=kfold,verbose=3)
print(results_adasyn.mean())

[CV] END ................................ score: (test=0.979) total time=  13.2s
[CV] END ................................ score: (test=0.979) total time=  12.7s
[CV] END ................................ score: (test=0.982) total time=  13.0s
[CV] END ................................ score: (test=0.984) total time=  12.8s
[CV] END ................................ score: (test=0.978) total time=  13.2s
0.980490969802555


In [25]:
print("Accuracy of RUS: ",results_rus.mean())
print("Accuracy of ROS: ",results_ros.mean())
print("Accuracy of SMOTE: ",results_smote.mean())
print("Accuracy of ADASYN: ",results_adasyn.mean())

Accuracy of RUS:  0.8772727272727273
Accuracy of ROS:  0.9943172631965735
Accuracy of SMOTE:  0.9810573908849772
Accuracy of ADASYN:  0.980490969802555


In [29]:
sklearn.__version__

'1.3.2'

## 5. SVMSMOTE

In [35]:
svmsmote=SVMSMOTE(random_state=23,sampling_strategy=1)
X_svmsmote,y_svmsmote=svmsmote.fit_resample(X,y)
print(y_svmsmote.value_counts())
print(y_svmsmote.value_counts(normalize=True)*100)

Bankrupt?
0    6599
1    3726
Name: count, dtype: int64
Bankrupt?
0    63.912833
1    36.087167
Name: proportion, dtype: float64


In [33]:
results_svmsmote=cross_val_score(rf,X_svmsmote,y_svmsmote,cv=kfold,verbose=3)
print("Accuracy of SVMSMOTE",results_svmsmote.mean())

[CV] END ................................ score: (test=0.977) total time=   8.7s
[CV] END ................................ score: (test=0.978) total time=   8.5s
[CV] END ................................ score: (test=0.981) total time=   8.3s
[CV] END ................................ score: (test=0.978) total time=   8.5s
[CV] END ................................ score: (test=0.978) total time=   8.9s
Accuracy of SVMSMOTE 0.9782082324455207
