In [169]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [170]:
df=pd.read_csv('classification.csv')

In [171]:
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Ensemble_Cluster,Risk
0,2.765073,0.669945,0.146876,-0.133644,-0.463177,-0.653504,-0.744759,-1.235859,1.072726,0,Non-Risk
1,-1.190808,-1.491168,0.146876,-0.133644,-0.463177,1.000937,0.949342,2.247070,1.072726,0,Non-Risk
2,1.182721,0.669945,-1.383079,-0.133644,-0.463177,-0.653504,-0.416354,-0.738298,0.061674,0,Non-Risk
3,0.831087,0.669945,0.146876,-2.015948,-0.463177,-0.653504,1.633430,1.749509,0.567200,0,Non-Risk
4,1.534354,0.669945,0.146876,-2.015948,-0.463177,-0.653504,0.566380,0.256825,-0.949378,0,Non-Risk
...,...,...,...,...,...,...,...,...,...,...,...
995,-0.399632,-1.491168,-1.383079,-0.133644,-0.463177,-0.653504,-0.543890,-0.738298,0.567200,0,Non-Risk
996,0.391544,0.669945,1.676831,-0.133644,-0.463177,-0.653504,0.207509,0.754386,-0.949378,0,Non-Risk
997,0.215727,0.669945,0.146876,-0.133644,-0.463177,-0.653504,-0.874066,-0.738298,1.072726,1,Risk
998,-1.102900,0.669945,0.146876,-2.015948,-0.463177,-0.653504,-0.505275,1.998289,1.072726,0,Non-Risk


In [172]:
X = df.drop(columns=['Risk','Ensemble_Cluster'])
y = df['Risk']

In [173]:
X

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,2.765073,0.669945,0.146876,-0.133644,-0.463177,-0.653504,-0.744759,-1.235859,1.072726
1,-1.190808,-1.491168,0.146876,-0.133644,-0.463177,1.000937,0.949342,2.247070,1.072726
2,1.182721,0.669945,-1.383079,-0.133644,-0.463177,-0.653504,-0.416354,-0.738298,0.061674
3,0.831087,0.669945,0.146876,-2.015948,-0.463177,-0.653504,1.633430,1.749509,0.567200
4,1.534354,0.669945,0.146876,-2.015948,-0.463177,-0.653504,0.566380,0.256825,-0.949378
...,...,...,...,...,...,...,...,...,...
995,-0.399632,-1.491168,-1.383079,-0.133644,-0.463177,-0.653504,-0.543890,-0.738298,0.567200
996,0.391544,0.669945,1.676831,-0.133644,-0.463177,-0.653504,0.207509,0.754386,-0.949378
997,0.215727,0.669945,0.146876,-0.133644,-0.463177,-0.653504,-0.874066,-0.738298,1.072726
998,-1.102900,0.669945,0.146876,-2.015948,-0.463177,-0.653504,-0.505275,1.998289,1.072726


In [174]:
y

0      Non-Risk
1      Non-Risk
2      Non-Risk
3      Non-Risk
4      Non-Risk
         ...   
995    Non-Risk
996    Non-Risk
997        Risk
998    Non-Risk
999    Non-Risk
Name: Risk, Length: 1000, dtype: object

In [175]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [177]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [178]:
rf_clf = RandomForestClassifier(random_state=42)
log_reg_clf = LogisticRegression(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

In [179]:
ensemble_clf = VotingClassifier(estimators=[
    ('rf', rf_clf),
    ('log_reg', log_reg_clf),
    ('gb', gb_clf)
], voting='hard') 

In [180]:
ensemble_clf.fit(X_train, y_train)

In [181]:
y_pred = ensemble_clf.predict(X_test)

In [182]:
accuracy = accuracy_score(y_test, y_pred)

In [183]:
class_report = classification_report(y_test, y_pred)

In [184]:
conf_matrix = confusion_matrix(y_test, y_pred)

In [185]:
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(class_report)

Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       199
           1       0.00      0.00      0.00         1

    accuracy                           0.98       200
   macro avg       0.50      0.49      0.49       200
weighted avg       0.99      0.98      0.98       200



In [186]:
conf_matrix

array([[196,   3],
       [  1,   0]], dtype=int64)

In [187]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


Accuracy: 98.0
Precision: 0.0
Recall: 0.0
F1-Score: 0.0


In [188]:
import pickle
with open('ensemble_model.pkl', 'wb') as f:
    pickle.dump(ensemble_clf, f)
