In [224]:
import numpy as np
import pandas as pd
from category_encoders.binary import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [225]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

In [261]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Home Ownership,Annual Income,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,11.0,26.3,685960.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,15.0,15.3,1181730.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,11.0,35.0,1182434.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,8.0,22.5,147400.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,13.0,13.6,385836.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7495,Rent,402192.0,3.0,8.5,107866.0,other,Short Term,129360.0,73492.0,1900.0,697.0,0
7496,Home Mortgage,1533984.0,10.0,26.5,686312.0,debt consolidation,Long Term,444048.0,456399.0,12783.0,7410.0,1
7497,Rent,1878910.0,12.0,32.1,1778920.0,buy a car,Short Term,99999999.0,477812.0,12479.0,748.0,0
7498,Home Mortgage,,21.0,26.5,1141250.0,debt consolidation,Short Term,615274.0,476064.0,37118.0,,0


In [262]:
cat_features = ['Home Ownership', 'Purpose']

In [263]:
encoder = BinaryEncoder()
df = pd.concat((encoder.fit_transform(df[cat_features]), df.drop(cat_features, axis=1)), axis=1)
df['Term'] = np.where(df['Term'] == 'Short Term', 0, 1)

In [264]:
df

Unnamed: 0,Home Ownership_0,Home Ownership_1,Home Ownership_2,Purpose_0,Purpose_1,Purpose_2,Purpose_3,Annual Income,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,0,0,1,0,0,0,1,482087.0,11.0,26.3,685960.0,0,99999999.0,47386.0,7914.0,749.0,0
1,0,0,1,0,0,0,1,1025487.0,15.0,15.3,1181730.0,1,264968.0,394972.0,18373.0,737.0,1
2,0,1,0,0,0,0,1,751412.0,11.0,35.0,1182434.0,0,99999999.0,308389.0,13651.0,742.0,0
3,0,0,1,0,0,0,1,805068.0,8.0,22.5,147400.0,0,121396.0,95855.0,11338.0,694.0,0
4,0,1,1,0,0,0,1,776264.0,13.0,13.6,385836.0,0,125840.0,93309.0,7180.0,719.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0,1,1,0,0,1,0,402192.0,3.0,8.5,107866.0,0,129360.0,73492.0,1900.0,697.0,0
7496,0,1,0,0,0,0,1,1533984.0,10.0,26.5,686312.0,1,444048.0,456399.0,12783.0,7410.0,1
7497,0,1,1,0,1,0,1,1878910.0,12.0,32.1,1778920.0,0,99999999.0,477812.0,12479.0,748.0,0
7498,0,1,0,0,0,0,1,,21.0,26.5,1141250.0,0,615274.0,476064.0,37118.0,,0


In [265]:
df.loc[df['Annual Income'].isna(), 'Annual Income'] = df['Annual Income'].median()
df.loc[df['Credit Score'].isna(), 'Credit Score'] = df['Credit Score'].median()

In [266]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], random_state=10)

In [267]:
model = LogisticRegression(random_state=10)
model.fit(X_train, y_train)
predict = model.predict(X_test)
evaluate_results(y_test, predict)

Classification results:
f1: 31.41%
roc: 58.98%
recall: 19.17%
precision: 86.89%


In [276]:
indexes = np.random.choice(df[df.iloc[:, -1] == 1].index, size=int(df[df.iloc[:, -1] == 1].shape[0]*0.25), replace=False)

In [277]:
df['class_test'] = -1
df.loc[indexes, 'class_test'] = 1

In [278]:
neg_sample = df[df['class_test'] == -1][:(df['class_test'] == 1).sum()]
sample_test = df[df['class_test'] == -1][(df['class_test'] == 1).sum():]
pos_sample = df[df['class_test'] == 1]
sample_train = pd.concat([neg_sample, pos_sample])

In [279]:
new_model = LogisticRegression(random_state=10)
new_model.fit(sample_train.iloc[:, :-2], sample_train.iloc[:, -2])
predict_new = model.predict(sample_test.iloc[:,:-2])
evaluate_results(sample_test.iloc[:,-2], predict_new)

Classification results:
f1: 33.09%
roc: 59.57%
recall: 20.55%
precision: 84.77%


In [242]:
# precision снизился, но все остальные метрики выросли