In [1]:
import pandas as pd
# Load the model inputs and the labels from CSV; the first column of each file
# is used as the DataFrame index.
# NOTE(review): the feature file is suffixed "_e3" and the target file "_e2" --
# confirm both were exported from the same set of rows.
feats = pd.read_csv(
    'data/bank_data_feats_e3.csv',
    index_col=0,
)
target = pd.read_csv(
    'data/bank_data_target_e2.csv',
    index_col=0,
)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
test_size, random_state = 0.2, 42
X_train, X_test, y_train, y_test = train_test_split(
    feats,
    target,
    test_size=test_size,
    random_state=random_state,
)

In [6]:
# Report the (rows, columns) shape of each split, in the order
# X_train, y_train, X_test, y_test.
for label, frame in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'Shape of {label}: {frame.shape}')

Shape of X_train: (3616, 32)
Shape of y_train: (3616, 1)
Shape of X_test: (905, 32)
Shape of y_test: (905, 1)


Model Creation

In [7]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier left at scikit-learn's default iteration cap (max_iter=100).
# The seed reuses `random_state` from the train/test split cell (42).
model = LogisticRegression(random_state=random_state)

In [8]:
# Fit on the training split, using the 'y' column of the label frame as the target.
model.fit(X=X_train, y=y_train['y'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# The warning above means the solver did not converge within the default
# max_iter of 100, so rebuild the model with a much higher iteration cap and refit.
model = LogisticRegression(max_iter=3000, random_state=random_state)
model.fit(X=X_train, y=y_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=3000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Predict class labels for the held-out test rows.
y_pred = model.predict(X=X_test)

우선 모델의 정확도(accuracy)를 확인해본다.

In [20]:
from sklearn import metrics
# Fraction of test rows whose predicted label equals the true label,
# printed as a percentage with four decimals.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is {accuracy*100:.4f}%')

Accuracy of the model is 89.9448%


In [21]:
# Precision, recall and F-score computed with average='binary';
# the fourth return value is discarded.
precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
    y_test,
    y_pred,
    average='binary',
)
print(
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'fscore: {fscore:.4f}',
    sep='\n',
)

Precision: 0.5814
Recall: 0.2551
fscore: 0.3546


모델의 계수도 출력해서 전체 예측 결과에 어느 특성이 많은 영향을 주는지 알아볼 수 있다. 단, 특성들을 스케일링하지 않았기 때문에 계수의 크기는 각 특성의 단위에 따라 달라지므로(예: balance), 계수끼리 크기를 직접 비교할 때는 주의해야 한다.

In [27]:
# Pair every fitted coefficient with its column name, order the pairs from the
# most negative to the most positive coefficient, and print one "name: value"
# line per feature.
feature_names = X_train.columns.values.tolist()
ranked_pairs = sorted(zip(model.coef_[0], feature_names))
coef_list = [f'{feature}: {coef}' for coef, feature in ranked_pairs]
for item in coef_list:
    print(item)

poutcome_failure: -0.8887885077345801
is_loan: -0.7543614757855233
is_housing: -0.6985034352802764
job_entrepreneur: -0.6977513169619206
marital_married: -0.6459581623611642
job_blue-collar: -0.5523232101807054
education_primary: -0.4873389259197728
job_unemployed: -0.46877339684922203
marital_single: -0.46868083393674775
job_services: -0.43913926271864206
job_technician: -0.369200120743572
education_secondary: -0.2910939848047349
job_self-employed: -0.23880513826041697
job_admin.: -0.14697574260964213
job_housemaid: -0.14462614759320988
poutcome_other: -0.0850959092896899
job_management: -0.08318407799852068
campaign: -0.07232422373806496
education_tertiary: -0.059217131898586184
month: -0.032285104298002784
age: -0.007324223771700347
day: -0.00014715726227599454
balance: 3.115923462606925e-06
duration: 0.004113621895246346
previous: 0.018878357100397824
job_student: 0.11498644302778163
is_default: 0.24493814661514385
contact_telephone: 0.693388488327777
job_retired: 0.771610779835141