# Создание модели оценки кредитоспособности заемщиков

Сначала составим модель оценки на меньшем файле, затем опробуем ее на большем файле

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

Сначала рассмотрим меньший файл

In [2]:
small_banks = pd.read_csv('A:\\Python\\Datasets\\Banks\\bank.csv', sep=';')

In [3]:
small_banks.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


Рассмотрим все уникальные значения каждого атрибута

In [4]:
age_ALL = np.asarray(sorted(small_banks.age.unique()))

In [5]:
age_ALL

array([19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
       70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 86, 87],
      dtype=int64)

In [6]:
job_ALL = np.asarray(sorted(small_banks.job.unique()))

In [7]:
job_ALL

array(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'], dtype='<U13')

In [8]:
marital_ALL = np.asarray(sorted(small_banks.marital.unique()))

In [9]:
marital_ALL

array(['divorced', 'married', 'single'], dtype='<U8')

In [10]:
education_ALL = np.asarray(sorted(small_banks.education.unique()))

In [11]:
education_ALL

array(['primary', 'secondary', 'tertiary', 'unknown'], dtype='<U9')

In [12]:
default_ALL = np.asarray(sorted(small_banks.default.unique()))

In [13]:
default_ALL

array(['no', 'yes'], dtype='<U3')

In [14]:
balance_ALL = np.asarray(sorted(small_banks.balance.unique()))

In [15]:
balance_ALL

array([-3313, -2082, -1746, ..., 27733, 42045, 71188], dtype=int64)

In [16]:
housing_ALL = np.asarray(sorted(small_banks.housing.unique()))

In [17]:
housing_ALL

array(['no', 'yes'], dtype='<U3')

In [18]:
loan_ALL = np.asarray(sorted(small_banks.loan.unique()))

In [19]:
loan_ALL

array(['no', 'yes'], dtype='<U3')

In [20]:
contact_ALL = np.asarray(sorted(small_banks.contact.unique()))

In [21]:
contact_ALL

array(['cellular', 'telephone', 'unknown'], dtype='<U9')

In [22]:
day_ALL = np.asarray(sorted(small_banks.day.unique()))

In [23]:
day_ALL

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [24]:
month_ALL = np.asarray(sorted(small_banks.month.unique()))

In [25]:
month_ALL

array(['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may',
       'nov', 'oct', 'sep'], dtype='<U3')

In [26]:
duration_ALL = np.asarray(sorted(small_banks.duration.unique()))

In [27]:
duration_ALL

array([   4,    5,    6,    7,    8,    9,   10,   11,   12,   13,   14,
         15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25,
         26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,
         37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,
         48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,
         59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,
         70,   71,   72,   73,   74,   75,   76,   77,   78,   79,   80,
         81,   82,   83,   84,   85,   86,   87,   88,   89,   90,   91,
         92,   93,   94,   95,   96,   97,   98,   99,  100,  101,  102,
        103,  104,  105,  106,  107,  108,  109,  110,  111,  112,  113,
        114,  115,  116,  117,  118,  119,  120,  121,  122,  123,  124,
        125,  126,  127,  128,  129,  130,  131,  132,  133,  134,  135,
        136,  137,  138,  139,  140,  141,  142,  143,  144,  145,  146,
        147,  148,  149,  150,  151,  152,  153,  1

In [28]:
campaign_ALL = np.asarray(sorted(small_banks.campaign.unique()))

In [29]:
campaign_ALL

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 28, 29, 30, 31, 32, 44, 50],
      dtype=int64)

In [30]:
pdays_ALL = np.asarray(sorted(small_banks.pdays.unique()))

In [31]:
pdays_ALL

array([ -1,   1,   2,   3,   5,   7,  28,  38,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  69,  73,  74,  75,  76,  77,  78,  79,  80,
        81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       122, 123, 124, 126, 127, 130, 131, 133, 135, 136, 137, 138, 139,
       140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
       154, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 195, 196, 197,
       198, 199, 201, 204, 205, 206, 207, 208, 209, 210, 211, 212, 219,
       221, 222, 223, 224, 225, 227, 231, 232, 234, 235, 238, 239, 241,
       242, 244, 246, 247, 248, 249, 250, 253, 254, 255, 256, 258, 259,
       260, 261, 262, 264, 265, 266, 267, 268, 270, 271, 272, 27

In [32]:
previous_ALL = np.asarray(sorted(small_banks.previous.unique()))

In [33]:
previous_ALL

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17,
       18, 19, 20, 22, 23, 24, 25], dtype=int64)

In [34]:
poutcome_ALL = np.asarray(sorted(small_banks.poutcome.unique()))

In [35]:
poutcome_ALL

array(['failure', 'other', 'success', 'unknown'], dtype='<U7')

In [36]:
y_ALL = np.asarray(sorted(small_banks.y.unique()))

In [37]:
y_ALL

array(['no', 'yes'], dtype='<U3')

Для всех строковых атрибутов определим количественную оценку, и определим значение корреляции всех с результирующим атрибутом

In [38]:
dict = {'name':[], 
        'result': []} 
 
df_corr = pd.DataFrame(dict)

In [39]:
y_0 = small_banks.y
y_0 = y_0.replace({'no': 0, 
                   'yes': 1})
y_0.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [76]:
small_banks.y.count()

4521

In [79]:
small_banks.query("y == 'no'").y.count()

4000

In [80]:
small_banks.query("y == 'yes'").y.count()

521

In [40]:
age_0 = small_banks.age
age_0 = np.asarray(age_0)
age_0 = age_0.reshape(-1, 1)
model_1 = LinearRegression().fit(age_0, y_0)
model_1_score = model_1.score(age_0, y_0)
model_1_score
df_corr = df_corr.append({'name':'age', 
                          'result': model_1_score}, ignore_index=True)

In [41]:
job_0 = small_banks.job
job_0 = job_0.replace({'unknown': 1, 
                       'unemployed': 4,
                       'student': 2,
                       'housemaid': 4,
                       'retired': 0,
                       'self-employed': 4,
                       'entrepreneur': 5,
                       'admin.': 4,
                       'blue-collar': 6,
                       'management': 3,
                       'services': 5,
                       'technician': 4})
job_0 = np.asarray(job_0)
job_0 = job_0.reshape(-1, 1)
model_2 = LinearRegression().fit(job_0, y_0)
model_2_score = model_2.score(job_0, y_0)
model_2_score
df_corr = df_corr.append({'name':'job', 
                          'result': model_2_score}, ignore_index=True)

In [42]:
marital_0 = small_banks.marital
marital_0 = marital_0.replace({'divorced': 0, 
                               'married': 1,
                               'single': 0})
marital_0 = np.asarray(marital_0)
marital_0 = marital_0.reshape(-1, 1)
model_3 = LinearRegression().fit(marital_0, y_0)
model_3_score = model_3.score(marital_0, y_0)
model_3_score
df_corr = df_corr.append({'name':'marital', 
                          'result': model_3_score}, ignore_index=True)

In [43]:
education_0 = small_banks.education
education_0 = education_0.replace({'primary': 1, 
                                   'secondary': 1,
                                   'tertiary': 3,
                                   'unknown': 0}) 
education_0 = np.asarray(education_0)
education_0 = education_0.reshape(-1, 1)
model_4 = LinearRegression().fit(education_0, y_0)
model_4_score = model_4.score(education_0, y_0)
model_4_score
df_corr = df_corr.append({'name':'education', 
                          'result': model_4_score}, ignore_index=True)

In [44]:
default_0 = small_banks.default
default_0 = default_0.replace({'no': 0, 
                               'yes': 1})
default_0 = np.asarray(default_0)
default_0 = default_0.reshape(-1, 1)
model_5 = LinearRegression().fit(default_0, y_0)
model_5_score = model_5.score(default_0, y_0)
model_5_score
df_corr = df_corr.append({'name':'default', 
                          'result': model_5_score}, ignore_index=True)

In [45]:
balance_0 = small_banks.balance
balance_0 = np.asarray(balance_0)
balance_0 = balance_0.reshape(-1, 1)
model_6 = LinearRegression().fit(balance_0, y_0)
model_6_score = model_6.score(balance_0, y_0)
model_6_score
df_corr = df_corr.append({'name':'balance', 
                          'result': model_6_score}, ignore_index=True)

In [46]:
housing_0 = small_banks.housing
housing_0 = housing_0.replace({'no': 0, 
                               'yes': 1})
housing_0 = np.asarray(housing_0)
housing_0 = housing_0.reshape(-1, 1)
model_7 = LinearRegression().fit(housing_0, y_0)
model_7_score = model_7.score(housing_0, y_0)
model_7_score
df_corr = df_corr.append({'name':'housing', 
                          'result': model_7_score}, ignore_index=True)

In [47]:
loan_0 = small_banks.loan
loan_0 = loan_0.replace({'no': 0, 
                         'yes': 1})
loan_0 = np.asarray(loan_0)
loan_0 = loan_0.reshape(-1, 1)
model_8 = LinearRegression().fit(loan_0, y_0)
model_8_score = model_8.score(loan_0, y_0)
model_8_score
df_corr = df_corr.append({'name':'loan', 
                          'result': model_8_score}, ignore_index=True)

In [48]:
contact_0 = small_banks.contact
contact_0 = contact_0.replace({'cellular': 1, 
                               'telephone': 1,
                               'unknown': 0})
contact_0 = np.asarray(contact_0)
contact_0 = contact_0.reshape(-1, 1)
model_9 = LinearRegression().fit(contact_0, y_0)
model_9_score = model_9.score(contact_0, y_0)
model_9_score
df_corr = df_corr.append({'name':'contact', 
                          'result': model_9_score}, ignore_index=True)

In [49]:
day_0 = small_banks.day
day_0 = np.asarray(day_0)
day_0 = day_0.reshape(-1, 1)
model_10 = LinearRegression().fit(day_0, y_0)
model_10_score = model_10.score(day_0, y_0)
model_10_score
df_corr = df_corr.append({'name':'day', 
                          'result': model_10_score}, ignore_index=True)

In [50]:
month_0 = small_banks.month
month_0 = month_0.replace({'jan': 1, 
                           'feb': 2,
                           'mar': 3,
                           'apr': 4,
                           'may': 5,
                           'jun': 6,
                           'jul': 7,
                           'aug': 8,
                           'sep': 9,
                           'oct': 10,
                           'nov': 11,
                           'dec': 12}) 
month_0 = np.asarray(month_0)
month_0 = month_0.reshape(-1, 1)
model_11 = LinearRegression().fit(month_0, y_0)
model_11_score = model_11.score(month_0, y_0)
model_11_score
df_corr = df_corr.append({'name':'month', 
                          'result': model_11_score}, ignore_index=True)

In [51]:
duration_0 = small_banks.duration
duration_0 = np.asarray(duration_0)
duration_0 = duration_0.reshape(-1, 1)
model_12 = LinearRegression().fit(duration_0, y_0)
model_12_score = model_12.score(duration_0, y_0)
model_12_score
df_corr = df_corr.append({'name':'duration', 
                          'result': model_12_score}, ignore_index=True)

In [52]:
campaign_0 = small_banks.campaign
campaign_0 = np.asarray(campaign_0)
campaign_0 = campaign_0.reshape(-1, 1)
model_13 = LinearRegression().fit(campaign_0, y_0)
model_13_score = model_13.score(campaign_0, y_0)
model_13_score
df_corr = df_corr.append({'name':'campaign', 
                          'result': model_13_score}, ignore_index=True)

In [53]:
pdays_0 = small_banks.pdays
pdays_0 = np.asarray(pdays_0)
pdays_0 = pdays_0.reshape(-1, 1)
model_14 = LinearRegression().fit(pdays_0, y_0)
model_14_score = model_14.score(pdays_0, y_0)
model_14_score
df_corr = df_corr.append({'name':'pdays', 
                          'result': model_14_score}, ignore_index=True)

In [54]:
previous_0 = small_banks.previous
previous_0 = np.asarray(previous_0)
previous_0 = previous_0.reshape(-1, 1)
model_15 = LinearRegression().fit(previous_0, y_0)
model_15_score = model_15.score(previous_0, y_0)
model_15_score
df_corr = df_corr.append({'name':'previous', 
                          'result': model_15_score}, ignore_index=True)

In [55]:
poutcome_0 = small_banks.poutcome
poutcome_0 = poutcome_0.replace({'failure': 0, 
                                 'other': 1,
                                 'success': 6,
                                 'unknown': 0})
poutcome_0 = np.asarray(poutcome_0)
poutcome_0 = poutcome_0.reshape(-1, 1)
model_16 = LinearRegression().fit(poutcome_0, y_0)
model_16_score = model_16.score(poutcome_0, y_0)
model_16_score
df_corr = df_corr.append({'name':'poutcome', 
                          'result': model_16_score}, ignore_index=True)

In [56]:
df_corr = df_corr.sort_values(by=['result'], ascending=False)
df_corr

Unnamed: 0,name,result
11,duration,0.160896
15,poutcome,0.08423
8,contact,0.019432
1,job,0.014043
14,previous,0.013622
6,housing,0.010959
13,pdays,0.010834
7,loan,0.004973
2,marital,0.004179
12,campaign,0.003739


В качестве признаков последующей модели выберем только атрибуты, значение корреляции которых превысило установленный порог: 1%

Таких признаков 7:

1) duration

2) poutcome

3) contact

4) job

5) previous

6) housing

7) pdays

Остальные признаки отбрасываем как малозначащие

In [57]:
duration_f = small_banks.duration
poutcome_f = small_banks.poutcome
poutcome_f = poutcome_f.replace({'failure': 0, 
                                 'other': 1,
                                 'success': 6,
                                 'unknown': 0})
contact_f = small_banks.contact
contact_f = contact_f.replace({'cellular': 1, 
                               'telephone': 1,
                               'unknown': 0})
job_f = small_banks.job
job_f = job_f.replace({'unknown': 1, 
                       'unemployed': 4,
                       'student': 2,
                       'housemaid': 4,
                       'retired': 0,
                       'self-employed': 4,
                       'entrepreneur': 5,
                       'admin.': 4,
                       'blue-collar': 6,
                       'management': 3,
                       'services': 5,
                       'technician': 4})
previous_f = small_banks.previous
housing_f = small_banks.housing
housing_f = housing_f.replace({'no': 0, 
                               'yes': 1})
pdays_f = small_banks.pdays
y_f = small_banks.y
y_f = y_f.replace({'no': 0, 
                   'yes': 1})

In [58]:
final_dict = {'duration': duration_f, 
              'poutcome': poutcome_f,
              'contact': contact_f,
              'job': job_f,
              'previous': previous_f,
              'housing': housing_f,
              'pdays': pdays_f,
              'y':y_f} 
 
df_final = pd.DataFrame(final_dict)
df_final.head()

Unnamed: 0,duration,poutcome,contact,job,previous,housing,pdays,y
0,79,0,1,4,0,0,-1,0
1,220,0,1,5,4,1,339,0
2,185,0,1,3,1,1,330,0
3,199,0,0,3,0,1,-1,0
4,226,0,0,6,0,1,-1,0


Воспользуемся методом RandomForest и получим оценку модели

In [59]:
X_f = df_final.drop(['y'],axis=1)
X_f.head()

Unnamed: 0,duration,poutcome,contact,job,previous,housing,pdays
0,79,0,1,4,0,0,-1
1,220,0,1,5,4,1,339
2,185,0,1,3,1,1,330
3,199,0,0,3,0,1,-1
4,226,0,0,6,0,1,-1


In [60]:
y_f = df_final.y
y_f.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [61]:
clf_rf = RandomForestClassifier(criterion='entropy',random_state=0)

In [62]:
parametrs = {'n_estimators':range(10,50,10),
             'max_depth':range(1,12,2),
             'min_samples_leaf':range(1,7),
             'min_samples_split':range(2,9,2)}

In [63]:
grid_search_cv_clf = GridSearchCV(clf_rf,
                                  parametrs,
                                  cv=3,
                                  n_jobs=-1)

In [64]:
grid_search_cv_clf.fit(X_f,y_f)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid

In [65]:
grid_search_cv_clf.best_params_

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 30}

In [69]:
best_clf = grid_search_cv_clf.best_estimator_
best_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [71]:
feature_importances = best_clf.feature_importances_
feature_importances

array([0.61285231, 0.10233155, 0.0494665 , 0.06638005, 0.04778924,
       0.02762784, 0.09355251])

In [73]:
feature_importances_df = pd.DataFrame({'features':list(X_f),'feature_importances':feature_importances})
feature_importances_df.sort_values('feature_importances',ascending=False)
feature_importances_df

Unnamed: 0,features,feature_importances
0,duration,0.612852
1,poutcome,0.102332
2,contact,0.049466
3,job,0.06638
4,previous,0.047789
5,housing,0.027628
6,pdays,0.093553


In [81]:
best_clf.score(X_f,y_f)

0.9259013492590135

Воспроизведем полученную модель на большем файле и сравним результаты

In [99]:
big_banks = pd.read_csv('A:\\Python\\Datasets\\Banks\\bank-full.csv', sep=';')
big_banks.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [100]:
duration_t = big_banks.duration
poutcome_t = big_banks.poutcome
poutcome_t = poutcome_t.replace({'failure': 0, 
                                 'other': 1,
                                 'success': 6,
                                 'unknown': 0})
contact_t = big_banks.contact
contact_t = contact_t.replace({'cellular': 1, 
                               'telephone': 1,
                               'unknown': 0})
job_t = big_banks.job
job_t = job_t.replace({'unknown': 1, 
                       'unemployed': 4,
                       'student': 2,
                       'housemaid': 4,
                       'retired': 0,
                       'self-employed': 4,
                       'entrepreneur': 5,
                       'admin.': 4,
                       'blue-collar': 6,
                       'management': 3,
                       'services': 5,
                       'technician': 4})
previous_t = big_banks.previous
housing_t = big_banks.housing
housing_t = housing_t.replace({'no': 0, 
                               'yes': 1})
pdays_t = big_banks.pdays
y_t = big_banks.y
y_t = y_t.replace({'no': 0, 
                   'yes': 1})

In [101]:
test_dict = {'duration': duration_t, 
              'poutcome': poutcome_t,
              'contact': contact_t,
              'job': job_t,
              'previous': previous_t,
              'housing': housing_t,
              'pdays': pdays_t,
              'y':y_t} 
 
df_test = pd.DataFrame(test_dict)
df_test.head()

Unnamed: 0,duration,poutcome,contact,job,previous,housing,pdays,y
0,261,0,0,3,0,1,-1,0
1,151,0,0,4,0,1,-1,0
2,76,0,0,5,0,1,-1,0
3,92,0,0,6,0,1,-1,0
4,198,0,0,1,0,0,-1,0


In [102]:
X_t = df_test.drop(['y'],axis=1)
X_t.head()

Unnamed: 0,duration,poutcome,contact,job,previous,housing,pdays
0,261,0,0,3,0,1,-1
1,151,0,0,4,0,1,-1
2,76,0,0,5,0,1,-1
3,92,0,0,6,0,1,-1
4,198,0,0,1,0,0,-1


In [103]:
y_t = df_test.y
y_t.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [105]:
best_clf.score(X_t,y_t)

0.9028997367897194

Модель показала себя приемлимо, на большом тестовом датасете точность оказалась всего на 2% меньше, чем на обучающем.