In [1]:
# 1. Основные библиотеки
import numpy as np
import pandas as pd
import pickle   # сохранение модели

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2. Разделение датасета
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# 3. Модели
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVR

# 4. Метрики качества
from sklearn.metrics import mean_squared_error as mse, r2_score as r2

# 5. Для визуализации внешних картинок в ноутбуке
from IPython.display import Image

**Пути к директориям и файлам**

In [2]:
TRAIN_DATASET_PATH = 'D:/repo/Python-Data-Science-2/course_project_train.csv'
TEST_DATASET_PATH = 'D:/repo/Python-Data-Science-2/course_project_test.csv'

**Загрузка тренировочных данных**

In [3]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_train.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


**Обзор номинативных/категориальных признаков**

In [4]:
for cat_colname in df_train.select_dtypes(include='object').columns:
    print(str(cat_colname) + '\n\n' + str(df_train[cat_colname].value_counts()) + '\n' + '*' * 100 + '\n')

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
****************************************************************************************************

Years in current job

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
****************************************************************************************************

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation  

### Обработка пропусков<a class="anchor" id="gaps"></a>

In [7]:
df_train.isna().sum()

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                       0
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

**Заполняем пропуски признака 'Bankruptcies' модой**

In [6]:
df_train.loc[pd.isnull(df_train['Bankruptcies']), 'Bankruptcies'] = df_train['Bankruptcies'].mode()[0]

**Переводим категории в числа**

In [8]:
df_train.replace({'Home Ownership':{'Home Mortgage':0, 'Rent':0.33, 'Own Home':0.66, 'Have Mortgage':1}, 
                  'Years in current job':{'10+ years':0, '2 years':0.1, '3 years':0.2, '< 1 year':0.3, '5 years':0.4, 
                                          '1 year':0.5, '4 years':0.6, '6 years':0.7, '7 years':0.8, '8 years':0.9, '9 years':1}, 
                  'Purpose':{'debt consolidation':0, 'other':0.07, 'home improvements':0.14, 'business loan':0.21, 'buy a car':0.28, 
                             'medical bills':0.35, 'major purchase':0.42, 'take a trip':0.5, 'buy house':0.57, 'small business':0.64, 
                             'wedding':0.71, 'moving':0.78, 'educational expenses':0.85, 'vacation':0.92, 'renewable energy':1}, 
                  'Term':{'Short Term':0, 'Long Term':1}}, inplace = True)

**Корректируем выбросы параметра 'Credit Score'**

Исправляем опечатки (убираем лишний ноль в младшем разряде)

In [None]:
#df_train.loc[df_train['Credit Score'] > 999, 'Credit Score'] = df_train['Credit Score']/10

**Разбиваем на датафреймы X и y на тренировку и валидацию**

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(['Annual Income', 
    'Months since last delinquent', 'Years in current job', 'Credit Score'], axis = 'columns'),
    df_train['Years in current job'], test_size = 0.2, random_state = 42)

### Нормализуем датасет<a class="anchor" id="outliers"></a>

In [12]:
cols_for_scale = ['Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit', 
                  'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt']

In [13]:
scaler = RobustScaler()

In [14]:
X_train[cols_for_scale] = scaler.fit_transform(X_train[cols_for_scale])

In [15]:
X_valid[cols_for_scale] = scaler.transform(X_valid[cols_for_scale])

### Классифицируем по KNN<a class="anchor" id="outliers"></a>

In [16]:
knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_valid)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Альтернативный вариант
k_vals = np.arange(2,10)

accuracy_valid = []
accuracy_train = []

for val in k_vals:
    knn = KNeighborsClassifier(n_neighbors=val)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_valid)
    y_pred_train = knn.predict(X_valid)
    acc_valid = accuracy_score(y_valid, y_pred)
    acc_train = accuracy_score(y_train, y_pred_train)
    accuracy_valid.append(acc_valid)
    accuracy_train.append(acc_train)
    print('n_neighbors = {} \n\t acc_valid = {} \n\t acc_train = {}\n'.format(val, acc_valid, acc_train))

In [None]:
accuracy y_pred train acc_valid