In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")

# Peer-graded Assignment: Построение baseline-решений

Обучите 3 разные baseline-модели на полученных наборах данных и оцените их качество. На прошлой неделе вы выбрали методику оценки качества моделей на основе кросс-валидации, а также основную и вспомогательные метрики. Оцените с их помощью получившуюся модель. Обратите внимание, что под разными моделями понимаются именно разные алгоритмы классификации. Например, 2 модели, реализующие метод k ближайших соседей с разными k, будут считаться одним baseline-решением (хотя и с разными параметрами). Напоминаем, что отложенная выборка (hold-out dataset) не должна использоваться для построения и оценки baseline-моделей!

Можно (но не обязательно) рассмотреть следующий набор алгоритмов:

* Линейная модель (например, реализация sklearn.linear_model.RidgeClassifier)
* Случайный лес (например, реализация sklearn.ensemble.RandomForestClassifier)
* Градиентный бустинг (например, реализация sklearn.ensemble.GradientBoostingClassifier)

In [3]:
# Загрузим данные  
data = pd.read_csv('../Data/train_dataset.csv')
data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,label
0,,,,,,819.0,7.0,,,,...,z9ub4Lm,LM8l689qOp,,,5Acm,vJ_w8kB,WfJ2BB2SFSqauljlfOB,,,-1
1,,,,,,2401.0,21.0,,,,...,v5hz20V,LM8l689qOp,,kG3k,FSa2,RAYp,55YFVY9,mj86,,-1
2,,,,,,343.0,0.0,,,,...,20HE4Qn,LM8l689qOp,,,Xa3G,RAYp,F2FyR07IdsN7I,,,-1
3,,,,,,826.0,7.0,,,,...,4XQyovK,LM8l689qOp,,ELof,453m,RAYp,F2FyR07IdsN7I,,,-1
4,,,,,3960.0,,,,,1641096.0,...,LTMqFbB,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,,-1


1.1 Обработка данных 

In [4]:
# Посмотрим, насколько велика доля пропусков в каждой из переменных
for var in data.columns:
    missed_rate = data[var].isna().sum()/data.shape[0]
    if missed_rate >= 0.95:
        print(var,'missed rate: ',missed_rate)

Var1 missed rate:  0.9851666666666666
Var2 missed rate:  0.9742333333333333
Var3 missed rate:  0.9742666666666666
Var4 missed rate:  0.9695
Var5 missed rate:  0.9715
Var8 missed rate:  1.0
Var9 missed rate:  0.9851666666666666
Var10 missed rate:  0.9715
Var11 missed rate:  0.9742666666666666
Var12 missed rate:  0.9878666666666667
Var14 missed rate:  0.9742666666666666
Var15 missed rate:  1.0
Var16 missed rate:  0.9715
Var17 missed rate:  0.9695
Var18 missed rate:  0.9695
Var19 missed rate:  0.9695
Var20 missed rate:  1.0
Var23 missed rate:  0.9715
Var26 missed rate:  0.9715
Var27 missed rate:  0.9715
Var29 missed rate:  0.9851666666666666
Var30 missed rate:  0.9851666666666666
Var31 missed rate:  1.0
Var32 missed rate:  1.0
Var33 missed rate:  0.9842
Var34 missed rate:  0.9742333333333333
Var36 missed rate:  0.9742333333333333
Var37 missed rate:  0.9695
Var39 missed rate:  1.0
Var40 missed rate:  0.9742333333333333
Var41 missed rate:  0.9851666666666666
Var42 missed rate:  1.0
Var43 mi

In [5]:
# Удалим все столбцы, состоящие только из NaN
deleted = []
for var in data.columns:
    if data[var].isna().sum() == data.shape[0]:
        data.drop(var, axis=1,inplace=True)
        deleted.append(var)
# В категориальных признаках заменим пропуски на новую категорию
data.loc[:,'Var191':] = data.loc[:,'Var191':].fillna('missing_value')
np.array(deleted)

array(['Var8', 'Var15', 'Var20', 'Var31', 'Var32', 'Var39', 'Var42',
       'Var48', 'Var52', 'Var55', 'Var79', 'Var141', 'Var167', 'Var169',
       'Var175', 'Var185', 'Var209', 'Var230'], dtype='<U6')

In [6]:
data.Var57.isna().sum() # признак без пропусков - пригодится позднее

0

In [7]:
# Проверим есть ли значение 0.0001 в данных, чтобы в дальнейшем заполнить им пропуски 
0.0001 in data.loc[:, 'Var1':'Var190'].values

False

In [8]:
data.loc[:, 'Var1':'Var190'] = data.loc[:, 'Var1':'Var190'].fillna(0.0001)

In [9]:
# Создадим отдельный датасет и заполним пропуски нулями
# Это нужно для того, чтобы в дальнейшем считать корреляцию на данном датасете
# X_numeric = data.loc[:, 'Var1':'Var190'].copy().fillna(0.0001)
# X_numeric = pd.concat([X_numeric,data['label']], axis=1)

In [10]:
# Посмотрим, сколько уникальных значений в каждой из категориальных переменных
# Не будем учитывать в корреляции признаки, в которых больше 500 категорий
# (рекомендация с форума, число выбрано интуитивно на основе представленных ниже данных)
# for var in data.loc[:,'Var191':'Var229'].columns:
#     print(var, '- unique values -',len(data[var].unique()))

In [11]:
# Выведем соотношение классов
not_churn = data.label.value_counts()[-1]
churn = data.label.value_counts()[1]

print('Доля класса "отток": ', churn/data.shape[0])
print('Доля класса "не отток": ', not_churn/data.shape[0])

Доля класса "отток":  0.073
Доля класса "не отток":  0.927


Были выбраны несколько стратегий преобразования категориальных переменных, поэтому попробуем каждую из них


In [45]:
# Создадим датасеты
cat_features = data.loc[:,'Var191':]
numeric_features = data.loc[:, 'Var1':'Var190']
y = data['label']

In [49]:
# Зададим параметры модели
scorings = ['precision', 'recall','f1','roc_auc']
cv = StratifiedKFold(n_splits=5)

# OrdinalEncoder

In [47]:
OE_enc = OrdinalEncoder()
OE_enc.fit(cat_features)
oe_features = pd.DataFrame(OE_enc.transform(cat_features), columns = cat_features.columns)
X_oe = pd.concat([numeric_features,oe_features], axis=1)
X_oe.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,label
0,0.0001,0.0001,0.0001,0.0001,0.0001,819.0,7.0,0.0001,0.0001,0.0001,...,2.0,3344.0,0.0,1.0,2.0,2.0,6.0,16.0,1.0,0.0
1,0.0001,0.0001,0.0001,0.0001,0.0001,2401.0,21.0,0.0001,0.0001,0.0001,...,4.0,3170.0,0.0,1.0,1.0,7.0,2.0,2.0,2.0,0.0
2,0.0001,0.0001,0.0001,0.0001,0.0001,343.0,0.0,0.0001,0.0001,0.0001,...,4.0,86.0,0.0,1.0,2.0,13.0,2.0,8.0,1.0,0.0
3,0.0001,0.0001,0.0001,0.0001,0.0001,826.0,7.0,0.0001,0.0001,0.0001,...,4.0,208.0,0.0,1.0,0.0,1.0,2.0,8.0,1.0,0.0
4,0.0001,0.0001,0.0001,0.0001,3960.0,0.0001,0.0001,0.0001,1641096.0,0.0001,...,4.0,1156.0,0.0,1.0,2.0,7.0,2.0,8.0,1.0,0.0


In [53]:
lr_model = LogisticRegression()
scores_oe = cross_validate(lr_model, X_oe, y, cv=cv, scoring=scorings)



In [57]:
print('Средние значения метрик при кодировке по числу значений категории \nЛогистическая Регрессия\n')
for key in scores_oe.keys():
    print(key, ':',scores_oe[key].mean())

fit_time : 22.641246557235718
score_time : 0.14193601608276368
test_precision : 0.2523847167325428
test_recall : 0.005479452054794521
test_f1 : 0.010576387125884604
test_roc_auc : 0.5224952753436151


# Кодировка по числу значений

In [58]:
X_count_enc = pd.DataFrame()
for feat in cat_features.columns:
    X_dict = Counter(cat_features[feat])
    X_count_enc[feat] = cat_features[feat].map(X_dict) 
X_ce = pd.concat([numeric_features,X_count_enc], axis=1)
X_ce.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,label
0,0.0001,0.0001,0.0001,0.0001,0.0001,819.0,7.0,0.0001,0.0001,0.0001,...,993,2,21925,29507,15662,1303,373,167,17073,27810
1,0.0001,0.0001,0.0001,0.0001,0.0001,2401.0,21.0,0.0001,0.0001,0.0001,...,22251,6,21925,29507,6249,4775,21080,2558,5898,27810
2,0.0001,0.0001,0.0001,0.0001,0.0001,343.0,0.0,0.0001,0.0001,0.0001,...,22251,41,21925,29507,15662,839,21080,19681,17073,27810
3,0.0001,0.0001,0.0001,0.0001,0.0001,826.0,7.0,0.0001,0.0001,0.0001,...,22251,21,21925,29507,6643,1338,21080,19681,17073,27810
4,0.0001,0.0001,0.0001,0.0001,3960.0,0.0001,0.0001,0.0001,1641096.0,0.0001,...,22251,15,21925,29507,15662,4775,21080,19681,17073,27810


In [59]:
X_ce = pd.concat([numeric_features,X_count_enc], axis=1)

In [61]:
lr_model = LogisticRegression()
scores_ce = cross_validate(lr_model, X_ce, y, cv=cv, scoring=scorings)



In [66]:
print('Средние значения метрик при кодировке по числу значений категории \nЛогистическая Регрессия\n')
for key in scores_ce.keys():
    print(key, ':',scores_ce[key].mean())

Средние значения метрик при кодировке по числу значений категории 
Логистическая Регрессия

fit_time : 16.677666854858398
score_time : 0.06913142204284668
test_precision : 0.6738467023172905
test_recall : 0.28949771689497716
test_f1 : 0.341655698271794
test_roc_auc : 0.8631785485001782
