In [None]:
import warnings
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import figure

from category_encoders.binary import BinaryEncoder

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, make_scorer, precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [None]:
# Load the dataset; the file is semicolon-delimited, hence sep=';'.
df = pd.read_csv('bank.csv', sep=';')
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Shape after dropping every row that contains a NaN; equal to df.shape when nothing is missing.
df.dropna().shape

In [None]:
# Summary statistics of the raw frame (default describe() settings).
df.describe()

In [None]:
# Summary of the object-typed (string) columns only.
df.describe(include = ["object"])

In [None]:
# Absolute count of each value in column 'y' (used as the target further down).
df["y"].value_counts()

In [None]:
# Relative frequency (fractions summing to 1) of each value in 'marital'.
df["marital"].value_counts(normalize = True)

In [None]:
# Numeric columns drawn as box plots in the next cell.
numeric_columns = ('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous')

In [None]:
# Draw one box plot per numeric column on a grid that is `cols` panels wide.
cols = 3
rows = ceil(len(numeric_columns) / cols)

# squeeze=False keeps `axes` two-dimensional even when a single row is enough,
# so the layout does not break if the column list shrinks to `cols` or fewer.
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 10), squeeze=False)
flat_axes = axes.ravel()  # row-major order: same placement as axes[i // cols, i % cols]

for ax, column in zip(flat_axes, numeric_columns):
    df.boxplot(column=column, ax=ax)

# Hide the grid cells that have no column to show instead of leaving empty frames.
for ax in flat_axes[len(numeric_columns):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. The raw frame still holds
# string columns, which DataFrame.corr() refuses to process in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Distinct raw values of column 'y' before any encoding.
df['y'].unique()

In [None]:
# Column groups; each group is encoded differently in the cells that follow.
nominal_cols = ['job', 'marital', 'education', 'contact', 'poutcome']  # binary-encoded
binary_cols = ['default', 'housing', 'loan', 'y']  # label-encoded with one shared encoder
ordinal_cols = ['month']  # mapped to month numbers
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']  # standardized later

# Преобразование категориальных признаков

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the encoding steps.
df_encoded = df.copy()

### Преобразование бинарных признаков

In [None]:
# One encoder is fitted on the first binary column and reused for all of them,
# so every column in `binary_cols` shares the same label -> integer mapping.
# NOTE(review): this assumes all binary columns hold the same set of labels;
# transform() should raise on a label the encoder has not seen — confirm.
le = LabelEncoder().fit(df_encoded[binary_cols[0]])

for binary_col in binary_cols:
    df_encoded[binary_col] = le.transform(df_encoded[binary_col])

In [None]:
# Labels learned by the shared encoder (fitted on the first binary column).
le.classes_

### Преобразование порядковых признаков

In [None]:
# Month abbreviations in calendar order; the position (1-based) is the encoded value.
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_encoding = {name: number for number, name in enumerate(month_names, start=1)}

# Replace each abbreviation by its month number.
df_encoded['month'] = df_encoded['month'].map(month_encoding)

### Преобразование номинальных признаков

In [None]:
# Binary-encode the nominal columns; the fitted encoder is kept in `be`.
be = BinaryEncoder(cols=nominal_cols)
df_encoded = be.fit_transform(df_encoded)

#### Результат преобразования категориальных признаков в числовые:

In [None]:
# Display the fully encoded frame.
df_encoded

In [None]:
# Summary statistics of the encoded frame.
df_encoded.describe()

In [None]:
# Histograms of the numeric, ordinal and binary columns (the binary-encoded nominal columns are left out).
df_encoded[numeric_cols+ordinal_cols+binary_cols].hist(figsize=(30,30))

In [None]:
# Correlation heatmap over every column of the encoded frame.
corr = df_encoded.corr()
tick_labels = corr.columns

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

In [None]:

# Correlation heatmap restricted to the target and the non-nominal features.
heatmap_columns = ['y', 'age', 'default', 'balance', 'housing', 'loan',
                   'day', 'month', 'duration', 'campaign', 'pdays', 'previous']
corr = df_encoded[heatmap_columns].corr()
tick_labels = corr.columns

fig, ax = plt.subplots(figsize=(10, 7))  # figure size in inches
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

In [None]:
# List, for every column, the other columns it is strongly correlated with (|r| > 0.7).
# The matrix is computed once instead of once per column.
corr_matrix = df_encoded.corr()

for col in corr_matrix.columns:
    # Drop the column's correlation with itself by name. Comparing the value
    # against 1 would also hide a different column that is perfectly correlated.
    others = corr_matrix[col].drop(labels=col)
    cors = [(name, value) for name, value in others.items() if abs(value) > 0.7]
    if cors:
        print(col, cors)

In [None]:
# (column, correlation) pairs of every encoded column against the target 'y'.
list(df_encoded.corr()['y'].items())

In [None]:
# Histograms drawn from the raw (un-encoded) frame.
df.hist(figsize=(30,30))

In [None]:
# Bar chart of how many rows fall into each value of 'y' (visual check of class balance).
ax = sns.countplot(x = df["y"]) 
plt.show()

## Стандартизация

In [None]:
# Copy the encoded frame so standardization does not modify `df_encoded`.
df_scaled = df_encoded.copy()

In [None]:
# Standardize each numeric column independently with its own StandardScaler.
# NOTE(review): the scalers are fitted on the full data set before any
# train/validation split — confirm this is acceptable for the later CV scores.
for numeric_col in numeric_cols:
    scaler = StandardScaler()
    column_2d = df_scaled[[numeric_col]].to_numpy()  # shape (n_rows, 1), as the scaler expects
    df_scaled[numeric_col] = scaler.fit_transform(column_2d)

In [None]:
# Histograms of the numeric columns after standardization.
df_scaled[numeric_cols].hist(figsize=(30,30))

In [None]:
# Display the encoded and standardized frame.
df_scaled

## Обучение базовых моделей

In [None]:
# Baseline classifiers, all with default hyper-parameters.
# Their class names are used as keys into `estimators_params` during the search.
estimators = [
    SGDClassifier(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier()
]

In [None]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = df_scaled.drop(columns='y')
# Target vector.
y = df_scaled['y']

In [None]:
def get_estimators_scores(estimators, X, y, range_n=20, scoring='precision'):
    """Cross-validate each estimator `range_n` times and collect the mean scores.

    Parameters
    ----------
    estimators : iterable of estimator objects accepted by `cross_val_score`.
    X, y : features and target passed through to `cross_val_score`.
    range_n : number of cross-validation runs per estimator.
    scoring : scoring name or callable forwarded to `cross_val_score`
        (defaults to 'precision').

    Returns
    -------
    dict mapping '<ClassName><id(estimator)>' to the list of `range_n` mean
    cross-validation scores. The object id in the key keeps two instances of
    the same class apart. An estimator gets no entry when `range_n` is 0.
    """
    results = {}
    for estimator in estimators:
        key = estimator.__class__.__name__ + str(id(estimator))
        for _ in range(range_n):
            mean_score = cross_val_score(estimator, X, y, scoring=scoring).mean()
            results.setdefault(key, []).append(mean_score)
    return results
                
def print_results(results):
    """Print one line per estimator with the min, max and mean of its scores.

    `results` maps a label to a non-empty list of numeric scores; every
    statistic is rounded to three decimals.
    """
    line_template = '{}\t\t\tmin={}\tmax={}\tmean={}'
    for label, scores in results.items():
        lowest = round(min(scores), 3)
        highest = round(max(scores), 3)
        average = round(sum(scores) / len(scores), 3)
        print(line_template.format(label, lowest, highest, average))

In [None]:
results = get_estimators_scores(estimators, X, y)

In [None]:
print_results(results)

### Использование балансировки классов

#### Вспомогательные функции

In [None]:
# Функции для балансировки выборки

def over_sample(X, y, random_state=None):
    """Resample (X, y) with RandomOverSampler and return the resampled pair.

    `random_state` is forwarded to the sampler.
    """
    sampler = RandomOverSampler(random_state=random_state)
    X_balanced, y_balanced = sampler.fit_resample(X, y)
    return X_balanced, y_balanced

def under_sample(X, y, random_state=None):
    """Resample (X, y) with RandomUnderSampler and return the resampled pair.

    `random_state` is forwarded to the sampler.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    X_balanced, y_balanced = sampler.fit_resample(X, y)
    return X_balanced, y_balanced


#### Использование метода over-sampling

In [None]:
# Balance the classes by random over-sampling, then repeat the cross-validated scoring.
# NOTE(review): resampling happens before cross-validation, so copies of the same
# row can presumably end up in both the training and validation folds, which
# would inflate the scores — confirm, and consider resampling inside each fold.
X_, y_ = over_sample(X, y)
over_results = get_estimators_scores(estimators, X_, y_)

In [None]:
# Min / max / mean score per baseline estimator on the over-sampled data.
print_results(over_results)

#### Использование метода under-sampling

In [None]:
# Balance the classes by random under-sampling, then repeat the cross-validated scoring.
X_, y_ = under_sample(X, y)
under_results = get_estimators_scores(estimators, X_, y_)
# Min / max / mean score per baseline estimator on the under-sampled data.
print_results(under_results)

## Подбор гиперпараметров

#### Вспомогательные функции

In [None]:
def make_pipeline(estimator, sampler=None):
    """Wrap `estimator` in a pipeline whose final step is named 'estimator'.

    When a sampler is given it becomes a preceding 'sampler' step. That case
    uses imbalanced-learn's pipeline, because scikit-learn's `Pipeline` only
    accepts transformers as intermediate steps and rejects resamplers.
    Without a sampler a plain scikit-learn `Pipeline` is returned, as before.
    """
    if not sampler:
        return Pipeline([('estimator', estimator)])
    return ImbPipeline([('sampler', sampler), ('estimator', estimator)])

def make_estimator(estimator, params_grid, data, labels, scorer='accuracy', sampler=None,
                   n_iter=100, cv=5, random_state=42):
    """Run a randomized hyper-parameter search and return the fitted search object.

    Parameters
    ----------
    estimator : estimator placed in the 'estimator' step of the pipeline.
    params_grid : parameter distributions; keys must carry the pipeline step
        prefix (e.g. 'estimator__C').
    data, labels : training data passed to `fit`.
    scorer : scoring name or callable for the search.
    sampler : optional resampler forwarded to `make_pipeline`.
    n_iter, cv, random_state : forwarded to `RandomizedSearchCV`; the defaults
        reproduce the previously hard-coded values.

    Returns
    -------
    The fitted `RandomizedSearchCV` instance.
    """
    pipeline = make_pipeline(estimator, sampler)
    search = RandomizedSearchCV(
        pipeline,
        params_grid,
        scoring=scorer,
        cv=cv,
        random_state=random_state,
        n_iter=n_iter,
    )
    search.fit(data, labels)
    return search

#### Определение гиперпараметров

In [None]:
# Search configuration per estimator class name:
#   'class'    - estimator class to instantiate,
#   'sampling' - class-balancing strategy applied before the search ('over' / 'under' / other = none),
#   'params'   - candidate values, keyed with the 'estimator__' pipeline-step prefix.
estimators_params = {
    'SGDClassifier': {
        'class': SGDClassifier,
        'sampling': 'over',
        'params': {
            # 'log_loss' is the current name of the logistic loss; the old
            # alias 'log' was removed from recent scikit-learn releases.
            'estimator__loss': ['hinge', 'log_loss', 'modified_huber'],
            'estimator__penalty': ['l2', 'l1', 'elasticnet'],
            'estimator__shuffle': [True, False],
            'estimator__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
            'estimator__max_iter': [5, 10, 20, 50, 100],
        },
    },
    'SVC': {
        'class': SVC,
        'sampling': 'over',
        'params': {
            'estimator__C': [1, 1.5, 0.5, 0.2, 2, 2.5],
            'estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'estimator__degree': [2, 3, 4, 5, 6],
            'estimator__gamma': ['scale', 'auto'],
        },
    },
    'KNeighborsClassifier': {
        'class': KNeighborsClassifier,
        'sampling': 'over',
        'params': {
            'estimator__n_neighbors': [2, 5, 10, 15, 20],
            'estimator__weights': ['uniform', 'distance'],
            'estimator__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'estimator__p': [1, 2],
        },
    },
    'DecisionTreeClassifier': {
        'class': DecisionTreeClassifier,
        'sampling': 'over',
        'params': {
            'estimator__criterion': ['gini', 'entropy', 'log_loss'],
            'estimator__splitter': ['best', 'random'],
            # An integer min_samples_split must be at least 2; the former value 1
            # only produced failed fits during the search.
            'estimator__min_samples_split': [2, 5, 7, 10, 15],
            # 'auto' was an alias of 'sqrt' for classifiers and has been removed
            # from recent scikit-learn releases, so it is dropped here.
            'estimator__max_features': ['sqrt', 'log2'],
        },
    },
}

In [None]:
# Best score and parameters found for each baseline estimator, keyed by class name.
best_estimators = {}

for estimator in estimators:
    estimator_name = estimator.__class__.__name__
    config = estimators_params[estimator_name]

    # Balance the classes according to the strategy configured for this estimator.
    if config['sampling'] == 'over':
        X_, y_ = over_sample(X, y)
    elif config['sampling'] == 'under':
        X_, y_ = under_sample(X, y)
    else:
        X_, y_ = X, y

    # Search on the resampled data. The previous code computed X_, y_ but then
    # passed the original X, y, so the configured sampling never took effect.
    # NOTE(review): resampling before the search's internal cross-validation can
    # presumably make the CV scores optimistic for over-sampling — confirm.
    grid = make_estimator(estimator, config['params'], X_, y_, scorer=make_scorer(precision_score))
    best_estimators[estimator_name] = {'best_score': grid.best_score_, 'best_params': grid.best_params_}

In [None]:
# Best cross-validated score and parameter set found for each estimator.
best_estimators

### Оценка качества классификации с подобранными гиперпараметрами

In [None]:
def build_estimator(estimator_cl, best_params):
    """Instantiate `estimator_cl` from pipeline-style search parameters.

    Keys starting with the 'estimator__' step prefix have that leading prefix
    removed (only the first occurrence, so nested names such as
    'estimator__base_estimator__C' keep their inner part intact). Keys without
    any '__' are passed through unchanged. Keys that belong to another
    pipeline step (e.g. 'sampler__random_state') are skipped, because they are
    not constructor arguments of the estimator.
    """
    prefix = 'estimator__'
    params = {}
    for name, value in best_params.items():
        if name.startswith(prefix):
            params[name[len(prefix):]] = value
        elif '__' not in name:
            params[name] = value
    return estimator_cl(**params)

In [None]:
# Rebuild every estimator configured for over-sampling with its best parameters.
over_sampling = [
    build_estimator(estimators_params[name]['class'], found['best_params'])
    for name, found in best_estimators.items()
    if estimators_params[name]['sampling'] == 'over'
]

# Score the tuned estimators on over-sampled data (20 cross-validation runs each).
X_, y_ = over_sample(X, y)
tuned_scores = get_estimators_scores(over_sampling, X_, y_, 20)
print_results(tuned_scores)

# Применение ансамблей для улучшения качества работы моделей

In [None]:
# (name, estimator) pairs for stacking: each tuned estimator under its class name.
estimators_list = [
    (name, build_estimator(estimators_params[name]['class'], found['best_params']))
    for name, found in best_estimators.items()
]

# Stacking ensemble over the tuned estimators (default final estimator).
stacking_ens = StackingClassifier(estimators_list)

In [None]:
# Display the configured stacking ensemble.
stacking_ens

In [None]:
# Build one bagging ensemble of 4 models for each tuned estimator.
bagging_ens = [
    BaggingClassifier(
        build_estimator(estimators_params[name]['class'], found['best_params']),
        n_estimators=4,
    )
    for name, found in best_estimators.items()
]


In [None]:
# Gradient boosting with 4 estimators, kept in a list so it can be concatenated with the other ensembles.
boosting_ens = [GradientBoostingClassifier(n_estimators=4)]

In [None]:
# Score stacking, bagging and boosting ensembles on the original data (5 cross-validation runs each).
result = get_estimators_scores([stacking_ens] + bagging_ens + boosting_ens, X, y, 5)
print_results(result)

### Влияние количества моделей в ансамбле на точность классификации

In [None]:
# One result dict per ensemble size, in the same order as `nn`.
ens_results = []
nn = [5, 10, 20, 50, 100, 150, 200, 500]

for n_estimators in nn:
    # Bagging around every tuned estimator, plus gradient boosting, all of the same size.
    bagging_ens = [
        BaggingClassifier(
            build_estimator(estimators_params[name]['class'], found['best_params']),
            n_estimators=n_estimators,
        )
        for name, found in best_estimators.items()
    ]
    boosting_ens = [GradientBoostingClassifier(n_estimators=n_estimators)]

    result = get_estimators_scores(bagging_ens + boosting_ens, X, y, 5)
    print('n_estimators = {}'.format(n_estimators))
    print_results(result)
    ens_results.append(result)


In [None]:
# ensebles_scores[j] is the series of mean scores of the j-th ensemble, one
# value per ensemble size; the position j follows the insertion order of each
# result dict.
ensebles_scores = [[] for _ in range(5)]

for res in ens_results:
    for j, scores in enumerate(res.values()):
        ensebles_scores[j].append(sum(scores) / len(scores))


In [None]:
# Mean score of each ensemble as a function of the number of estimators.
figure(figsize=(10, 8), dpi=80)

x = nn
legend_labels = ['Bagging SGD', 'Bagging SVC', 'Bagging KNeighbors', 'Bagging DecisionTree', 'GradientBoosting']

# One line per ensemble, drawn in the same order as the legend entries.
for series in ensebles_scores[:5]:
    plt.plot(x, series)

plt.xlabel('n estimators', fontsize=15)
plt.ylabel('Precision', fontsize=15)
plt.legend(legend_labels, fontsize=12)
plt.show()

#### Балансировка классов для стекинга моделей

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified hold-out split with 400 test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=400, stratify=y, random_state=42
)

# Final model: bagging of 100 tuned k-nearest-neighbours classifiers.
final_est = 'KNeighborsClassifier'
estimator = build_estimator(
    estimators_params[final_est]['class'],
    best_estimators[final_est]['best_params'],
)
final_ens = BaggingClassifier(estimator, n_estimators=100)

# Precision on the held-out rows.
preds = final_ens.fit(X_train, y_train).predict(X_test)
score_test = precision_score(y_test.to_list(), preds)
print('test score', score_test)

# NOTE(review): cross_val_score is called without `scoring`, so it presumably
# reports the estimator's default score rather than precision — confirm.
print('cross_vale_score', cross_val_score(final_ens, X_train, y_train).mean())

In [None]:
# Split BEFORE balancing. Over-sampling first duplicates rows and then spreads
# the copies over both sides of the split, so the test set would contain rows
# the model has already been trained on. Splitting the original data also gives
# the same held-out rows as the unbalanced run (same parameters and seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400, stratify=y, random_state=42)

# The sampler lives inside the pipeline, so it is applied to training data only:
# to X_train here, and to the training folds inside cross_val_score.
balanced_ens = ImbPipeline([('sampler', RandomOverSampler()), ('estimator', final_ens)])

balanced_ens.fit(X_train, y_train)
preds = balanced_ens.predict(X_test)
# NOTE(review): this cell reports accuracy while the unbalanced run reports
# precision — confirm which metric is intended before comparing the numbers.
score_test = accuracy_score(y_test, preds)
print('test score', score_test)
print('cross_vale_score', cross_val_score(balanced_ens, X_train, y_train).mean())

In [None]:
# Split the original data first so this run is evaluated on the same held-out
# rows as the unbalanced and over-sampled runs (same parameters and seed);
# only the training data is balanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400, stratify=y, random_state=42)

# The sampler lives inside the pipeline, so it is applied to training data only:
# to X_train here, and to the training folds inside cross_val_score.
balanced_ens = ImbPipeline([('sampler', RandomUnderSampler()), ('estimator', final_ens)])

balanced_ens.fit(X_train, y_train)
preds = balanced_ens.predict(X_test)
# NOTE(review): this cell reports accuracy while the unbalanced run reports
# precision — confirm which metric is intended before comparing the numbers.
score_test = accuracy_score(y_test, preds)
print('test score', score_test)
print('cross_vale_score', cross_val_score(balanced_ens, X_train, y_train).mean())