# Импорт библиотек

In [2]:
import sys
import dill
import warnings
from functools import partial


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from feature_engine.selection import DropDuplicateFeatures, DropCorrelatedFeatures, DropFeatures, DropConstantFeatures
from feature_engine.transformation import BoxCoxTransformer, YeoJohnsonTransformer
from feature_engine.outliers import Winsorizer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.base import BaseEstimator
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

from datetime import datetime
from IPython.core.display import display, HTML
from sklearn.preprocessing import FunctionTransformer, StandardScaler

ModuleNotFoundError: No module named 'dill'

# Настройка ноутбука

In [1]:
display(HTML('<style>.container {width:95% !important;}</style>'))
warnings.filterwarnings('ignore')

NameError: name 'HTML' is not defined

# Загрузка данных

In [3]:
with open(f'data/ga_hits.pkl', 'rb') as file:
    ga_hits = dill.load(file)

In [4]:
with open(f'data/ga_sessions.pkl', 'rb') as file:
    ga_sessions = dill.load(file)

# Объединение таблиц

In [5]:
event = ['sub_car_claim_click', 'sub_car_claim_submit_click',
'sub_open_dialog_click', 'sub_custom_question_submit_click',
'sub_call_number_click', 'sub_callback_submit_click', 'sub_submit_success',
'sub_car_request_submit_click']

In [6]:
ga_hits['event_value'] = ga_hits.apply(lambda x: 1 if x.event_action in event else 0, axis=1)
ga_hits = ga_hits[['session_id', 'event_value']]
hits = ga_hits.groupby(by='session_id').sum()
hits['event_value'] = hits.apply(lambda x: 1 if x.event_value > 0 else 0, axis=1)
df_full = pd.merge(left=ga_sessions, right=hits, on='session_id', how='inner')

In [7]:
del ga_hits, ga_sessions

In [8]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1732266 entries, 0 to 1732265
Data columns (total 19 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   session_id                object
 1   client_id                 object
 2   visit_date                object
 3   visit_time                object
 4   visit_number              int64 
 5   utm_source                object
 6   utm_medium                object
 7   utm_campaign              object
 8   utm_adcontent             object
 9   utm_keyword               object
 10  device_category           object
 11  device_os                 object
 12  device_brand              object
 13  device_model              object
 14  device_screen_resolution  object
 15  device_browser            object
 16  geo_country               object
 17  geo_city                  object
 18  event_value               int64 
dtypes: int64(2), object(17)
memory usage: 264.3+ MB


# Пропуски

In [9]:
def missings(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()
    missing_values = [float('nan'), '(none)', '(not set)', '0x0']
    
    if 'device_screen_resolution' in data.columns:
        data['device_screen_resolution'] = data['device_screen_resolution'].replace(missing_values, data.device_screen_resolution.mode()[0])
        
    if 'geo_country' in data.columns:
        data['geo_country'] = data['geo_country'].replace(missing_values, data.geo_country.mode()[0])
    
    if 'device_os' in data.columns:
        data.device_os[(data.device_os.isna()) & (data['device_brand'] == 'Apple')] = 'iOS'
        data.device_os[(data.device_os.isna()) & (data['device_brand'] != '') & (data['device_brand'] != '(not set)')] = 'Android'
    
    return(data.fillna('(not set)'))

# Feature_engineering

In [10]:
def new_features(data: pd.DataFrame) -> pd.DataFrame:
    
    data = data.copy()
    
    distance_from_moscow = pd.read_csv('data/distance_from_moscow.csv', index_col='geo_city').to_dict()
    
    def distance(city: str) -> float:    
        return distance_from_moscow['distance_from_moscow'].get(str(city).lower(), -1)
    
    organic = ['organic', 'referral', '(none)']
    social = ['QxAxdyPLuQMEcrdZWdWb', 'MvfHsxITijuriZxsqZqt', 'ISrKoXQCxqqYvAZICvjs', 'IZEXUFLARCUMynmHNBGo', 'PlbkrSYoHuZBWfYjYnfw', 'gVRrcxiDQubJiljoTbGm']
    ussr = ['Azerbaijan', 'Armenia', 'Belarus', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Latvia', 'Lithuania', 'Moldova', 'Tajikistan', 'Turkmenistan', 'Estonia', 'Uzbekistan']
    moscow_region = ['Aprelevka', 'Balashikha', 'Chekhov', 'Chernogolovka', 'Dedovsk', 
    'Dmitrov', 'Dolgoprudny', 'Domodedovo', 'Dubna', 'Dzerzhinsky', 
    'Elektrogorsk', 'Elektrostal', 'Elektrougli', 'Fryazino', 'Golitsyno', 
    'Istra', 'Ivanteyevka', 'Kalininets', 'Kashira', 'Khimki', 'Khotkovo', 
    'Klimovsk', 'Klin', 'Kolomna', 'Korolyov', 'Kotelniki', 'Krasnoarmeysk', 
    'Krasnogorsk', 'Krasnoznamensk', 'Kubinka', 'Kurovskoye', 
    'Likino-Dulyovo', 'Lobnya', 'Losino-Petrovsky', 'Lukhovitsy', 
    'Lytkarino', 'Lyubertsy', 'Mozhaysk', 'Mytishchi', 'Naro-Fominsk', 
    'Noginsk', 'Odintsovo', 'Orekhovo-Zuyevo', 'Pavlovsky Posad', 'Podolsk', 
    'Protvino', 'Pushchino', 'Pushkino', 'Ramenskoye', 'Reutov', 'Ruza', 
    'Sergiyev Posad', 'Serpukhov', 'Shatura', 'Shchyolkovo', 
    'Solnechnogorsk', 'Staraya Kupavna', 'Stupino', 'Vidnoye', 
    'Volokolamsk', 'Voskresensk', 'Yakhroma', 'Yegoryevsk', 'Zvenigorod']
    millionaire_cities = ['Kazan', 'Moscow', 'Yekaterinburg', 'Saint Petersburg', 'Novosibirsk', 'Nizhny Novgorod', 'Chelyabinsk', 'Omsk', 'Samara', 'Ufa', 'Rostov-on-Don', 'Volgograd', 'Krasnodar', 'Perm']
    
    if 'session_id' in data.columns:
        data = data.set_index('session_id')
        
    if 'visit_date' in data.columns:
        data['visit_date'] = data['visit_date'].astype('datetime64[ns]')
        data['weekday'] = data['visit_date'].dt.weekday.astype('category')
        data['visit_day'] = data['visit_date'].dt.day.astype('category')
    
    if 'visit_time' in data.columns:
        data['visit_time'] = data['visit_time'].astype(str)
        data['visit_time'] = data['visit_time'].astype('datetime64[ns]')
        data['hour'] = data.apply(lambda x: x['visit_time'].hour, axis=1).astype('category')
        
    if 'utm_medium' in data.columns:
        data['is_organic'] = data['utm_medium'].isin(organic)
        
    if 'utm_source' in data.columns:
        data['is_social'] = data['utm_source'].isin(social)
        
    if 'device_screen_resolution' in data.columns:
        data[['screen_width', 'screen_height']] = data['device_screen_resolution'].str.split('x', expand=True).astype(float)
        data['screen_ratio'] = data['screen_width'] / data['screen_height']
        data['screen_area'] = data['screen_width'] * data['screen_height']
        
    if 'geo_country' in data.columns:
        data['is_ussr'] = data['geo_country'].isin(ussr)
        
    if 'geo_city' in data.columns:
        data['moscow_region'] = data['geo_city'].isin(moscow_region)
        data['millionaire_city'] = data['geo_city'].isin(millionaire_cities)
        data['distance_from_moscow'] = data['geo_city'].apply(lambda x: distance(x))
    
    return data

In [11]:
def convert_float(data: pd.DataFrame) -> pd.DataFrame:
    return data.astype(float)

# Pipeline

In [12]:
preprocessor = Pipeline(steps=[
    ('imputer', FunctionTransformer(missings)),
    ('engineer', FunctionTransformer(new_features)),
    ('dropper', DropFeatures(['client_id', 'visit_date', 'visit_time', 'device_screen_resolution'])),    
#    ('normalization', BoxCoxTransformer()), #*
    ('normalization', YeoJohnsonTransformer()),
    ('outliers', Winsorizer()),
    ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ('rare', RareLabelEncoder(tol = 0.005)),
    ('oneHotEncoder', OneHotEncoder(drop_last_binary=True)),
    ('bool_convert', FunctionTransformer(convert_float)),
    ('constantDropper', DropConstantFeatures(tol=0.98)), 
    ('duplicatedDropper', DropDuplicateFeatures()),
    ('correlatedDropper', DropCorrelatedFeatures())    
])

# Modeling

In [13]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(df_full.drop(columns=['event_value']), df_full.event_value, 
                                                                test_size=0.15, stratify=df_full.event_value, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.15, stratify=y_train_valid, random_state=42)

In [14]:
print(f'Размер тренировочной выборки: {X_train.shape}')
print(f'Размер валидационной выборки: {X_valid.shape}')
print(f'Размер тестовой выборки: {X_test.shape}')

Размер тренировочной выборки: (1251562, 18)
Размер валидационной выборки: (220864, 18)
Размер тестовой выборки: (259840, 18)


In [15]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_valid_preprocessed = preprocessor.transform(X_valid)

## Поиск лучшего порога

In [16]:
def find_threshold(y: pd.Series, y_proba: pd.Series, best_threshold: float = 0.5, metric_name: str='roc_auc') -> float:    
    
    def f1(threshold: float) -> float:
        predict = (y_proba > threshold).astype(int)
        return f1_score(y, predict)
    
    best_metric = f1(best_threshold)
    direction = 1
    shift = 0.2
    
    for i in range(300):
        threshold = best_threshold + direction*shift
        shift *= 0.9
        metric = f1(threshold)
        if metric > best_metric:
            best_metric = metric
            best_threshold = threshold
        else:
            direction *= -1
    return best_threshold

## Вывод метрик

In [17]:
def print_metrics(model: BaseEstimator, X: pd.DataFrame, y: pd.Series) -> None:
    probas = model.predict_proba(X)[:, 1]
    threshold = find_threshold(y, probas)
    predict = (probas>threshold).astype(int)
    
    print(f"{roc_auc_score(y, probas)} - roc_auc на вероятностях")
    print(f'best threshold: {threshold}')
#    print(f'roc-auc: {roc_auc_score(y, predict)}')
    print(f'accuracy: {accuracy_score(y, predict)}')
    print(f'recall: {recall_score(y, predict)}')
    print(f'precision: {precision_score(y, predict)}')
    print(f'f1: {f1_score(y, predict)}\n')
    print(confusion_matrix(y, predict))
        

# Modeling

## Dummy

In [18]:
dummy = DummyClassifier(strategy='constant', constant=0)
dummy.fit(X_train_preprocessed, y_train)

DummyClassifier(constant=0, strategy='constant')

In [19]:
print_metrics(dummy, X_valid_preprocessed, y_valid)

0.5 - roc_auc на вероятностях
best threshold: 0.5
accuracy: 0.9709549768183136
recall: 0.0
precision: 0.0
f1: 0.0

[[214449      0]
 [  6415      0]]


## logistic regression

In [20]:
def optimization_lr(
    rare_encoder_tol: float, 
    constant_dropper_threshold: float,
    correlated_dropper_threshold: float,
    max_iter: float,
    C: float
) -> float:
    
    
    model = Pipeline(steps=[
        ('imputer', FunctionTransformer(missings)),
        ('engineer', FunctionTransformer(new_features)),
        ('dropper', DropFeatures(['client_id', 'visit_date', 'visit_time', 'device_screen_resolution'])),    
#        ('normalization', BoxCoxTransformer()),
        ('normalization', YeoJohnsonTransformer()),
        ('outliers', Winsorizer()),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
        ('rare', RareLabelEncoder(tol = rare_encoder_tol)),
        ('oneHotEncoder', OneHotEncoder(drop_last_binary=True)),
        ('bool_convert', FunctionTransformer(convert_float)),
        ('constantDropper', DropConstantFeatures(tol=constant_dropper_threshold)),
        ('duplicatetDropper', DropDuplicateFeatures()),
        ('correlatedDropper', DropCorrelatedFeatures(threshold=correlated_dropper_threshold)),
        ('model', LogisticRegression(max_iter=int(max_iter), C=C))])
    model.fit(X_train, y_train)
    prediction = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, prediction)

In [21]:
optimizing_parameters_lr = {
    'C': (0.5, 3), 
    'max_iter': (50, 500),
    'rare_encoder_tol': (0.01, 0.1), 
    'constant_dropper_threshold': (0.95, 0.99),
    'correlated_dropper_threshold': (0.7, 0.99) 
    }

In [22]:
best_score_lr, best_parameters_lr = 0.0, dict()

optimizer_lr = BayesianOptimization(
    optimization_lr, 
    optimizing_parameters_lr, random_state=42)
optimizer_lr.maximize(init_points=3, n_iter=2)
    
best_score_lr = optimizer_lr.max['target']
best_parameters_lr.update(optimizer_lr.max['params'])

|   iter    |  target   |     C     | consta... | correl... | max_iter  | rare_e... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6674   [0m | [0m1.436    [0m | [0m0.988    [0m | [0m0.9123   [0m | [0m319.4    [0m | [0m0.02404  [0m |
| [0m2        [0m | [0m0.6589   [0m | [0m0.89     [0m | [0m0.9523   [0m | [0m0.9512   [0m | [0m320.5    [0m | [0m0.07373  [0m |
| [0m3        [0m | [0m0.6652   [0m | [0m0.5515   [0m | [0m0.9888   [0m | [0m0.9414   [0m | [0m145.6    [0m | [0m0.02636  [0m |
| [0m4        [0m | [0m0.6584   [0m | [0m0.5803   [0m | [0m0.9553   [0m | [0m0.899    [0m | [0m145.6    [0m | [0m0.08718  [0m |
| [0m5        [0m | [0m0.6552   [0m | [0m2.766    [0m | [0m0.9542   [0m | [0m0.7427   [0m | [0m466.4    [0m | [0m0.07936  [0m |


In [23]:
lr = LogisticRegression(max_iter=int(best_parameters_lr['max_iter']), C=best_parameters_lr['C'], solver='saga', penalty='l2')
lr.fit(X_train_preprocessed, y_train)

LogisticRegression(C=1.4363502971184063, max_iter=319, solver='saga')

In [24]:
print_metrics(lr, X_valid_preprocessed, y_valid)

0.6673027055176629 - roc_auc на вероятностях
best threshold: 0.5
accuracy: 0.9709549768183136
recall: 0.0
precision: 0.0
f1: 0.0

[[214449      0]
 [  6415      0]]


## Decision Tree

In [25]:
tree = DecisionTreeClassifier(random_state=42)

In [26]:
%%time
tree.fit(X_train_preprocessed, y_train)

CPU times: user 16.3 s, sys: 111 ms, total: 16.4 s
Wall time: 16.4 s


DecisionTreeClassifier(random_state=42)

In [27]:
print_metrics(tree, X_valid_preprocessed, y_valid)

0.5181655706696522 - roc_auc на вероятностях
best threshold: 0.7
accuracy: 0.945763003477253
recall: 0.057833203429462195
precision: 0.058832857595940376
f1: 0.058328747739957554

[[208514   5935]
 [  6044    371]]


## RandomForest

In [28]:
rf = RandomForestClassifier(random_state=42)

In [29]:
%%time
rf.fit(X_train_preprocessed, y_train)

CPU times: user 2min 45s, sys: 1.1 s, total: 2min 47s
Wall time: 2min 47s


RandomForestClassifier(random_state=42)

In [30]:
print_metrics(rf, X_valid_preprocessed, y_valid)

0.6290601100283227 - roc_auc на вероятностях
best threshold: 0.07991778605075303
accuracy: 0.8794733410605622
recall: 0.21730319563522993
precision: 0.06062714739268473
f1: 0.09480413492927096

[[192850  21599]
 [  5021   1394]]


## MLP

In [31]:
mlp = MLPClassifier(random_state=42)

In [32]:
%%time
mlp.fit(X_train_preprocessed, y_train)

CPU times: user 12min 2s, sys: 3min 26s, total: 15min 28s
Wall time: 2min 18s


MLPClassifier(random_state=42)

In [33]:
print_metrics(mlp, X_valid_preprocessed, y_valid)

0.6823950838471218 - roc_auc на вероятностях
best threshold: 0.07396250588826768
accuracy: 0.8972127644161113
recall: 0.2265003897116134
precision: 0.07570468399937477
f1: 0.11348016244923463

[[196709  17740]
 [  4962   1453]]


## Метод опорных векторов

In [34]:
svc = LinearSVC(class_weight='balanced')

In [35]:
%%time
svc.fit(X_train_preprocessed, y_train)

CPU times: user 6min 12s, sys: 1.05 s, total: 6min 13s
Wall time: 6min 13s


LinearSVC(class_weight='balanced')

In [36]:
roc_auc_score(y_valid, svc.predict(X_valid_preprocessed))

0.6190236351409708

## Градиентный бустинг

In [37]:
def optimization_hb(
    rare_encoder_tol: float, 
    constant_dropper_threshold: float,
    correlated_dropper_threshold: float,
    max_iter: float,
    min_samples_leaf: float,
    max_leaf_nodes: float,
    max_bins: float,
    learning_rate: float
) -> float:
    
    
    model = Pipeline(steps=[
        ('imputer', FunctionTransformer(missings)),
        ('engineer', FunctionTransformer(new_features)),
        ('dropper', DropFeatures(['client_id', 'visit_date', 'visit_time', 'device_screen_resolution'])),    
#        ('normalization', BoxCoxTransformer()),
        ('normalization', YeoJohnsonTransformer()),
        ('outliers', Winsorizer()),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
        ('rare', RareLabelEncoder(tol = rare_encoder_tol)),
        ('oneHotEncoder', OneHotEncoder(drop_last_binary=True)),
        ('bool_convert', FunctionTransformer(convert_float)),
        ('constantDropper', DropConstantFeatures(tol=constant_dropper_threshold)),
        ('duplicatetDropper', DropDuplicateFeatures()),
        ('correlatedDropper', DropCorrelatedFeatures(threshold=correlated_dropper_threshold)),
        ('model', HistGradientBoostingClassifier(max_iter=int(max_iter), max_bins=int(max_bins),learning_rate=learning_rate, min_samples_leaf=int(min_samples_leaf), max_leaf_nodes=int(max_leaf_nodes)))])
    model.fit(X_train, y_train)
    prediction = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, prediction)

In [38]:
optimizing_parameters_hb = {
    'learning_rate': (0.01, 0.2), 
    'max_iter': (50, 500),
    'rare_encoder_tol': (0.01, 0.1), 
    'constant_dropper_threshold': (0.95, 0.99),
    'correlated_dropper_threshold': (0.7, 0.99), 
    'max_leaf_nodes': (15, 60),
    'min_samples_leaf': (12, 35),
    'max_bins': (100, 255)
    }

In [39]:
best_score_hb, best_parameters_hb = 0.0, dict()

optimizer_hb = BayesianOptimization(
    optimization_hb, 
    optimizing_parameters_hb, random_state=42)
optimizer_hb.maximize(init_points=3, n_iter=2)
    
best_score_hb = optimizer_hb.max['target']
best_parameters_hb.update(optimizer_hb.max['params'])

|   iter    |  target   | consta... | correl... | learni... | max_bins  | max_iter  | max_le... | min_sa... | rare_e... |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6869   [0m | [0m0.965    [0m | [0m0.9757   [0m | [0m0.1491   [0m | [0m192.8    [0m | [0m120.2    [0m | [0m22.02    [0m | [0m13.34    [0m | [0m0.08796  [0m |
| [95m2        [0m | [95m0.6895   [0m | [95m0.974    [0m | [95m0.9053   [0m | [95m0.01391  [0m | [95m250.3    [0m | [95m424.6    [0m | [95m24.56    [0m | [95m16.18    [0m | [95m0.02651  [0m |
| [95m3        [0m | [95m0.6911   [0m | [95m0.9622   [0m | [95m0.8522   [0m | [95m0.09207  [0m | [95m145.1    [0m | [95m325.3    [0m | [95m21.28    [0m | [95m18.72    [0m | [95m0.04297  [0m |
| [95m4        [0m | [95m0.6914   [0m | [95m0.9658   [0m | [95m0.8233   [0m | [95m0.07795  [0m | [95m144.9    [0m | [

In [40]:
hb = HistGradientBoostingClassifier(max_iter=int(best_parameters_hb['max_iter']), 
                                    max_bins=int(best_parameters_hb['max_bins']), 
                                    learning_rate=best_parameters_hb['learning_rate'],
                                    max_leaf_nodes=int(best_parameters_hb['max_leaf_nodes']), 
                                    min_samples_leaf=int(best_parameters_hb['min_samples_leaf']),
                                    random_state=42)
hb.fit(X_train_preprocessed, y_train)

HistGradientBoostingClassifier(learning_rate=0.0779484687028613, max_bins=144,
                               max_iter=320, max_leaf_nodes=23,
                               min_samples_leaf=29, random_state=42)

In [41]:
print_metrics(hb, X_valid_preprocessed, y_valid)

0.6918319067059522 - roc_auc на вероятностях
best threshold: 0.05973362045702502
accuracy: 0.915703781512605
recall: 0.20280592361652378
precision: 0.0878757176629517
f1: 0.12262016965127237

[[200945  13504]
 [  5114   1301]]


## LGBMClassifier

In [42]:
def optimization_lgbm(
    rare_encoder_tol: float, 
    constant_dropper_threshold: float,
    correlated_dropper_threshold: float, 
    n_estimators: float, 
    num_leaves: float, 
    learning_rate: float, 
    reg_lambda: float, 
    reg_alpha: float, 
    min_child_samples: float,
    boosting_type: str = 'goss'
) -> float:
    
    
    model = Pipeline(steps=[
        ('imputer', FunctionTransformer(missings)),
        ('engineer', FunctionTransformer(new_features)),
        ('dropper', DropFeatures(['client_id', 'visit_date', 'visit_time', 'device_screen_resolution'])),    
#        ('normalization', BoxCoxTransformer()),
        ('normalization', YeoJohnsonTransformer()),
        ('outliers', Winsorizer()),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
        ('rare', RareLabelEncoder(tol = rare_encoder_tol)),
        ('oneHotEncoder', OneHotEncoder(drop_last_binary=True)),
        ('bool_convert', FunctionTransformer(convert_float)),
        ('constantDropper', DropConstantFeatures(tol=constant_dropper_threshold)),
        ('duplicatetDropper', DropDuplicateFeatures()),
        ('correlatedDropper', DropCorrelatedFeatures(threshold=correlated_dropper_threshold)),
        ('model', LGBMClassifier(n_estimators=int(n_estimators), num_leaves=int(num_leaves), learning_rate=learning_rate, reg_lambda=reg_lambda,
                    reg_alpha=reg_alpha, min_child_samples=int(min_child_samples), boosting_type=boosting_type, random_state=42))])
    model.fit(X_train, y_train)
    prediction = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, prediction)
    

In [43]:
optimizing_parameters_lgbm = {
    'rare_encoder_tol': (0.01, 0.1), 
    'constant_dropper_threshold': (0.95, 0.99),
    'correlated_dropper_threshold': (0.7, 0.99), 
    'n_estimators': (500, 5000), 
    'num_leaves': (10, 120), 
    'learning_rate': (0.01, 0.25), 
    'reg_lambda': (0, 50),  
    'reg_alpha': (0, 50),
    'min_child_samples': (10, 45)}

In [46]:
best_score_lgbm, best_parameters_lgbm = 0.0, dict()
for boosting_type in ('gbdt', 'goss'):

    print(f'boosting_type = {boosting_type}')
    optimizer_lgbm = BayesianOptimization(
        partial(optimization_lgbm, boosting_type=boosting_type), 
        optimizing_parameters_lgbm, random_state=42)
    optimizer_lgbm.maximize(init_points=5, n_iter=10)
    
    if best_score_lgbm < optimizer_lgbm.max['target']:
        best_score_lgbm = optimizer_lgbm.max['target']
        best_parameters_lgbm.update(optimizer_lgbm.max['params'])
        best_parameters_lgbm['boosting_type'] = boosting_type

boosting_type = gbdt
|   iter    |  target   | consta... | correl... | learni... | min_ch... | n_esti... | num_le... | rare_e... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6888   [0m | [0m0.965    [0m | [0m0.9757   [0m | [0m0.1857   [0m | [0m30.95    [0m | [0m1.202e+03[0m | [0m27.16    [0m | [0m0.01523  [0m | [0m43.31    [0m | [0m30.06    [0m |
| [95m2        [0m | [95m0.6965   [0m | [95m0.9783   [0m | [95m0.706    [0m | [95m0.2428   [0m | [95m39.14    [0m | [95m1.456e+03[0m | [95m30.0     [0m | [95m0.02651  [0m | [95m15.21    [0m | [95m26.24    [0m |
| [0m3        [0m | [0m0.6893   [0m | [0m0.9673   [0m | [0m0.7845   [0m | [0m0.1568   [0m | [0m14.88    [0m | [0m1.815e+03[0m | [0m50.3     [0m | [0m0.05105  [0m | [0m39.26    [0m | [0m9.984    [0m |
| [0m4        [0m | [0m0.6826  

In [47]:
best_parameters_lgbm

{'constant_dropper_threshold': 0.9763040173185428,
 'correlated_dropper_threshold': 0.9419706591416945,
 'learning_rate': 0.06441747665717384,
 'min_child_samples': 44.441219949190845,
 'n_estimators': 1464.260147113776,
 'num_leaves': 28.66089028191194,
 'rare_encoder_tol': 0.024244275099912872,
 'reg_alpha': 26.375317773499894,
 'reg_lambda': 29.600022434984556,
 'boosting_type': 'goss'}

In [48]:
lgbm = LGBMClassifier(n_estimators=int(best_parameters_lgbm['n_estimators']), num_leaves=int(best_parameters_lgbm['num_leaves']),
                             learning_rate=best_parameters_lgbm['learning_rate'], reg_lambda=best_parameters_lgbm['reg_lambda'], 
                             reg_alpha=best_parameters_lgbm['reg_alpha'], min_child_samples=int(best_parameters_lgbm['min_child_samples']), 
                             boosting_type=best_parameters_lgbm['boosting_type'], random_state=42)

In [50]:
lgbm.fit(X_train_preprocessed, y_train)

LGBMClassifier(boosting_type='goss', learning_rate=0.06441747665717384,
               min_child_samples=44, n_estimators=1464, num_leaves=28,
               random_state=42, reg_alpha=26.375317773499894,
               reg_lambda=29.600022434984556)

In [51]:
print_metrics(lgbm, X_valid_preprocessed, y_valid)

0.6973434373950153 - roc_auc на вероятностях
best threshold: 0.05990332137564615
accuracy: 0.9016725224572588
recall: 0.23741231488698364
precision: 0.08300632221495531
f1: 0.12300609780721236

[[197624  16825]
 [  4892   1523]]


# Final model

In [54]:
final_pipe = Pipeline(steps=[
    ('imputer', FunctionTransformer(missings)),
    ('engineer', FunctionTransformer(new_features)),
    ('dropper', DropFeatures(['client_id', 'visit_date', 'visit_time', 'device_screen_resolution'])),    
#    ('normalization', BoxCoxTransformer()),
    ('normalization', YeoJohnsonTransformer()),
    ('outliers', Winsorizer()),
    ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ('rare', RareLabelEncoder(tol = best_parameters_lgbm['rare_encoder_tol'])),
    ('oneHotEncoder', OneHotEncoder(drop_last_binary=True)),
    ('bool_convert', FunctionTransformer(convert_float)),
    ('constantDropper', DropConstantFeatures(tol=best_parameters_lgbm['constant_dropper_threshold'])),
    ('duplicatedDropper', DropDuplicateFeatures()),
    ('correlatedDropper', DropCorrelatedFeatures(threshold= best_parameters_lgbm['correlated_dropper_threshold'])),
    ('model', LGBMClassifier(n_estimators=int(best_parameters_lgbm['n_estimators']), num_leaves=int(best_parameters_lgbm['num_leaves']),
                             learning_rate=best_parameters_lgbm['learning_rate'], reg_lambda=best_parameters_lgbm['reg_lambda'], 
                             reg_alpha=best_parameters_lgbm['reg_alpha'], min_child_samples=int(best_parameters_lgbm['min_child_samples']), 
                             boosting_type=best_parameters_lgbm['boosting_type'], random_state=42))
])

# Metrics

In [55]:
final_pipe.fit(X_train_valid, y_train_valid)

Pipeline(steps=[('imputer',
                 FunctionTransformer(func=<function missings at 0x7f9dc7ec9280>)),
                ('engineer',
                 FunctionTransformer(func=<function new_features at 0x7f9dc7ec9940>)),
                ('dropper',
                 DropFeatures(features_to_drop=['client_id', 'visit_date',
                                                'visit_time',
                                                'device_screen_resolution'])),
                ('normalization', YeoJohnsonTransformer()),
                ('outliers', Winsorizer()),
                ('sca...
                 DropConstantFeatures(tol=0.9763040173185428)),
                ('duplicatedDropper', DropDuplicateFeatures()),
                ('correlatedDropper',
                 DropCorrelatedFeatures(threshold=0.9419706591416945)),
                ('model',
                 LGBMClassifier(boosting_type='goss',
                                learning_rate=0.06441747665717384,
               

In [56]:
test_proba = final_pipe.predict_proba(X_test)[:, 1]
final_threshold = find_threshold(y_test, test_proba)
test_prediction = (test_proba > final_threshold).astype(int)

In [57]:
print_metrics(final_pipe, X_test, y_test)

0.7040793594168728 - roc_auc на вероятностях
best threshold: 0.06392535588911787
accuracy: 0.9143203509852217
recall: 0.21915993109844972
precision: 0.0917665335108744
f1: 0.12936529662508311

[[235923  16370]
 [  5893   1654]]


# Обучение на всех данных

In [58]:
final_model = final_pipe.named_steps['model']
final_preprocessor = final_pipe.set_params(model=None)

In [59]:
session_prepared = final_preprocessor.fit_transform(df_full.drop(columns=['event_value']))

In [60]:
final_model.fit(session_prepared, df_full.event_value);

In [61]:
session_prepared.duplicated().sum()

260360

In [62]:
columns_from_droppers = []
for step in ('constantDropper', 'duplicatedDropper', 'correlatedDropper'):
    print(step + ':')
    for column in final_preprocessor.named_steps[step].features_to_drop_:
        print(f'\t{column}')
        columns_from_droppers.append(column)

constantDropper:
	is_ussr
	device_category_tablet
	device_os_Rare
	device_model_(not set)
duplicatedDropper:
	utm_medium_(none)
correlatedDropper:
	utm_medium_banner
	device_category_desktop
	utm_campaign_FTjNLDyTrXaWYgZymFkV
	device_brand_
	device_brand_Apple


In [63]:
feature_importance = pd.Series(
    data=final_model.feature_importances_, 
    index=session_prepared.columns).sort_values(ascending=False)

In [64]:
feature_importance.head(10)

distance_from_moscow     5831
screen_ratio             4114
visit_number             3700
screen_height            3396
screen_area              2682
screen_width             1857
utm_campaign_Rare         320
hour_Rare                 281
utm_keyword_(not set)     278
weekday_1                 256
dtype: int32

In [65]:
final_pipe = final_pipe.set_params(model=final_model)

In [66]:
final_pipe.metadata = {
    'name': 'СберАвтоподписка',
    'description': 'Модель для определения совершения целевого действия',
    'model': final_model.__class__.__name__,
    'version': 1.0,
    'model_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'author': 'Sergey Tuktarov',
    'metrics': {
        'threshold': final_threshold,
        'roc_auc': roc_auc_score(y_test, test_prediction),
        'accuracy': accuracy_score(y_test, test_prediction),
        'precision': precision_score(y_test, test_prediction), 
        'recall': recall_score(y_test, test_prediction), 
        'f1': f1_score(y_test, test_prediction),
               }
}

In [None]:
filename = f'model_{datetime.now():%Y%m%d%H%M%S}.pkl'
with open(f'models/{filename}', 'wb') as file:
    dill.dump(final_pipe, file)