# Imports

In [1]:
import os

import pandas as pd
import numpy as np
import scipy as stats
import joblib
import time
import bisect
import gc
import sklearn.metrics as mtrcs
# import optuna

from random import randint
from typing import Text
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.preprocessing import label_binarize

from sklearn.metrics import f1_score, accuracy_score, \
roc_auc_score, classification_report, r2_score, precision_score, recall_score, \
log_loss

import warnings
warnings.filterwarnings("ignore");

RAND = 10
N_FOLDS = 5

In [2]:
from get_metrics import get_metrics_classification, roc_auc_score, f1_score
from sklearn.metrics import f1_score, accuracy_score, \
roc_auc_score, classification_report, r2_score, precision_score, recall_score, \
log_loss

In [3]:
def get_metrics_multiclass(y_test_bin, y_test, y_pred, y_prob, name,
                           type_multi):
    """
    y_test_bin - бинаризованные тестовые метки класса
    y_test - метки класса без бинаризации
    y_prob - предсказанные вероятности классов
    name - название модели/подхода
    type_multi - тип многоклассовой классификации для ROC-AUC (ovo/ovr)
    """
    
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    
#     df_metrics['ROC_AUC'] = roc_auc_score(y_test_bin,
#                                           y_prob,
#                                           multi_class=type_multi)
#     df_metrics['Precision_micro'] = precision_score(y_test,
#                                                     y_pred,
#                                                     average='micro')
#     df_metrics['Precision_macro'] = precision_score(y_test,
#                                                     y_pred,
#                                                     average='macro')

#     df_metrics['Recall_micro'] = recall_score(y_test, y_pred, average='micro')
#     df_metrics['Recall_macro'] = recall_score(y_test, y_pred, average='macro')
    df_metrics['F1_weighted'] = f1_score(y_test, y_pred, average="weighted") 
#     df_metrics['F1_weighted_train'] = f1_score(y_test, y_pred_train, average="weighted") 
#     df_metrics['F1_micro'] = f1_score(y_test, y_pred, average='micro')
#     df_metrics['F1_macro'] = f1_score(y_test, y_pred, average='macro')
    
#     df_metrics['Logloss'] = log_loss(y_test, y_prob)

    return df_metrics

In [4]:
def get_f1_score_overfitting(model, X_train, y_train, X_test, y_test, name):
    """
    
    """
    df_metrics = pd.DataFrame()
    
    df_metrics['model'] = [name]
    
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    value_train = f1_score(y_train, y_pred_train, average="weighted")
    value_test = f1_score(y_test, y_pred_test, average="weighted")

    
    df_metrics["F1_weighted_train"] = float('{:.4f}'.format(value_train))
    df_metrics["F1_weighted_test"] = float('{:.4f}'.format(value_test))
    df_metrics["delta"] = f'{(abs(value_train - value_test)/value_test*100):.1f} %'

    return df_metrics

## Расположение папок с данными

In [5]:
LOCAL_DATA_PATH = "../data"
DATA_FILE = 'competition_data_final_pqt'

# целевые переменные
TARGET_FILE_AGE = 'targets_age_prep.parquet'
TARGET_FILE_MALE = 'targets_is_male_prep.parquet'

# id, по которым нужно предсказать пол и возраст
SUBMIT_FILE = 'submit_2.pqt'

# папка, куда будут сохраняться предобработанные данные
PREP_DATA = 'preprocessed_data'

LOCAL_DATA_PATH_mts = "../data/ml_cup_data"

In [6]:
def get_dataset(dataset_path: Text) -> pd.DataFrame:
    """
    Получение данных по заданному пути
    :param dataset_path: путь до данных
    :return: датасет
    """
    return pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/{dataset_path}")

In [7]:
# Split in train/test
def split_train_test(dataset_path: Text, target: Text): #dataset: pd.DataFrame,
    """
    Разделение данных на train/test
    :param dataset: датасет
    :return: train/test датасеты
    """

    # get data
    dataset = get_dataset(dataset_path)
    cat_features = dataset.select_dtypes('category').columns.tolist()
    
    X = dataset.drop(['user_id', target], axis=1)
    y = dataset[target]

    # Тестовые
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size = 0.3,
        stratify=y,
        random_state = RAND
    )

    # Валидационные
    X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.15,
                                                        random_state=RAND)

    eval_set = [(X_val, y_val)]

    print(f"split_train_test {target} done")
    
    return X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_

# 5. ML parts

## 5.1.1. Получим оценку по возрасту

### CatBoostClassifier baseline

In [8]:
def study_age_1(study_name: Text):
    """
    """

    # get params
#     with open(config_path) as file:
#         config = yaml.load(file, Loader=yaml.FullLoader)
#     preprocessing_config = config["preprocessing"]
#     train_config = config["train"]
    
#     target=preprocessing_config["target_column"]

    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_age.parquet",
        "age_target")
    
    cat_params = {
        'n_estimators' : 1500,
        'random_state' : RAND,
        'early_stopping_rounds' : 100,
        'custom_loss' : ['TotalF1'],
        'cat_features' : cat_features
    }

    model = CatBoostClassifier(**cat_params,
                             task_type="GPU")

    model.fit(X_train,
            y_train,
            eval_set=eval_set,
            verbose=False)
    
    print(f"study_age_1 {study_name} fit")
    
#     return model

    # датасеты с метриками моделей
    y = get_dataset("prep_data_with_targets_age.parquet")["age_target"]
    y_test_bin = label_binarize(y_test, classes=list(set(y)))

    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)
    

    # Метрики
    df_metrics = get_metrics_multiclass(y_test_bin=y_test_bin,
                                         y_test=y_test,
                                         y_pred=y_pred,
                                         y_prob=y_score,
                                         name=study_name,
                                         type_multi='ovo')
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_metrics.parquet")
    
    # Переобучение
    df_overfitting = get_f1_score_overfitting(model, 
                                              X_train, 
                                              y_train, 
                                              X_test, 
                                              y_test, 
                                              study_name)
    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_overfitting.parquet")    
    
    print(f"study_age_1 {study_name} done")
    
#     return df_metrics, df_overfitting

In [9]:
# df_metrics, df_overfitting = study_age_1('Cat_base')
study_age_1('Cat_age_base')

split_train_test age_target done
study_age_1 Cat_age_base fit
study_age_1 Cat_age_base done


In [10]:
# df_metrics
# pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_metrics.parquet")
get_dataset('df_age_metrics.parquet')

Unnamed: 0,model,F1_weighted
0,Cat_age_base,0.424165


In [11]:
# df_overfitting
# pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_overfitting.parquet")
get_dataset('df_age_overfitting.parquet')

Unnamed: 0,model,F1_weighted_train,F1_weighted_test,delta
0,Cat_age_base,0.491,0.4242,15.8 %


In [10]:
# df_metrics
# pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_metrics.parquet")
get_dataset('df_age_metrics.parquet')

Unnamed: 0,model,F1_weighted
0,Cat_age_base,0.451353


In [11]:
# df_overfitting
# pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_overfitting.parquet")
get_dataset('df_age_overfitting.parquet')

Unnamed: 0,model,F1_weighted_train,F1_weighted_test,delta
0,Cat_age_base,0.5141,0.4514,13.9 %


тестовые подборы параметров:
<!-- # 250
# learning_rate : 0.32709699869155884
# age f1_weighted score: 0.4428 
# 8%

# 500
# age f1_weighted score: 0.4463
# f1_score_weighted train: 0.494
# f1_score_weighted test: 0.446
# delta = 10.7 %  

# 1000
# learning_rate : 0.1784750074148178
# age f1_weighted score: 0.4502

# 1500
# learning_rate : 0.11958499997854231
# age f1_weighted score: 0.4514
# f1_score_weighted train: 0.514
# f1_score_weighted test: 0.451
# delta = 13.9 %

# 2500
# f1_score_weighted train: 0.526
# f1_score_weighted test: 0.454
# delta = 15.9 %

# 5000
# learning_rate : 0.08833400160074234
# age f1_weighted score: 0.457
# f1_score_weighted train: 0.543
# f1_score_weighted test: 0.457
# delta = 18.8 % -->

### Подбор параметров с помощью randomized grid search

In [18]:
def func_rgs_age( X_train, X_test, y_train, y_test, eval_set, cat_features):
    # если требуется подбор параметров
    grid = {
        "n_estimators": [1500], 
        "learning_rate": [0.11958499997854231],
        "boosting_type" : ['Ordered', 'Plain'], #
        "bootstrap_type" : ["Bayesian", "Bernoulli", "MVS"], 
        "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
        "custom_metric" : ['F1'],
        "max_depth" : list(range(6, 10)),
        "l2_leaf_reg" : [*np.arange(1, 10)], #
        "random_state": [RAND]
    }
    
    model = CatBoostClassifier(silent=True,
                               cat_features=cat_features,
                               task_type="GPU",
                               early_stopping_rounds=100
                               )
    
    grid_search_result = model.randomized_search(grid,
                                                 X=X_train,
                                                 y=y_train,
                                                 cv=5, # 
                                                 n_iter=50, #
                                                 refit=True, # 
                                                 shuffle=True, #
                                                 stratified=True, #
                                                 calc_cv_statistics=True, ##
                                                 search_by_train_test_split=True, ##
                                                 verbose=False,
                                                 plot=False) # True
    
#     joblib.dump(grid_search_result['params'], f'{LOCAL_DATA_PATH}/{PREP_DATA}/grid_search_result_age.pkl')
    print("rgs age done")
    
    return grid_search_result['params']

In [19]:
def study_age_2(study_name: Text, rgs ='N'):
    """
    """

    # get params
#     with open(config_path) as file:
#         config = yaml.load(file, Loader=yaml.FullLoader)
#     preprocessing_config = config["preprocessing"]
#     train_config = config["train"]
    
#     target=preprocessing_config["target_column"]


    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_age.parquet",
        "age_target")
    
    # Подбор параметров
    if rgs == 'Y':
        # Если требуется подбор параметров
        grid_search_result = func_rgs_age(X_train, X_test, y_train, y_test, eval_set, cat_features)
    elif rgs == 'N':
        # пользуемся уже подобранными параметрами rgs
        grid_search_result = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/grid_search_result_age.pkl')    
    
    # тюнинг модели подобранными параметрами
    cat_rgs = CatBoostClassifier(**grid_search_result,
                              task_type="GPU",
                              loss_function='MultiClass')
    
    cat_rgs.fit(X_train_,
                 y_train_,
                 cat_features=cat_features,
                 eval_set=eval_set,
                 verbose=False,
                 early_stopping_rounds=100)
    
    # saving model
    joblib.dump(cat_rgs, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_age.pkl')    
    print(f"study_age_2 {study_name} done")
    
    # датасеты с метриками моделей
    y = get_dataset("prep_data_with_targets_age.parquet")["age_target"]
    y_test_bin = label_binarize(y_test, classes=list(set(y)))
    
    y_pred = cat_rgs.predict(X_test)
    y_score = cat_rgs.predict_proba(X_test)
    
    # Добавление метрик
    df_metrics = get_dataset('df_age_metrics.parquet')
    df_metrics = pd.concat([df_metrics,
                             get_metrics_multiclass(y_test_bin=y_test_bin,
                                                    y_test=y_test,
                                                    y_pred=y_pred,
                                                    y_prob=y_score,
                                                    name=study_name,
                                                    type_multi='ovo')
                            ])
    
#     df_metrics = df_metrics.append(
#         get_metrics_multiclass(y_test_bin=y_test_bin,
#                                y_test=y_test,
#                                y_pred=y_pred,
#                                y_prob=y_score,
#                                name=study_name,
#                                type_multi='ovo'))
    
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_metrics.parquet")
    
    df_overfitting = get_dataset('df_age_overfitting.parquet')
    df_overfitting = pd.concat([df_overfitting, get_f1_score_overfitting(cat_rgs, 
                                                                         X_train, 
                                                                         y_train, 
                                                                         X_test, 
                                                                         y_test, 
                                                                         study_name)
                               ])
                                
#     df_overfitting = df_overfitting.append(get_f1_score_overfitting(cat_rgs, 
#                                                                     X_train, 
#                                                                     y_train, 
#                                                                     X_test, 
#                                                                     y_test, 
#                                                                     study_name))
    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_overfitting.parquet")
    
#     return df_metrics, df_overfitting, grid_search_result

In [20]:
# df_metrics, df_overfitting, grid_search_result = study_age_2('Cat_rgs')
study_age_2('Cat_age_rgs', 'Y')

split_train_test age_target done
bestTest = 1.301975038
bestIteration = 1462
bestTest = 1.294323449
bestIteration = 1290
bestTest = 1.301137833
bestIteration = 1455
bestTest = 1.29541919
bestIteration = 1496
bestTest = 1.303697625
bestIteration = 1490
bestTest = 1.305211228
bestIteration = 1497
bestTest = 1.301601761
bestIteration = 1450
bestTest = 1.292792413
bestIteration = 1495
bestTest = 1.299842829
bestIteration = 1495
bestTest = 1.308838023
bestIteration = 824
bestTest = 1.293286769
bestIteration = 1447
bestTest = 1.30557563
bestIteration = 592
bestTest = 1.295195942
bestIteration = 1187
bestTest = 1.2929211
bestIteration = 1492
bestTest = 1.303169459
bestIteration = 1231
bestTest = 1.294264283
bestIteration = 1470
bestTest = 1.304725007
bestIteration = 956
Training on fold [0/5]
bestTest = 1.295514384
bestIteration = 1499
Training on fold [1/5]
bestTest = 1.297975178
bestIteration = 1329
Training on fold [2/5]
bestTest = 1.292417657
bestIteration = 1492
Training on fold [3/5]
be

In [15]:
# df_metrics = df_metrics[:1]
get_dataset('df_age_metrics.parquet').set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,F1_weighted
model,Unnamed: 1_level_1
Cat_age_base,0.451353
Cat_age_rgs,0.45403


In [16]:
# df_overfitting = df_overfitting[:1]
get_dataset('df_age_overfitting.parquet').set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,F1_weighted_train,F1_weighted_test,delta
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cat_age_base,0.5141,0.4514,13.9 %
Cat_age_rgs,0.5637,0.454,24.1 %


### CatBoostClassifier grid_search_result

Получаем модель по предсказанию возрастных классов на основе подобранных параметров

после подбора параметров метрика немного выросла, сделаем предсказание нужных сабмитов

## 5.1.2. Обучение на всей подготовленной выборке по возрасту.

In [16]:
def study_age_3(study_name: Text):
    """
    """

    # get params
#     with open(config_path) as file:
#         config = yaml.load(file, Loader=yaml.FullLoader)
#     preprocessing_config = config["preprocessing"]
#     train_config = config["train"]
    
#     target=preprocessing_config["target_column"]

    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_age.parquet",
        "age_target")

    # get data
    dataset = get_dataset("prep_data_with_targets_age.parquet")
    cat_features = dataset.select_dtypes('category').columns.tolist()
    X = dataset.drop(['user_id', 'age_target'], axis=1)
    y = dataset['age_target']
    del dataset

    cat_rgs_all = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_age.pkl')

    # обучение модели на всём датасете
    cat_rgs_all.fit(X,
                    y,
                    cat_features,
                    verbose = False)

    print(f"study_age_3 {study_name} fit done")   
    
    #save
    joblib.dump(cat_rgs_all, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_age.pkl')
    
    # предсказание возраста по сабмитам
    df_submit = get_dataset('df_submit.parquet')
    
    fin_submit = df_submit[['user_id']]
    fin_submit['age'] = cat_rgs_all.predict(df_submit.drop(['user_id'], axis = 1))

    fin_submit.to_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit.csv', index = False)
    
    print(f"study_age_3 {study_name} pred submit done") 

    
    # датасеты с метриками моделей
    y = get_dataset("prep_data_with_targets_age.parquet")["age_target"]
    y_test_bin = label_binarize(y_test, classes=list(set(y)))

    y_pred = cat_rgs_all.predict(X_test)
    y_score = cat_rgs_all.predict_proba(X_test)
    
    # Добавление
    df_metrics = get_dataset('df_age_metrics.parquet')
    df_metrics = pd.concat([df_metrics,
                             get_metrics_multiclass(y_test_bin=y_test_bin,
                                                    y_test=y_test,
                                                    y_pred=y_pred,
                                                    y_prob=y_score,
                                                    name=study_name,
                                                    type_multi='ovo')
                            ])  
    
#     df_metrics = df_metrics.append(
#         get_metrics_multiclass(y_test_bin=y_test_bin,
#                                y_test=y_test,
#                                y_pred=y_pred,
#                                y_prob=y_score,
#                                name=study_name,
#                                type_multi='ovo'))
    
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_metrics.parquet")
    
    df_overfitting = get_dataset('df_age_overfitting.parquet')
    
    df_overfitting = pd.concat([df_overfitting, get_f1_score_overfitting(cat_rgs_all, 
                                                                         X_train, 
                                                                         y_train, 
                                                                         X_test, 
                                                                         y_test, 
                                                                         study_name)
                               ])
    
#     df_overfitting = df_overfitting.append(get_f1_score_overfitting(cat_rgs_all, 
#                                                                     X_train, 
#                                                                     y_train, 
#                                                                     X_test, 
#                                                                     y_test, 
#                                                                     study_name))
    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_overfitting.parquet")
    
#     return fin_submit

In [17]:
# fin_submit = study_age_3('Cat_rgs_all_data')
study_age_3('Cat_age_rgs_all_data')

split_train_test age_target done
study_age_3 Cat_age_rgs_all_data fit done
study_age_3 Cat_age_rgs_all_data pred submit done


In [18]:
pd.read_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit_age.csv')

Unnamed: 0,user_id,age,is_male
0,27,1,0.234487
1,83,0,0.872940
2,100,1,0.787966
3,115,0,0.382228
4,171,1,0.818335
...,...,...,...
144719,415180,2,0.460594
144720,415195,1,0.641969
144721,415248,1,0.536050
144722,415267,2,0.586211


In [19]:
get_dataset('df_age_metrics.parquet').set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,F1_weighted
model,Unnamed: 1_level_1
Cat_age_base,0.451353
Cat_age_rgs,0.45403
Cat_age_rgs_all_data,0.55611


In [20]:
get_dataset('df_age_overfitting.parquet').set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,F1_weighted_train,F1_weighted_test,delta
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cat_age_base,0.5141,0.4514,13.9 %
Cat_age_rgs,0.5637,0.454,24.1 %
Cat_age_rgs_all_data,0.5527,0.5561,0.6 %


In [22]:
# df_metrics = get_dataset('df_age_metrics.parquet')[:3]
# df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_metrics.parquet")

# df_overfitting = get_dataset('df_age_overfitting.parquet')[:3]
# df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_age_overfitting.parquet")  

## 5.2.1. Получим оценку по полу

### CatBoostClassifier baseline

In [21]:
def get_metrics_gini(y_test, y_pred, name):  
    
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
        
    df_metrics['Gini'] =  float('{:.4f}'.format(2 * mtrcs.roc_auc_score(y_test, y_pred) - 1))
#     df_metrics['Gini'] =  int(f'{2 * mtrcs.roc_auc_score(y_test, y_pred) - 1:2.3f}')
#     df_metrics['F1_weighted'] = f1_score(y_test, y_pred, average="weighted") 
    return df_metrics

In [22]:
def check_gini_overfitting(model, X_train, y_train, X_test, y_test, metric_fun, name):
    """
    Проверка на overfitting для классификации
    """
    df_metrics = pd.DataFrame()
    
    df_metrics['model'] = [name]
    
    y_pred_train = model.predict_proba(X_train)[:,1]
    y_pred_test = model.predict_proba(X_test)[:,1]
    value_train = 2 * metric_fun(y_train, y_pred_train) - 1
    value_test = 2 * metric_fun(y_test, y_pred_test) - 1

    df_metrics["Gini_train"] = float('{:.4f}'.format(value_train))
    df_metrics["Gini_test"] = float('{:.4f}'.format(value_test))
    df_metrics["delta"] = f'{(abs(value_train - value_test)/value_test*100):.1f} %'
    
#     print(f'Gini train: %.3f' % value_train)
#     print(f'Gini test: %.3f' % value_test)
#     print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')
    return df_metrics

In [23]:
def study_gender_1(study_name: Text):
    """
    """

    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_is_male.parquet",
        "is_male")
    
    cat_params = {'n_estimators' : 1500,
              'random_state' : RAND,
              'early_stopping_rounds' : 100,
              'cat_features' : cat_features
             }

    model = CatBoostClassifier(**cat_params,
                             task_type="GPU")

    model.fit(X_train,
            y_train,
            eval_set=eval_set,
            verbose=False)
    
    y_pred = model.predict_proba(X_test)[:,1]
    
    print(f"study_1 {study_name} fit")
    
    # Метрики
    df_metrics = get_metrics_gini(y_test=y_test,
                                  y_pred=y_pred,
                                  name=study_name)
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_metrics.parquet")
    
    # Переобучение
    df_overfitting = check_gini_overfitting(model, 
                                            X_train, 
                                            y_train, 
                                            X_test, 
                                            y_test, 
                                            roc_auc_score,
                                            study_name)
    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_overfitting.parquet")    
    
    print(f"study_1 {study_name} done")
    
#     return df_metrics, df_overfitting

In [24]:
study_gender_1("Cat_gender_base")

split_train_test is_male done
study_1 Cat_gender_base fit
study_1 Cat_gender_base done


In [27]:
get_dataset('df_gender_metrics.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini
0,Cat_gender_base,0.7247


In [28]:
get_dataset('df_gender_overfitting.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini_train,Gini_test,delta
0,Cat_gender_base,0.7534,0.7247,4.0 %


подбор оптимальных значений для rgs
<!-- 
# 'n_estimators' : 250
# GINI по полу 0.693
# learning_rate : 0.08112899959087372

# 'n_estimators' : 1500
# GINI по полу 0.725
# learning_rate : 0.03863900154829025

# 'n_estimators' : 2500
# GINI по полу 0.730
# learning_rate : 0.03127399832010269

# 'n_estimators' : 5000
# GINI по полу 0.735
# learning_rate : 0.023471999913454056 -->

### Подбор параметров с помощью randomized grid search

In [27]:
def func_rgs_gender():
    # если требуется подбор параметров
    grid = {
    "n_estimators": [1500], 
    "learning_rate": [0.03863900154829025],
    "boosting_type" : ['Plain','Ordered'], 
    "bootstrap_type" : ["Bayesian", "Bernoulli"], # ,"MVS" прерывает работу rgs 
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
    "l2_leaf_reg": np.arange(0.1, 1, 0.05), # , "None"
    "random_strength": [1, 2, 5, 10, 20, 50, 100, "None"],
    "random_state": [RAND]
    }
    
    model = CatBoostClassifier(silent=True,
                               cat_features=cat_features,
                               task_type="GPU",
                               early_stopping_rounds=100
                               )
    
    grid_search_result = model.randomized_search(grid,
                                                 X=X_train,
                                                 y=y_train,
                                                 cv=5, # 
                                                 n_iter=50, #
                                                 refit=True, # 
                                                 shuffle=True, #
                                                 stratified=True, #
                                                 calc_cv_statistics=True, ##
                                                 search_by_train_test_split=True, ##
                                                 verbose=False,
                                                 plot=False) # True
    
    joblib.dump(grid_search_result['params'], f'{LOCAL_DATA_PATH}/{PREP_DATA}/grid_search_result_gender.pkl')
    
    print("rgs gender done")
    
    return grid_search_result['params']

### CatBoostClassifier grid_search_result

Получаем модель по предсказанию пола на основе подобранных параметров

In [30]:
def study_gender_2(study_name: Text):
    """
    """


    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_is_male.parquet",
        "is_male")
    
    # Если требуется подбор параметров #################
#     grid_search_result = func_rgs_gender()

    # пользуемся уже подобранными параметрами rgs
    grid_search_result = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/grid_search_result_gender.pkl')    
    
    # тюнинг модели подобранными параметрами
    cat_rgs = CatBoostClassifier(**grid_search_result,
                                 task_type="GPU")
    
    cat_rgs.fit(X_train_,
                 y_train_,
                 cat_features=cat_features,
                 eval_set=eval_set,
                 verbose=False,
                 early_stopping_rounds=100)
    y_pred = cat_rgs.predict_proba(X_test)[:,1]  
    
    print(f"study_gender_2 {study_name} fit")    
    
    # saving model
    joblib.dump(cat_rgs, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_rgs_gender.pkl')    


    # датасеты с метриками моделей  
    # Добавление метрик
    df_metrics = get_dataset('df_gender_metrics.parquet')
    df_metrics = pd.concat([df_metrics, get_metrics_gini(y_test=y_test,
                                                         y_pred=y_pred,
                                                         name=study_name)
                           ])
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_metrics.parquet")
    
    # Переобучение
    df_overfitting = get_dataset('df_gender_overfitting.parquet')
    df_overfitting = pd.concat([df_overfitting, check_gini_overfitting(cat_rgs,
                                                                       X_train,
                                                                       y_train, 
                                                                       X_test, 
                                                                       y_test, 
                                                                       roc_auc_score,
                                                                       study_name)
                               ])
    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_overfitting.parquet")    
    
#     return df_metrics, df_overfitting, grid_search_result

In [31]:
study_gender_2('Cat_genedr_rgs')

split_train_test is_male done
study_gender_2 Cat_genedr_rgs fit


In [32]:
get_dataset('df_gender_metrics.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini
0,Cat_gender_base,0.7247
0,Cat_genedr_rgs,0.7307


In [33]:
get_dataset('df_gender_overfitting.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini_train,Gini_test,delta
0,Cat_gender_base,0.7534,0.7247,4.0 %
0,Cat_genedr_rgs,0.8135,0.7307,11.3 %


## 5.2.2. Обучение на всей подготовленной выборке по полу.

## 5.2.3. Предсказание пола по submit.

In [34]:
X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_is_male.parquet",
        "is_male")

split_train_test is_male done


In [35]:
dataset = get_dataset("prep_data_with_targets_is_male.parquet")
cat_features = dataset.select_dtypes('category').columns.tolist()
X = dataset.drop(['user_id', 'is_male'], axis=1)
y = dataset['is_male']
del dataset

In [36]:
# load moel
cat_rgs_all = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_rgs_gender.pkl')

cat_rgs_all.fit(X, 
                y,
                cat_features,
                verbose = False);
    
y_pred = cat_rgs_all.predict_proba(X_test)[:,1] 
# print(f"study_gender_3 {study_name} fit") 

In [40]:
# предсказание возраста по сабмитам
df_submit = get_dataset('df_submit.parquet')
    
fin_submit = pd.read_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit.csv')
fin_submit['is_male'] = cat_rgs_all.predict_proba(df_submit.drop(['user_id'], axis = 1))[:,1]
    
print(f"study_gender_3 {study_name} pred submit done")  

In [86]:
# датасеты с метриками моделей  
# Добавление метрик
df_metrics = get_dataset('df_gender_metrics.parquet')
df_metrics = df_metrics.append(
    get_metrics_gini(y_test=y_test,
                     y_pred=y_pred,
                     name=study_name))
df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_metrics.parquet")
    
Переобучение
df_overfitting = get_dataset('df_gender_overfitting.parquet')
df_overfitting = pd.concat([df_overfitting, 
                            check_gini_overfitting(cat_rgs_all,
                                                   X_train,
                                                   y_train, 
                                                   X_test, 
                                                   y_test, 
                                                   roc_auc_score,
                                                   study_name)
                           ])

df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_overfitting.parquet")  

In [87]:
df_metrics

Unnamed: 0,model,Gini
0,cat_base_gender,0.7246
0,cat_rgs_gender,0.731
0,cat_rgs_all_gender,0.7964


---

In [41]:
def study_gender_3(study_name: Text):
    """
    """

    # get params
#     with open(config_path) as file:
#         config = yaml.load(file, Loader=yaml.FullLoader)
#     preprocessing_config = config["preprocessing"]
#     train_config = config["train"]
    
#     target=preprocessing_config["target_column"]


    X_train, X_test, y_train, y_test, eval_set, cat_features, X_train_, y_train_ = split_train_test(
        "prep_data_with_targets_is_male.parquet",
        "is_male")
    
    dataset = get_dataset("prep_data_with_targets_is_male.parquet")
    cat_features = dataset.select_dtypes('category').columns.tolist()
    X = dataset.drop(['user_id', 'is_male'], axis=1)
    y = dataset['is_male']
    del dataset

    # load moel
    cat_rgs_all = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_rgs_gender.pkl')

    cat_rgs_all.fit(X, 
                    y,
                    cat_features,
                    verbose = False);
    
    y_pred = cat_rgs_all.predict_proba(X_test)[:,1] 
    
    print(f"study_gender_3 {study_name} fit")    
    
    # save model
    joblib.dump(cat_rgs_all, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_gender.pkl')    
    
    
    # предсказание возраста по сабмитам
    df_submit = get_dataset('df_submit.parquet')
    
    fin_submit = pd.read_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit.csv')
    fin_submit['is_male'] = cat_rgs_all.predict_proba(df_submit.drop(['user_id'], axis = 1))[:,1]

    fin_submit.to_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit_age.csv', index = False)
    
    print(f"study_gender_3 {study_name} pred submit done")  
    
    
    # датасеты с метриками моделей  
    # Добавление метрик
    df_metrics = get_dataset('df_gender_metrics.parquet')
    df_metrics = pd.concat([df_metrics, get_metrics_gini(y_test=y_test,
                                                         y_pred=y_pred,
                                                         name=study_name)
                           ])
    df_metrics.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_metrics.parquet")
    
    # Переобучение
    df_overfitting = get_dataset('df_gender_overfitting.parquet')
    df_overfitting = pd.concat([df_overfitting, check_gini_overfitting(cat_rgs_all,
                                                                       X_train,
                                                                       y_train, 
                                                                       X_test, 
                                                                       y_test, 
                                                                       roc_auc_score,
                                                                       study_name)
                               ])

    df_overfitting.to_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/df_gender_overfitting.parquet")    
    
#     return df_metrics, df_overfitting, grid_search_result

In [42]:
study_gender_3("Cat_gener_rgs_all_data")

split_train_test is_male done
study_gender_3 Cat_gener_rgs_all_data fit
study_gender_3 Cat_gener_rgs_all_data pred submit done


In [36]:
get_dataset('df_gender_metrics.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini
0,Cat_gender_base,0.7247
0,Cat_genedr_rgs,0.7307
0,Cat_gener_rgs_all_data,0.7959


In [37]:
get_dataset('df_gender_overfitting.parquet')#.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0,model,Gini_train,Gini_test,delta
0,Cat_gender_base,0.7534,0.7247,4.0 %
0,Cat_genedr_rgs,0.8135,0.7307,11.3 %
0,Cat_gener_rgs_all_data,0.7965,0.7959,0.1 %


# Предсказание по  случайным данным

In [22]:
def rand_os_name() -> list:
    """
    ф-ция возвращает список со случайным
    произодителем и операционной системой
    """
    # загрузка данных
    df = get_dataset('df_submit.parquet')
    
    # словаврь со всеми производителями и ос
    my_dict = {
        "name": df['cpe_manufacturer_name'].unique().to_list(),
        "type": df['cpe_model_os_type'].unique().to_list()
    }
    
    # выбор производителя
    rnd = len(my_dict['name'])
    rand_pred_name = my_dict['name'][randint(0, rnd-1)]
    # rand_pred_name = my_dict['name'][1] # если нужен Apple и iOS

    if rand_pred_name == 'Apple':
        rand_pred_os = my_dict['type'][1]
        return rand_pred_name, rand_pred_os
    else:
        rand_pred_os = my_dict['type'][0]
        return rand_pred_name, rand_pred_os

In [23]:
def rand_data_pred() -> pd.DataFrame:
    """
    ф-ция возвращает датасет с 1 строкой
    заполненной случайным орбазом
    """
    # загрузка данных
    df = get_dataset('df_submit.parquet').drop(['user_id'], axis = 1)[:1] # 
    
    # Заполнение всех признаков случайным числом
    for col in df:
        df[col] = randint(0, 1)

    # Заполнение произв и ос случайными значениями
    name, os_type = rand_os_name()
    df[['cpe_manufacturer_name']] = name
    df[['cpe_model_os_type']] = os_type
    
    return df

In [25]:
cat_rgs_age = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_age.pkl')

cat_rgs_gendr = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_gender.pkl')

In [26]:
df = rand_data_pred()
df['is_male'] = cat_rgs_gendr.predict_proba(df)[:,1]#.drop(['user_id'], axis = 1))
df['age'] = cat_rgs_age.predict(df)#.drop(['user_id'], axis = 1))
df[['cpe_manufacturer_name','cpe_model_os_type','is_male','age']]

Unnamed: 0,cpe_manufacturer_name,cpe_model_os_type,is_male,age
0,Yandex LLC,Android,0.998476,1


## Примеры предсказаний по случайным данным

In [42]:
df[['cpe_manufacturer_name','cpe_model_os_type','is_male','age']]

Unnamed: 0,cpe_manufacturer_name,cpe_model_os_type,is_male,age
0,Samsung,Android,0.363823,0


In [47]:
df[['cpe_manufacturer_name','cpe_model_os_type','is_male','age']]

Unnamed: 0,cpe_manufacturer_name,cpe_model_os_type,is_male,age
0,HTC,Android,0.176554,2


In [31]:
df[['cpe_manufacturer_name','cpe_model_os_type','is_male','age']]

Unnamed: 0,cpe_manufacturer_name,cpe_model_os_type,is_male,age
0,LeEco,Android,0.990112,2


# 6. Score.

Метрика соревнования — ROC-AUC – для определения пола, f1 weighted – для определения возраста. 

Все решения рассчитываются по формуле - 2 * f1_weighted(по 6 возрастным бакетам) + gini по полу.

In [45]:
pd.read_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit_age.csv')

Unnamed: 0,user_id,age,is_male
0,27,1,0.235905
1,83,0,0.872758
2,100,1,0.803667
3,115,0,0.380761
4,171,1,0.824334
...,...,...,...
144719,415180,2,0.467285
144720,415195,1,0.644674
144721,415248,1,0.530685
144722,415267,2,0.583750


In [46]:
df1 = get_dataset('df_age_metrics.parquet').reset_index(drop = True)
df1.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,F1_weighted
model,Unnamed: 1_level_1
Cat_age_base,0.451353
Cat_age_rgs,0.45403
Cat_age_rgs_all_data,0.55611


In [47]:
df2 = get_dataset('df_gender_metrics.parquet').reset_index(drop = True)
df2.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,Gini
model,Unnamed: 1_level_1
Cat_gender_base,0.7247
Cat_genedr_rgs,0.7307
Cat_gener_rgs_all_data,0.7959


baseline

In [48]:
2 * df1['F1_weighted'][0] + df2['Gini'][0]

1.6274055435251666

random grid search best params

In [49]:
2 * df1['F1_weighted'][1] + df2['Gini'][1]

1.638760255923968

вся выборка

In [50]:
2 * df1['F1_weighted'][2] + df2['Gini'][2]

1.908119652856524

на лучших параметрах по rgs и всей выборке модель предсказания возраста переобучается (delta 10% - 20%)