# Imports

In [1]:
import os

import pandas as pd
import numpy as np
import scipy as stats
import joblib
import time
import bisect
import gc
import sklearn.metrics as mtrcs
import optuna

from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.preprocessing import label_binarize

from sklearn.metrics import f1_score, accuracy_score, \
roc_auc_score, classification_report, r2_score, precision_score, recall_score, \
log_loss

import warnings
warnings.filterwarnings("ignore");

RAND = 10
N_FOLDS = 5

In [2]:
from get_metrics import get_metrics_classification, roc_auc_score, f1_score
from sklearn.metrics import f1_score, accuracy_score, \
roc_auc_score, classification_report, r2_score, precision_score, recall_score, \
log_loss

In [3]:
def check_overfitting(model, X_train, y_train, X_test, y_test, metric_fun):
    """
    Проверка на overfitting для классификации
    """
    y_pred_train = model.predict_proba(X_train)[:,1]
    y_pred_test = model.predict_proba(X_test)[:,1]
    value_train = metric_fun(y_train, y_pred_train)
    value_test = metric_fun(y_test, y_pred_test)

    print(f'{metric_fun.__name__} train: %.3f' % value_train)
    print(f'{metric_fun.__name__} test: %.3f' % value_test)
    print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')


## Расположение папок с данными

In [4]:
LOCAL_DATA_PATH = "../data"
DATA_FILE = 'competition_data_final_pqt'

# целевые переменные
TARGET_FILE_AGE = 'targets_age_prep.parquet'
TARGET_FILE_MALE = 'targets_is_male_prep.parquet'

# id, по которым нужно предсказать пол и возраст
SUBMIT_FILE = 'submit_2.pqt'

# папка, куда будут сохраняться предобработанные данные
PREP_DATA = 'preprocessed_data'

LOCAL_DATA_PATH_mts = "../data/ml_cup_data"

# 5. ML parts

## 5.1.1. Получим оценку по возрасту

### CatBoostClassifier baseline

In [5]:
# load
df = pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/prep_data_with_targets_age.parquet")

cat_features = df.select_dtypes('category').columns.tolist()

# df[['user_id','age_target']].head()

In [6]:
X = df.drop(['user_id', 'age_target'], axis=1)
y = df['age_target']

# Тестовые
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    stratify=y,
                                                    random_state = RAND)

# Валидационные
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    random_state=RAND)

eval_set = [(X_val, y_val)]

In [7]:
cat_params = {'n_estimators' : 1500,
              'random_state' : RAND,
              'early_stopping_rounds' : 100,
              'custom_loss' : ['TotalF1'],
              'cat_features' : cat_features
}

In [8]:
%%time

clf = CatBoostClassifier(**cat_params,
                         task_type="GPU")


clf.fit(X_train,
        y_train,
        eval_set=eval_set,
        verbose=False)

y_pred = clf.predict(X_test)

Wall time: 1min 35s


In [9]:
f1 = f1_score(y_test, y_pred, average="weighted")
print("age f1_weighted score: {}".format( round( f1, 4 ) ))

age f1_weighted score: 0.4514


In [10]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
value_train = f1_score(y_train, y_pred_train, average="weighted")
value_test = f1_score(y_test, y_pred_test, average="weighted")

print(f'f1_score_weighted train: %.4f' % value_train)
print(f'f1_score_weighted test: %.4f' % value_test)
print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

f1_score_weighted train: 0.5141
f1_score_weighted test: 0.4514
delta = 13.9 %


In [11]:
clf.get_best_score()

{'learn': {'TotalF1': 0.5141637935142694, 'MultiClass': 1.150442939921801},
 'validation': {'TotalF1': 0.5094104165290579,
  'MultiClass': 1.1497287234160292}}

In [12]:
# 250
# learning_rate : 0.32709699869155884
# age f1_weighted score: 0.4428 

# 500
# age f1_weighted score: 0.4463
# f1_score_weighted train: 0.494
# f1_score_weighted test: 0.446
# delta = 10.7 %  

# 1000
# learning_rate : 0.1784750074148178
# age f1_weighted score: 0.4502

# 1500
# learning_rate : 0.11958499997854231
# age f1_weighted score: 0.4514
# f1_score_weighted train: 0.514
# f1_score_weighted test: 0.451
# delta = 13.9 %

# 2500
# f1_score_weighted train: 0.526
# f1_score_weighted test: 0.454
# delta = 15.9 %

# 5000
# learning_rate : 0.08833400160074234
# age f1_weighted score: 0.457
# f1_score_weighted train: 0.543
# f1_score_weighted test: 0.457
# delta = 18.8 %

In [12]:
# for key,value in clf.get_all_params().items(): 
#     print(key, ':', value)

In [13]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(clf, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_1_age_baseline.pkl')

['../data/preprocessed_data/model_1_age_baseline.pkl']

### Подбор параметров с помощью randomized grid search

In [15]:
grid = {
    "n_estimators": [1500],
    "learning_rate": [0.11958499997854231],
    "boosting_type" : ['Ordered', 'Plain'], #
    "bootstrap_type" : ["Bayesian", "Bernoulli", "MVS"], 
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
    "custom_metric" : ['F1'],
    "max_depth" : list(range(6, 10)),
    "l2_leaf_reg" : [*np.arange(1, 10)], #
    "random_state": [RAND]
}


model = CatBoostClassifier(silent=True,
                           cat_features=cat_features,
                           task_type="GPU",
                           early_stopping_rounds=100
                           )

grid_search_result = model.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             cv=5, # 
                                             n_iter=50, #
                                             refit=True, # 
                                             shuffle=True, #
                                             stratified=True, #
                                             calc_cv_statistics=True, ##
                                             search_by_train_test_split=True, ##
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 1.264231126
bestIteration = 1477
0:	loss: 1.2642311	best: 1.2642311 (0)	total: 1m 29s	remaining: 1h 12m 48s
bestTest = 1.249981842
bestIteration = 1499
1:	loss: 1.2499818	best: 1.2499818 (1)	total: 3m 48s	remaining: 1h 31m 12s
bestTest = 1.263247819
bestIteration = 1457
2:	loss: 1.2632478	best: 1.2499818 (1)	total: 5m 9s	remaining: 1h 20m 55s
bestTest = 1.253952629
bestIteration = 1499
3:	loss: 1.2539526	best: 1.2499818 (1)	total: 6m 26s	remaining: 1h 14m 6s
bestTest = 1.270303515
bestIteration = 1499
4:	loss: 1.2703035	best: 1.2499818 (1)	total: 8m 6s	remaining: 1h 12m 59s
bestTest = 1.271238471
bestIteration = 1497
5:	loss: 1.2712385	best: 1.2499818 (1)	total: 9m 12s	remaining: 1h 7m 28s
bestTest = 1.264745865
bestIteration = 1499
6:	loss: 1.2647459	best: 1.2499818 (1)	total: 10m 32s	remaining: 1h 4m 42s
bestTest = 1.250409218
bestIteration = 1499
7:	loss: 1.2504092	best: 1.2499818 (1)	total: 12m 50s	remaining: 1h 7m 23s
bestTest = 1.26047024
bestIteration = 1475
8:	loss: 

In [16]:
grid_search_result['params']

{'random_seed': 10,
 'depth': 8,
 'l2_leaf_reg': 8,
 'iterations': 1500,
 'learning_rate': 0.11958499997854231,
 'grow_policy': 'SymmetricTree',
 'boosting_type': 'Plain',
 'custom_metric': 'F1',
 'bootstrap_type': 'Bernoulli'}

### CatBoostClassifier grid_search_result

Получаем модель по предсказанию возрастных классов на основе подобранных параметров

In [17]:
%%time

cat_grid = CatBoostClassifier(**grid_search_result['params'], 
                              task_type="GPU",
                              loss_function='MultiClass')
cat_grid.fit(X_train_,
             y_train_,
             cat_features=cat_features,
             eval_set=eval_set,
             verbose=False,
             early_stopping_rounds=100)

y_pred = cat_grid.predict( X_test )
# было до подбора 0.4514 1500

# 0.4428 baseline 250 8%
# 0.4362 все параметры
# 0.4431 без градиента и ньютона
# 0.443 18%

Wall time: 2min 39s


In [18]:
f1_2 = f1_score(y_test, y_pred, average="weighted")
print("age f1_weighted score: {}".format( round( f1_2, 4 ) ))
# было до подбора 0.4514 1500

age f1_weighted score: 0.454


In [19]:
y_pred_train = cat_grid.predict(X_train)
y_pred_test = cat_grid.predict(X_test)
value_train = f1_score(y_train, y_pred_train, average="weighted")
value_test = f1_score(y_test, y_pred_test, average="weighted")

print(f'f1_score_weighted train: %.4f' % value_train)
print(f'f1_score_weighted test: %.4f' % value_test)
print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

f1_score_weighted train: 0.5637
f1_score_weighted test: 0.4540
delta = 24.1 %


после подбора параметров метрика немного выросла, сделаем предсказание нужных сабмитов

In [20]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(cat_grid, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_age.pkl')

['../data/preprocessed_data/model_cat_grid_age.pkl']

## 5.1.2. Обучение на всей подготовленной выборке по возрасту.

In [21]:
# load
clf = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_age.pkl')

In [22]:
# Загрузка данных с возрастом
df = pd.read_parquet(f'{LOCAL_DATA_PATH}/{PREP_DATA}/prep_data_with_targets_age.parquet')

cat_features = df.select_dtypes('category').columns.tolist()
# df.info()

In [23]:
X = df.drop(['user_id', 'age_target'], axis = 1)
y = df['age_target']

In [24]:
%%time

clf.fit(X, 
        y,
        cat_features,
        verbose = False);

y_pred = clf.predict(X_test)

Wall time: 2min 51s


<catboost.core.CatBoostClassifier at 0x28d44064c40>

In [25]:
y_pred = clf.predict(X_test)
f1_3 = f1_score(y_test, y_pred, average="weighted")
print("age f1_weighted score: {}".format( round( f1_3, 4 ) ))

age f1_weighted score: 0.5888


In [27]:
print(f"Model is fitted: {str(clf.is_fitted())}")
# print(f"Model params: {clf.get_params()}")
print(clf.get_best_score())

Model is fitted: True
{'learn': {'F1:class=3': 0.4972151052690208, 'F1:class=1': 0.6692064749476231, 'F1:class=5': 0.31211805038008644, 'F1:class=0': 0.6000638705556738, 'F1:class=4': 0.4842171059485364, 'F1:class=2': 0.5937297322338553, 'MultiClass': 1.0320477749226513}}


In [29]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(clf, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_age.pkl')

['../data/preprocessed_data/model_fin_age.pkl']

## 5.1.3. Предсказание возраста по submit.

In [30]:
# load
# модель
clf = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_age.pkl')

# id с данными, которые нужно предсказать
df_submit = pd.read_parquet(f'{LOCAL_DATA_PATH}/{PREP_DATA}/df_submit.parquet')
# df_submit.head()

In [31]:
fin_submit = df_submit[['user_id']]

In [32]:
# предсказание возрастных классов по сабмиту
fin_submit['age'] = clf.predict(df_submit.drop(['user_id'], axis = 1))

In [33]:
fin_submit.head()

Unnamed: 0,user_id,age
0,27,1
1,83,1
2,100,1
3,115,0
4,171,1


In [34]:
fin_submit.to_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit_age.csv', index = False)

## 5.2.1. Получим оценку по полу

### CatBoostClassifier baseline

In [40]:
# load
df = pd.read_parquet(f"{LOCAL_DATA_PATH}/{PREP_DATA}/prep_data_with_targets_is_male.parquet")

cat_features = df.select_dtypes('category').columns.tolist()
# df[['user_id','is_male']].head()

In [41]:
X = df.drop(['user_id', 'is_male'], axis=1)
y = df['is_male']

# Тестовые
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    stratify=y,
                                                    random_state = RAND)

# Валидационные
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    random_state=RAND)

eval_set = [(X_val, y_val)]

In [42]:
cat_params = {'n_estimators' : 1500,
              'random_state' : RAND,
              'early_stopping_rounds' : 100,
              'cat_features' : cat_features
             }

In [43]:
%%time

clf = CatBoostClassifier(**cat_params,
                         task_type="GPU")


clf.fit(X_train,
        y_train,
        eval_set=eval_set,
        verbose=False)

y_pred = clf.predict_proba(X_test)[:,1]

Wall time: 1min 9s


In [44]:
print(f'GINI по полу {2 * mtrcs.roc_auc_score(y_test, y_pred) - 1:2.3f}')

GINI по полу 0.724


In [45]:
check_overfitting(clf, X_train, y_train, X_test, y_test, roc_auc_score)

roc_auc_score train: 0.877
roc_auc_score test: 0.862
delta = 1.7 %


In [46]:
print(f'gini train {2*0.877-1}')
print(f'gign test {2*0.862-1}')

gini train 0.754
gign test 0.724


In [28]:
# for key,value in clf.get_all_params().items(): 
#     print(key, ':', value)

In [31]:
# подбор оптимальных значений для сетки

# 'n_estimators' : 250
# GINI по полу 0.693
# learning_rate : 0.08112899959087372

# 'n_estimators' : 1500
# GINI по полу 0.725
# learning_rate : 0.03863900154829025

# 'n_estimators' : 2500
# GINI по полу 0.730
# learning_rate : 0.03127399832010269

# 'n_estimators' : 5000
# GINI по полу 0.735
# learning_rate : 0.023471999913454056

In [32]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(clf, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_1_is_male_baseline.pkl')

['../data/preprocessed_data/model_1_is_male_baseline.pkl']

### Подбор параметров с помощью randomized grid search

In [33]:
grid = {
    "n_estimators": [1500], 
    "learning_rate": [0.03863900154829025],
    "boosting_type" : ['Plain','Ordered'], 
    "bootstrap_type" : ["Bayesian", "Bernoulli"], # ,"MVS" прерывает работу rgs 
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
    "l2_leaf_reg": np.arange(0.1, 1, 0.05), # , "None"
    "random_strength": [1, 2, 5, 10, 20, 50, 100, "None"],
    "random_state": [RAND]
}


model = CatBoostClassifier(silent=True,
                           cat_features=cat_features,
                           task_type="GPU",
                           early_stopping_rounds=100
                           )

grid_search_result = model.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             cv=5, # 
                                             n_iter=50, # 
                                             refit=True, # 
                                             shuffle=True, #
                                             stratified=True, #
                                             calc_cv_statistics=True, ##
                                             search_by_train_test_split=True, ##
                                             verbose=True,
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.469961954
bestIteration = 1499
0:	loss: 0.4699620	best: 0.4699620 (0)	total: 1m 3s	remaining: 51m 58s
bestTest = 0.4681199067
bestIteration = 1499
1:	loss: 0.4681199	best: 0.4681199 (1)	total: 2m 2s	remaining: 49m 7s
bestTest = 0.4681169948
bestIteration = 1499
2:	loss: 0.4681170	best: 0.4681170 (2)	total: 3m 2s	remaining: 47m 33s
bestTest = 0.4811701596
bestIteration = 1499
3:	loss: 0.4811702	best: 0.4681170 (2)	total: 6m 34s	remaining: 1h 15m 41s
bestTest = 0.4755494058
bestIteration = 1499
4:	loss: 0.4755494	best: 0.4681170 (2)	total: 7m 29s	remaining: 1h 7m 25s
bestTest = 0.4689910566
bestIteration = 1499
5:	loss: 0.4689911	best: 0.4681170 (2)	total: 8m 28s	remaining: 1h 2m 7s
bestTest = 0.4757796087
bestIteration = 1499
6:	loss: 0.4757796	best: 0.4681170 (2)	total: 9m 23s	remaining: 57m 38s
bestTest = 0.4693329197
bestIteration = 1499
7:	loss: 0.4693329	best: 0.4681170 (2)	total: 10m 20s	remaining: 54m 17s
bestTest = 0.4752940012
bestIteration = 1499
8:	loss: 0.475294

In [53]:
grid_search_result['params']

{'random_seed': 10,
 'random_strength': 1,
 'iterations': 1500,
 'learning_rate': 0.03863900154829025,
 'l2_leaf_reg': 0.7000000000000002,
 'grow_policy': 'Lossguide',
 'boosting_type': 'Plain',
 'bootstrap_type': 'Bernoulli'}

### CatBoostClassifier grid_search_result

Получаем модель по предсказанию пола на основе подобранных параметров

In [59]:
%%time

cat_grid = CatBoostClassifier(**grid_search_result['params'], 
                              task_type="GPU")
cat_grid.fit(X_train_,
             y_train_,
             cat_features=cat_features,
             eval_set=eval_set,
             verbose=False,
             early_stopping_rounds=100)


y_pred = cat_grid.predict_proba(X_test)[:,1]

Wall time: 1min 5s


In [60]:
g = float('{:.3f}'.format(2 * (mtrcs.roc_auc_score(y_test, y_pred)) - 1))
print(f'GINI по полу: {g}')

GINI по полу: 0.731


In [61]:
check_overfitting(cat_grid, X_train, y_train, X_test, y_test, roc_auc_score)

roc_auc_score train: 0.907
roc_auc_score test: 0.865
delta = 4.8 %


In [62]:
print(f'gini train {2*0.907-1}')
print(f'gign test {2*0.865-1}')

gini train 0.8140000000000001
gign test 0.73


In [63]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(cat_grid, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_is_male.pkl')

['../data/preprocessed_data/model_cat_grid_is_male.pkl']

## 5.2.2. Обучение на всей подготовленной выборке по полу.

In [64]:
# load
clf = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_cat_grid_is_male.pkl')

In [65]:
# Загрузка данных с учителем
df = pd.read_parquet(f'{LOCAL_DATA_PATH}/{PREP_DATA}/prep_data_with_targets_is_male.parquet')

cat_features = df.select_dtypes('category').columns.tolist()

In [66]:
X = df.drop(['user_id', 'is_male'], axis = 1)
y = df['is_male']

In [67]:
%%time

clf.fit(X, 
        y,
        cat_features,
        verbose = False);

Wall time: 1min 18s


<catboost.core.CatBoostClassifier at 0x1e537bb6be0>

In [69]:
##########################
# SAVE-LOAD using joblib #
##########################
# save
joblib.dump(clf, f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_is_male.pkl')

['../data/preprocessed_data/model_fin_is_male.pkl']

## 5.2.3. Предсказание возраста по submit.

In [36]:
# load
# модель
clf = joblib.load(f'{LOCAL_DATA_PATH}/{PREP_DATA}/model_fin_is_male.pkl')

# id с данными, которые нужно предсказать
df_submit = pd.read_parquet(f'{LOCAL_DATA_PATH}/{PREP_DATA}/df_submit.parquet')

# предсказания возраста
# fin_submit =  pd.read_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/fin_submit_age.csv')

In [37]:
# предсказание гендера по сабмиту
fin_submit['is_male'] = clf.predict_proba(df_submit.drop(['user_id'], axis=1))[:,1]

In [38]:
fin_submit.head()

Unnamed: 0,user_id,age,is_male
0,27,1,0.249684
1,83,1,0.873754
2,100,1,0.771891
3,115,0,0.413567
4,171,1,0.838263


In [39]:
fin_submit.to_csv(f'{LOCAL_DATA_PATH}/{PREP_DATA}/submission.csv', index = False)

# 6. Score.

Метрика соревнования — ROC-AUC – для определения пола, f1 weighted – для определения возраста. 

Все решения рассчитываются по формуле - 2 * f1_weighted(по 6 возрастным бакетам) + gini по полу.

baseline

- age f1_weighted score: 0.4514
- GINI по полу 0.724

In [47]:
2*0.4514 + 0.724

1.6268

random grid search best params

- age f1_weighted score: 0.4473
- GINI по полу: 0.731

In [21]:
2* 0.454 + 0.731

1.639

вся выборка

In [22]:
2 * 0.5888 + 0.731

1.9085999999999999

на лучших параметрах по rgs и всей выборке модель предсказания возраста переобучается (delta 20%)