In [2]:
import xlearn as xl

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ParameterGrid
from scipy.sparse import hstack

In [3]:
PATH = '/content/drive/MyDrive/Recsys'
data = pd.read_csv(PATH + '/data.csv')
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


### Подготовка данных

In [6]:
def train_val_test_split(data: pd.DataFrame):
    """
    Разбивает данные на тренировочную, валидационную и тестовую выборки.
    В тестовую часть попадает последний день, в валидационную - предпоследний
    """
    test_day = data['date_time'].dt.date.max()
    val_day = test_day - pd.Timedelta(days=1)

    Test = data.loc[data['date_time'].dt.date == test_day]
    Val = data.loc[data['date_time'].dt.date == val_day]
    Train = data.loc[data['date_time'].dt.date < val_day]

    Test = Test.drop(['date_time'], axis=1)
    Val = Val.drop(['date_time'], axis=1)
    Train = Train.drop(['date_time'], axis=1)

    return Train, Val, Test

In [7]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """
    Создает фичи и подготовливает категориальные и числовые признаки для подачи в модель.
    """

    # Выделим день недели и час в отдельные переменные
    data['date_time'] = pd.to_datetime(data['date_time'], format="%Y-%m-%d %H:%M:%S.%f")
    data['weekday'] = data['date_time'].dt.weekday
    data['hour'] = data['date_time'].dt.hour
    # Так как переменная hour циклична, трансформируем ее в 2 отдельные фичи
    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
    # Возьмем логарифм campaign_clicks, чтобы немного сгладить смещение
    data['log_campaign_clicks'] = np.log10(data['campaign_clicks'].to_numpy() + 1)

    # Удалим ненужные переменные
    data = data.drop(columns = ['oaid_hash', 'banner_id0', 'banner_id1',
                                'rate0', 'rate1', 'g0', 'g1', 'campaign_clicks',
                                'coeff_sum0', 'coeff_sum1', 'impressions', 'hour'])

    return data

In [8]:
data_prepared = feature_engineering(data)

In [9]:
del data

In [10]:
Train, Val, Test = train_val_test_split(data_prepared)

In [11]:
Train.head()

Unnamed: 0,zone_id,banner_id,os_id,country_id,clicks,weekday,hour_sin,hour_cos,log_campaign_clicks
0,0,0,0,0,1,0,0.0,1.0,0.0
1,1,1,0,1,1,6,-0.5,0.866025,0.0
2,2,2,0,0,1,6,-0.258819,0.965926,0.60206
3,3,3,1,1,1,0,0.0,1.0,0.0
4,4,4,1,0,1,0,0.0,1.0,0.0


In [12]:
del data_prepared

### Преобразование в ffm формат

In [19]:
# https://github.com/wngaw/blog/blob/master/xlearn_example/src/utils.py

import json
import math


def convert_to_ffm(path, df, type, target, numerics, categories, features, encoder):
    # Flagging categorical and numerical fields
    print('convert_to_ffm - START')
    for x in numerics:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: numeric field - {x}')
            encoder['catdict'][x] = 0
    for x in categories:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: categorical field - {x}')
            encoder['catdict'][x] = 1

    nrows = df.shape[0]
    with open(path + str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[target]))  # Set Target Variable here

            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(encoder['catdict'].keys()):
                if(encoder['catdict'][x] == 0):
                    # Not adding numerical values that are nan
                    if math.isnan(datarow[x]) is not True:
                        datastring = datastring + " "+str(i)+":" + str(i)+":" + str(datarow[x])
                else:

                    # For a new field appearing in a training example
                    if(x not in encoder['catcodes']):
                        print(f'UPDATING CATCODES: categorical field - {x}')
                        encoder['catcodes'][x] = {}
                        encoder['currentcode'] += 1
                        #print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    # For already encoded fields
                    elif(datarow[x] not in encoder['catcodes'][x]):
                        encoder['currentcode'] += 1
                        #print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    code = encoder['catcodes'][x][datarow[x]]
                    datastring = datastring + " "+str(i)+":" + str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

    # print('Encoder Summary:')
    # print(json.dumps(encoder, indent=4))
    return encoder

In [20]:
categorical = ['zone_id', 'banner_id', 'os_id', 'country_id', 'weekday']
numerical = ['log_campaign_clicks', 'hour_cos', 'hour_sin']
full = categorical + numerical
target = 'clicks'

# Инициализируем кодировщик 
encoder = {"currentcode": len(numerical),
           "catdict": {},
           "catcodes": {}}

# Выделим реальные метки, чтобы потом удалить Val и Test и не занимать ими память
y_test = Test['clicks']
y_val = Val['clicks']

In [21]:
encoder = convert_to_ffm('', Train, 'Train', target, numerical, categorical, full, encoder)

convert_to_ffm - START
UPDATING CATDICT: numeric field - log_campaign_clicks
UPDATING CATDICT: numeric field - hour_cos
UPDATING CATDICT: numeric field - hour_sin
UPDATING CATDICT: categorical field - zone_id
UPDATING CATDICT: categorical field - banner_id
UPDATING CATDICT: categorical field - os_id
UPDATING CATDICT: categorical field - country_id
UPDATING CATDICT: categorical field - weekday
UPDATING CATCODES: categorical field - zone_id
UPDATING CATCODES: categorical field - banner_id
UPDATING CATCODES: categorical field - os_id
UPDATING CATCODES: categorical field - country_id
UPDATING CATCODES: categorical field - weekday


In [22]:
encoder = convert_to_ffm('', Val, 'Val', target, numerical, categorical, full, encoder)

convert_to_ffm - START


In [23]:
encoder = convert_to_ffm('', Test, 'Test', target, numerical, categorical, full, encoder)

convert_to_ffm - START


In [27]:
del Train,Val,Test

### Обучение модели

In [10]:
# Создадим словарь с параметрами
param_dict = {
    "task": ['binary'],
    "metric": ['auc'],
    "lr": [0.1, 0.2],
    "lambda": [0.0001, 0.001],
    "k" : [4, 6, 8]
}
param_grid = list(ParameterGrid(param_dict))

In [11]:
param_grid

[{'k': 4, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 4, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'},
 {'k': 4, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 4, 'lambda': 0.001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'},
 {'k': 6, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 6, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'},
 {'k': 6, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 6, 'lambda': 0.001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'},
 {'k': 8, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 8, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'},
 {'k': 8, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'},
 {'k': 8, 'lambda': 0.001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'}]

In [42]:
best_auc = 0
best_log_loss = np.inf
best_param = None

# Проходимся по наборам параметров и сохраняем лучший результат
for param in param_grid:
    print(param)
    ffm_model = xl.create_ffm()       
    ffm_model.setTrain("Train_ffm.txt")     
    ffm_model.setValidate("Val_ffm.txt")
    ffm_model.fit(param, "./model.out")

    ffm_model.setTest("Val_ffm.txt")  
    ffm_model.setSigmoid()        
    ffm_model.predict("./model.out", "./output.txt")

    with open("./output.txt", 'r') as f:
        y_pred_proba = np.array([float(prediction) for prediction in f.readlines()])

    roc_auc_res = roc_auc_score(y_val, y_pred_proba)
    log_loss_res = log_loss(y_val, y_pred_proba)
    print('AUC:', roc_auc_res)
    print('Log-loss:', log_loss_res)
    print('----------------------------------')
    if best_auc < roc_auc_res:
        best_auc = roc_auc_res
        best_log_loss = log_loss_res
        best_param = param

{'k': 4, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
AUC: 0.785893627389289
Log-loss: 0.10256103334432431
---------------------------------------------------------------
{'k': 4, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'}
AUC: 0.789339920697833
Log-loss: 0.1014448878402344
---------------------------------------------------------------
{'k': 4, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
AUC: 0.7737935452447177
Log-loss: 0.10925722658843434
---------------------------------------------------------------
{'k': 4, 'lambda': 0.001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'}
AUC: 0.7731571120455539
Log-loss: 0.10798479707946335
---------------------------------------------------------------
{'k': 6, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
AUC: 0.8113491433399229
Log-loss: 0.10234339770310413
---------------------------------------------------------------
{'k': 6, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'a

In [43]:
print(best_param)
print('Best AUC:', best_auc)
print('Best log-loss:', best_log_loss)

{'k': 6, 'lambda': 0.0001, 'lr': 0.2, 'metric': 'auc', 'task': 'binary'}
Best AUC: 0.818783327346469
Best log-loss: 0.1002553965753616


Попробуем увеличить lr.

In [44]:
param = {'k': 6, 'lambda': 0.0001, 'lr': 0.3, 'metric': 'auc', 'task': 'binary'}
ffm_model = xl.create_ffm()       
ffm_model.setTrain("Train_ffm.txt")     
ffm_model.setValidate("Val_ffm.txt")
ffm_model.fit(param, "./model.out")

ffm_model.setTest("Val_ffm.txt")  
ffm_model.setSigmoid()        
ffm_model.predict("./model.out", "./output.txt")

with open("./output.txt", 'r') as f:
    y_pred_proba = np.array([float(prediction) for prediction in f.readlines()])

roc_auc_res = roc_auc_score(y_val, y_pred_proba)
log_loss_res = log_loss(y_val, y_pred_proba)
print('AUC:', roc_auc_res)
print('Log-loss:', log_loss_res)

AUC: 0.8186202773322994
Log-loss: 0.10027096179859396


Лучше не стало, оставим, как раньше.

In [45]:
# Результаты для тестовой части
ffm_model = xl.create_ffm()       
ffm_model.setTrain("Train_ffm.txt")     
ffm_model.setValidate("Val_ffm.txt")
ffm_model.fit(param, "./model.out")

ffm_model.setTest("Test_ffm.txt")  
ffm_model.setSigmoid()        
ffm_model.predict("./model.out", "./output.txt")

with open("./output.txt", 'r') as f:
    y_pred_proba = np.array([float(prediction) for prediction in f.readlines()])

print('AUC:', roc_auc_score(y_test, y_pred_proba))
print('Log-loss:', log_loss(y_test, y_pred_proba))

AUC: 0.801783327346469
Log-loss: 0.13327312614481404


В прошлом дз результаты модели были следующими:\
ROC AUC: 0.779, Log-loss: 0.135

Таким образом, AUC увеличилась, в то время как Log-loss совсем немного уменьшилась. Лучшая модель выбиралась именно по AUC, поэтому, возможно, нет заметного улучшения по метрике Log-loss.