In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv('/content/drive/My Drive/Copy of data.csv')
df.drop(['banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1'], axis=1, inplace=True)
df.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,1,1


Оставим только категориальные данные, от impressions избавимся. Последний день в тест, предпоследний выделим для валидации.

In [4]:
df['oaid_hash'].value_counts()

308174966294367527     5243
2890718152668627077    2511
2521895603443866206    2289
8212556321845734673    1974
3375698397737628939    1959
                       ... 
7989197060799678922       1
3207898120952090573       1
8578082337233412909       1
8779980718415233023       1
453968700792456599        1
Name: oaid_hash, Length: 6510316, dtype: int64

Видно, что присутствуют редки пользователи, но не будем их трогать

In [5]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
  df.drop(['campaign_clicks', 'impressions'], axis=1, inplace=True)
  datetime = pd.to_datetime(df['date_time'])
  df['date'] = datetime.dt.date
  df.drop('date_time', axis=1, inplace=True)

  df_train = df[df['date'] < pd.Timestamp('2021-10-01').date()]
  df_train.drop('date', axis=1, inplace=True)
  # X_train = df_train.drop(['clicks', 'date'], axis=1)
  # y_train = df_train['clicks']

  df_val = df[df['date'] == pd.Timestamp('2021-10-01').date()]
  df_val.drop('date', axis=1, inplace=True)
  # X_val = df_val.drop(['clicks', 'date'], axis=1)
  # y_val = df_val['clicks']

  df_test = df[df['date'] == pd.Timestamp('2021-10-02').date()]
  df_test.drop('date', axis=1, inplace=True)
  # X_test = df_test.drop(['clicks', 'date'], axis=1)
  # y_test = df_test['clicks']

  return df_train, df_val, df_test

In [None]:
df_train, df_val, df_test = feature_engineering(df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.drop('date', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val.drop('date', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop('date', axis=1, inplace=True)


Приведем данные в нужный формат для обучения

In [6]:
#  https://github.com/Bobe24/Dataframe2libffm/blob/master/dataframe2libffm.py
category_column = [
  'zone_id',
  'banner_id',
  'os_id',
  'country_id',
  'clicks',
  'oaid_hash'
  ]

class FFMFormat:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0

        for col in df_ffm.columns:
            vals = df_ffm[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y != None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            # if col_type.kind == 'O':
            if col in category_column:
                ffm.append('{}:{}:1'.format(self.field_index_[col],
                                            self.feature_index_[name]))
            else:
            # elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col],
                                             self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series(
            {idx: self.transform_row_(row, t) for idx, row in df.iterrows()})


In [None]:
ffm_encoder = FFMFormat()
ffm_train = ffm_encoder.fit_transform(df_train, y='clicks')
ffm_train.to_csv('train.txt', index=False, header=False)

ffm_val = ffm_encoder.fit_transform(df_val, y='clicks')
ffm_val.to_csv('val.txt', index=False, header=False)

In [None]:
!cp val.txt /content/drive/MyDrive/recsys_hw2

Для обучения модели будем использовать библиотеку xlearn

In [7]:
!pip install xlearn



In [1]:
import xlearn as xl
from sklearn.model_selection import ParameterGrid

In [2]:
import os
os.environ['USER'] = 'test'

In [None]:
param_grid = {
    'task': ['binary'],
    'lr': [0.1],
    'lambda':[0.1, 0.001, 0.0001],
    'k': [4, 8],
    'metric': ['auc'],
}
for param in ParameterGrid(param_grid):
  print(param)
  ffm_model = xl.create_ffm()
  ffm_model.setTrain('/content/drive/MyDrive/recsys_hw2/train.txt')
  ffm_model.setValidate('/content/drive/MyDrive/recsys_hw2/val.txt')
  ffm_model.fit(param, 'model.out')

В колабе не отображаются логи, вставил их в кодовые ячейки

In [None]:
{'k': 4, 'lambda': 0.1, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 10.15 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 907.69 MB
[------------] Time cost for model initial: 1.17 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.107819            0.176001            0.759995                4.36
[   20%      ]     2            0.106552            0.175757            0.767240                4.68
[   30%      ]     3            0.106052            0.175395            0.764262                4.44
[   40%      ]     4            0.105531            0.175051            0.766409                4.33
[   50%      ]     5            0.105007            0.175023            0.766249                4.48
[   60%      ]     6            0.104673            0.174970            0.770019                4.66
[   70%      ]     7            0.104213            0.174884            0.770983                4.43
[   80%      ]     8            0.104041            0.174500            0.770955                4.31
[   90%      ]     9            0.103614            0.174506            0.770674                4.66
[  100%      ]    10            0.103387            0.174276            0.770890                4.33
[ ACTION     ] Early-stopping at epoch 7, best AUC: 0.770983
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 1.48 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 59.08 (sec)
{'k': 4, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 10.16 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 907.69 MB
[------------] Time cost for model initial: 1.17 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.099944            0.154582            0.787462                4.35
[   20%      ]     2            0.097858            0.153098            0.791374                4.45
[   30%      ]     3            0.096427            0.152599            0.793584                4.54
[   40%      ]     4            0.095041            0.152037            0.795730                4.28
[   50%      ]     5            0.093676            0.151128            0.799213                4.40
[   60%      ]     6            0.092375            0.150563            0.800596                4.70
[   70%      ]     7            0.091121            0.150267            0.802342                4.36
[   80%      ]     8            0.089877            0.149736            0.804047                4.42
[   90%      ]     9            0.088688            0.149366            0.805443                4.76
[  100%      ]    10            0.087512            0.148980            0.806906                4.38
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 2.06 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 60.53 (sec)
{'k': 4, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 11.09 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 907.69 MB
[------------] Time cost for model initial: 3.64 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.098016            0.149312            0.800069                5.08
[   20%      ]     2            0.094146            0.147409            0.808242                5.20
[   30%      ]     3            0.090263            0.145562            0.814680                5.72
[   40%      ]     4            0.085849            0.143822            0.819339                5.42
[   50%      ]     5            0.080804            0.142946            0.821973                4.58
[   60%      ]     6            0.075396            0.143146            0.821855                4.48
[   70%      ]     7            0.069875            0.143769            0.820940                4.61
[   80%      ]     8            0.064463            0.145384            0.818426                4.75
[ ACTION     ] Early-stopping at epoch 5, best AUC: 0.821973
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 2.46 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 60.88 (sec)

In [None]:
{'k': 8, 'lambda': 0.1, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 16.47 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 3.63 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.107771            0.175893            0.756480                5.89
[   20%      ]     2            0.106779            0.175212            0.763524                5.80
[   30%      ]     3            0.106040            0.175245            0.769005                5.88
[   40%      ]     4            0.105339            0.175196            0.768905                6.20
[   50%      ]     5            0.104931            0.175134            0.768608                5.80
[   60%      ]     6            0.104599            0.175024            0.767195                6.16
[ ACTION     ] Early-stopping at epoch 3, best AUC: 0.769005
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 3.50 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 63.24 (sec)
{'k': 8, 'lambda': 0.001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 11.97 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 2.20 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.099902            0.154330            0.787671                5.83
[   20%      ]     2            0.097782            0.153396            0.791561                6.18
[   30%      ]     3            0.096275            0.152300            0.794559                5.81
[   40%      ]     4            0.094807            0.151767            0.796667                6.31
[   50%      ]     5            0.093375            0.151292            0.799027                5.90
[   60%      ]     6            0.092015            0.150618            0.801451                5.84
[   70%      ]     7            0.090685            0.149956            0.803335                6.11
[   80%      ]     8            0.089370            0.149661            0.804967                5.91
[   90%      ]     9            0.088102            0.149099            0.806776                6.14
[  100%      ]    10            0.086873            0.148941            0.807659                5.57
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 3.54 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 82.15 (sec)
{'k': 8, 'lambda': 0.0001, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 11.44 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 2.19 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.097774            0.149066            0.802444                5.93
[   20%      ]     2            0.093311            0.146515            0.811016                5.51
[   30%      ]     3            0.088531            0.144522            0.817968                5.67
[   40%      ]     4            0.082963            0.143164            0.822278                5.75
[   50%      ]     5            0.076890            0.143364            0.823002                5.51
[   60%      ]     6            0.070606            0.143929            0.822012                6.07
[   70%      ]     7            0.064414            0.145336            0.820140                5.75
[   80%      ]     8            0.058701            0.147460            0.817185                5.90
[ ACTION     ] Early-stopping at epoch 5, best AUC: 0.823002
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 3.59 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 66.64 (sec)

Из результатов видно, что k = 8, немного лучше k = 4. C увеличением lambda, увеличивается auc. Попробуем еще уменьшить lambda

In [None]:
param_grid = {
    'task': ['binary'],
    'lr': [0.1],
    'lambda':[0.00001, 0.000001, 0.0000001],
    'k': [8],
    'metric': ['auc'],
}
for param in ParameterGrid(param_grid):
  print(param)
  ffm_model = xl.create_ffm()
  ffm_model.setTrain('/content/drive/MyDrive/recsys_hw2/train.txt')
  ffm_model.setValidate('/content/drive/MyDrive/recsys_hw2/val.txt')
  ffm_model.fit(param, 'model.out')

In [None]:
{'k': 8, 'lambda': 1e-05, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 15.12 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 3.75 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.097145            0.147621            0.805996                6.88
[   20%      ]     2            0.088830            0.143168            0.819404                6.31
[   30%      ]     3            0.075649            0.144162            0.817908                6.83
[   40%      ]     4            0.060635            0.150408            0.807022                6.23
[   50%      ]     5            0.048881            0.157371            0.799152                7.17
[ ACTION     ] Early-stopping at epoch 2, best AUC: 0.819404
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 4.62 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 60.38 (sec)
{'k': 8, 'lambda': 1e-06, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 14.18 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 2.28 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.097074            0.147126            0.807328                6.44
[   20%      ]     2            0.087980            0.143130            0.819725                7.51
[   30%      ]     3            0.073041            0.145945            0.814570                7.59
[   40%      ]     4            0.056771            0.155364            0.798822                7.02
[   50%      ]     5            0.045515            0.164620            0.790223                6.43
[ ACTION     ] Early-stopping at epoch 2, best AUC: 0.819725
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 4.96 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 59.45 (sec)
{'k': 8, 'lambda': 1e-07, 'lr': 0.1, 'metric': 'auc', 'task': 'binary'}
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 13.39 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 2.45 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.096964            0.147381            0.807014                6.47
[   20%      ]     2            0.087151            0.143240            0.819795                6.40
[   30%      ]     3            0.071174            0.147373            0.811513                6.56
[   40%      ]     4            0.054898            0.157610            0.796451                6.67
[   50%      ]     5            0.044474            0.167031            0.788122                6.35
[ ACTION     ] Early-stopping at epoch 2, best AUC: 0.819795
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 3.86 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 55.15 (sec)

Уменьшение lambda не дало результотов. Лучшие результаты оказались у модели с k=8, lambda=0.0001. Проверим на тестовых данных.

In [None]:
ffm_encoder = FFMFormat()
ffm_test = ffm_encoder.fit_transform(df_test, y='clicks')
ffm_test.to_csv('test.txt', index=False, header=False)

In [None]:
!cp test.txt /content/drive/MyDrive/recsys_hw2

In [None]:
params = {
    'task': 'binary',
    'lr': 0.1,
    'lambda':0.0001,
    'k': 8,
    'metric': 'auc',
}
ffm_model = xl.create_ffm()
ffm_model.setTrain('/content/drive/MyDrive/recsys_hw2/train.txt')
ffm_model.setValidate('/content/drive/MyDrive/recsys_hw2/val.txt')
ffm_model.fit(param, 'model.out')


In [None]:
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/train.txt.bin) found. Skip converting text to binary.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/val.txt.bin) found. Skip converting text to binary.
[------------] Number of Feature: 5665370
[------------] Number of Field: 5
[------------] Time cost for reading problem: 15.72 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 1.73 GB
[------------] Time cost for model initial: 3.70 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   10%      ]     1            0.096976            0.146548            0.809004                6.93
[   20%      ]     2            0.087336            0.142749            0.820385                6.00
[   30%      ]     3            0.071341            0.146886            0.812959                7.14
[   40%      ]     4            0.054705            0.157021            0.797807                6.75
[   50%      ]     5            0.043993            0.168139            0.788127                6.97
[ ACTION     ] Early-stopping at epoch 2, best AUC: 0.820385
[ ACTION     ] Start to save model ...
[------------] Model file: model.out
[------------] Time cost for saving model: 5.27 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 61.87 (sec)

In [None]:
ffm_model.setTest('/content/drive/MyDrive/recsys_hw2/test.txt')
ffm_model.setSigmoid()
ffm_model.predict("model.out", "output.txt")

In [None]:
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 8 threads for prediction task.
[ ACTION     ] Load model ...
[------------] Load model from model.out
[------------] Loss function: cross-entropy
[------------] Score function: ffm
[------------] Number of Feature: 5665370
[------------] Number of K: 8
[------------] Number of field: 5
[------------] Time cost for loading model: 1.35 (sec)
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (/content/drive/MyDrive/recsys_hw2/test.txt.bin) NOT found. Convert text file to binary file.
[------------] Time cost for reading problem: 6.45 (sec)
[ ACTION     ] Start to predict ...
[------------] The test loss is: 0.152884
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 9.26 (sec)

In [6]:
df.drop(['campaign_clicks', 'impressions'], axis=1, inplace=True)
datetime = pd.to_datetime(df['date_time'])
df['date'] = datetime.dt.date
df.drop('date_time', axis=1, inplace=True)
df_test = df[df['date'] == pd.Timestamp('2021-10-02').date()]
df_test.drop('date', axis=1, inplace=True)
y_test = df_test['clicks']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop('date', axis=1, inplace=True)


In [7]:
y_pred = pd.read_csv('output.txt', header=None)

In [8]:
from sklearn.metrics import roc_auc_score, log_loss
auc = roc_auc_score(y_test, y_pred)
log_loss = log_loss(y_test, y_pred)
print(f'auc={auc} log_loss={log_loss}')

auc=0.5859046923718023 log_loss=0.1528592157237372
