In [2]:
!pip install xlearn

Collecting xlearn
  Downloading xlearn-0.40a1.tar.gz (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: xlearn
  Building wheel for xlearn (setup.py) ... [?25ldone
[?25h  Created wheel for xlearn: filename=xlearn-0.40a1-py3-none-any.whl size=225729 sha256=fb1fb8045f0f548e1520c56cae744dd5429dafc2eb48942c8350295fbe65c417
  Stored in directory: /root/.cache/pip/wheels/09/48/04/779ee06b22532c86cde8da8984b83284517492dad1df998c6a
Successfully built xlearn
Installing collected packages: xlearn
Successfully installed xlearn-0.40a1


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
import scipy
from sklearn.model_selection import cross_validate
import xlearn as xl
import os
from tqdm import tqdm

In [4]:
# NOTE(review): presumably xlearn needs the USER environment variable, which can be
# missing in containerised kernels — confirm. setdefault only fills it in when it is
# absent, so an existing user name is not overwritten.
os.environ.setdefault('USER', 'test')

In [5]:
def get_date_features(x):
    """Extract calendar components from a timestamp string.

    Everything from the first '.' onwards (e.g. fractional seconds) is discarded and
    the remainder is parsed as 'YYYY-MM-DD HH:MM:SS'.

    Args:
        x: timestamp string.

    Returns:
        list: [year, month, day, hour] as integers.

    Raises:
        ValueError: if the text before the first '.' does not match the format.
    """
    timestamp_text = x.partition('.')[0]
    parsed = datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M:%S')
    return [parsed.year, parsed.month, parsed.day, parsed.hour]


def split_date_time(data: pd.DataFrame):
    """Replace the 'date_time' column with 'year', 'month', 'day' and 'hour' columns.

    The input frame is left unmodified; a new frame is returned. Parsing is done on
    the whole column at once instead of row by row.

    Args:
        data: frame with a string 'date_time' column formatted as
            'YYYY-MM-DD HH:MM:SS', optionally followed by '.' and fractional seconds.

    Returns:
        pd.DataFrame: copy of ``data`` without 'date_time' and with int64 columns
        'year', 'month', 'day', 'hour' appended in that order.
    """
    # Keep only the text before the first '.' so fractional seconds do not break parsing.
    timestamps = pd.to_datetime(
        data['date_time'].str.split('.').str[0],
        format='%Y-%m-%d %H:%M:%S',
    )
    # Cast explicitly: the width of the .dt integer accessors differs between pandas versions.
    parts = {
        name: getattr(timestamps.dt, name).astype('int64')
        for name in ('year', 'month', 'day', 'hour')
    }
    # assign() builds a new frame, so the caller's DataFrame keeps its original columns.
    return data.assign(**parts).drop(columns=['date_time'])


def feature_engineering(
    data: pd.DataFrame,
    test_month: int = 10,
    test_day: int = 2,
    val_size: float = 0.2,
    random_state: int = 42,
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Derive date features and split the data into train / validation / test frames.

    Args:
        data: raw frame containing at least 'date_time' and 'impressions'.
        test_month: month of the calendar day held out as the test set.
        test_day: day of month of the calendar day held out as the test set.
        val_size: share of the remaining rows used for validation.
        random_state: seed for the random train/validation split.

    Returns:
        (train_data, val_data, test_data): rows from the held-out day form
        ``test_data``; all other rows are randomly split into train and validation.
    """
    # Split date_time into year/month/day/hour. Minutes and seconds are dropped
    # because they are not expected to carry useful information.
    data = split_date_time(data)

    # Drop the constant features.
    data = data.drop(columns=['impressions', 'year'])

    # Rows from the held-out calendar day go to the test set.
    is_test_day = (data['month'] == test_month) & (data['day'] == test_day)
    train_data, test_data = data[~is_test_day], data[is_test_day]

    train_data, val_data = train_test_split(
        train_data, test_size=val_size, random_state=random_state
    )

    return train_data, val_data, test_data

In [6]:
# Load the raw log and discard the columns that are not used further on.
all_data = pd.read_csv('/kaggle/input/recsys/data.csv').drop(
    columns=['banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
)

In [7]:
# Hyper-parameters are tuned on a 100k-row subsample; the fixed seed makes the
# subsample (and therefore the scores below) the same on every run.
train_df, val_df, test_df = feature_engineering(all_data.sample(100000, random_state=42))

In [21]:
# Categorical features used by the model
categorical_features = ["zone_id", "banner_id", "oaid_hash", "os_id", "country_id", "hour", "month", "day"]

# Encode the categorical features: every feature gets its own field id, and every
# distinct value of a feature gets an index (numbered by first appearance across
# train, validation and test).
field_dict = {col: i for i, col in enumerate(categorical_features)}
category_maps = {
    col: {
        val: i
        # Concatenate only the column that is needed instead of rebuilding the full
        # three-frame concat once per feature.
        for i, val in enumerate(pd.concat((train_df[col], val_df[col], test_df[col])).unique())
    }
    for col in categorical_features
}

def to_libffm(df, target, save_path, features=None, fields=None, categories=None):
    """Write ``df`` to ``save_path`` as text, one line per row.

    Each line has the form ``<label> <field>:<index>:1 <field>:<index>:1 ...`` with one
    token per feature, in the order of ``features``.

    Args:
        df: frame holding the target column and all feature columns.
        target: name of the label column.
        save_path: path of the text file to (over)write.
        features: feature column names; defaults to the module-level
            ``categorical_features``.
        fields: mapping feature name -> field id; defaults to the module-level
            ``field_dict``.
        categories: mapping feature name -> {value: index}; defaults to the
            module-level ``category_maps``.

    Raises:
        KeyError: if a feature value has no entry in ``categories``.
    """
    # Defaults are resolved at call time so that re-assigned globals are picked up.
    if features is None:
        features = categorical_features
    if fields is None:
        fields = field_dict
    if categories is None:
        categories = category_maps

    # Build the tokens column by column: each column keeps its own dtype (a row-wise
    # apply would turn every row into a single-dtype Series) and the per-row Python
    # callback is avoided.
    token_columns = [df[target].astype(str).tolist()]
    for col in features:
        codes = df[col].map(categories[col])
        missing = codes.isna().to_numpy()
        if missing.any():
            # Series.map yields NaN for unknown keys; fail loudly like a dict lookup would.
            raise KeyError(df[col].to_numpy()[missing][0])
        prefix = f"{fields[col]}:"
        token_columns.append([f"{prefix}{code}:1" for code in codes.tolist()])

    # Write plain lines directly; no CSV quoting or delimiter rules are involved.
    with open(save_path, 'w') as out:
        out.writelines(' '.join(tokens) + '\n' for tokens in zip(*token_columns))

# Export every split to its own text file, using 'clicks' as the label.
to_libffm(df=train_df, target='clicks', save_path='train.txt')
to_libffm(df=val_df, target='clicks', save_path='val.txt')
to_libffm(df=test_df, target='clicks', save_path='test.txt')

In [9]:
from tqdm import tqdm

In [None]:
# Validation labels that the predictions read back from disk are scored against.
y_val = val_df['clicks'].values

roc_auc_scores = []
log_losses = []

# Grid search over xlearn's 'lambda' and 'k' parameters; each result row is
# [lambda, k, score].
for reg_lambda in [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]:
    for latent_dim in [2, 4, 8, 16, 32]:
        param = {'task':'binary', 'lr': 0.1, 'lambda': reg_lambda, 'k': latent_dim, 'metric': 'auc'}

        ffm_model = xl.create_ffm()
        ffm_model.setTrain("train.txt")
        ffm_model.setTest("val.txt")
        ffm_model.fit(param, './model.out')
        ffm_model.setSigmoid()
        ffm_model.predict('./model.out', './rusult.txt')

        # One prediction per non-empty line of the output file.
        with open('rusult.txt', 'r') as f:
            raw_lines = f.read().split('\n')
        y_pred_proba = np.array([float(line) for line in raw_lines if line])

        roc_auc_scores.append([reg_lambda, latent_dim, roc_auc_score(y_val, y_pred_proba)])
        log_losses.append([reg_lambda, latent_dim, log_loss(y_val, y_pred_proba)])

In [32]:
# Use the computed position of the best ROC AUC instead of a hard-coded index, so the
# cell stays correct when the grid or the scores change.
best_idx = int(np.argmax(np.array(roc_auc_scores)[:, 2]))
roc_auc_scores[best_idx]

[0.0001, 16, 0.738948523838858]

In [33]:
# Redo the split, this time on the complete data set rather than a subsample.
train_df, val_df, test_df = feature_engineering(data=all_data)

In [37]:
# Rebuild the encodings so they cover every value present in the full data set.
field_dict = {col: i for i, col in enumerate(categorical_features)}
category_maps = {
    col: {
        val: i
        # Concatenate only the needed column rather than all three frames per feature.
        for i, val in enumerate(pd.concat((train_df[col], val_df[col], test_df[col])).unique())
    }
    for col in categorical_features
}

# The final model is fitted on train + validation rows. A separate name leaves train_df
# untouched, so re-running this cell does not append val_df a second time.
full_train_df = pd.concat((train_df, val_df))

to_libffm(full_train_df, 'clicks', 'train.txt')
to_libffm(test_df, 'clicks', 'test.txt')

y_test = test_df['clicks'].values

ffm_model = xl.create_ffm()
ffm_model.setTrain("train.txt")
ffm_model.setTest("test.txt")
# 'lambda' and 'k' are the best-scoring pair reported by the grid search above.
param = {'task':'binary', 'lr': 0.1, 'lambda': 0.0001, 'k': 16, 'metric': 'auc'}

ffm_model.fit(param, './model.out')
ffm_model.setSigmoid()
ffm_model.predict('./model.out', './rusult.txt')

# One prediction per non-empty line of the output file.
with open('rusult.txt', 'r') as f:
    y_pred_proba = np.array([float(s) for s in f.read().split('\n') if s])

roc_auc_metric = roc_auc_score(y_test, y_pred_proba)
log_loss_metric = log_loss(y_test, y_pred_proba)
print(f'roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}')

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 4 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (train.txt.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mNumber of Feature: 5660418
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 58.99 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 5.44 GB
[32m[------------]

Итоговый скор: roc_auc = 0.7877827249206713, log_loss = 0.13312216860809437 
Скор из предыдущего дз: roc_auc = 0.7225271872729148, log_loss = 0.7550432223404697

Результат удалось улучшить: ROC AUC вырос с 0.723 до 0.788, а log loss снизился с 0.755 до 0.133.