In [1]:
!pip install xlearn



In [1]:
# Standard library
import os
from datetime import datetime, timedelta

# Third-party (the original `import xlearn ags xl` was a syntax error)
import numpy as np
import pandas as pd
import xlearn as xl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import cross_validate


In [2]:
# Colab-only step: mount Google Drive at /content/drive so that files stored there
# can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Read the CSV from the mounted Drive and discard the columns that are not
# referenced anywhere later in this notebook.
data = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data.csv')
    .drop(columns=[
        'banner_id0', 'banner_id1',
        'rate0', 'rate1',
        'g0', 'g1',
        'coeff_sum0', 'coeff_sum1',
    ])
)
data

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...
15821467,2021-10-02 15:51:35.000000,146,530,4329496688011613719,0,2,9,1,0
15821468,2021-09-27 22:03:14.000000,12,22,453968700792456599,0,1,6,1,0
15821469,2021-10-02 17:41:10.000000,12,1236,9112780675655118328,0,2,0,1,0
15821470,2021-09-29 00:39:32.000000,967,21,6968514095695555037,0,0,0,1,0


In [6]:
def analysis(data: pd.DataFrame):
    """Show summary statistics and the number of distinct values per column.

    Displays ``data.describe()`` and then prints one line per column
    (every column except ``'date_time'``) with its count of unique values;
    missing values, if any, count as one distinct value.
    """
    display(data.describe())
    # Index.drop only touches the column labels, so the frame itself is not copied.
    for column in data.columns.drop('date_time'):
        distinct_count = data[column].nunique(dropna=False)
        print(f'unique values {column}:', distinct_count)

# Show summary statistics and per-column unique-value counts for the loaded frame.
analysis(data)

Unnamed: 0,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
count,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821472.0,15821470.0
mean,81.52679,381.6483,4.610505e+18,0.623854,1.840605,4.346986,1.0,0.02668835
std,163.2448,395.9386,2.663858e+18,9.249152,1.530005,4.317701,0.0,0.161171
min,0.0,0.0,1116911000000.0,0.0,0.0,0.0,1.0,0.0
25%,14.0,52.0,2.297977e+18,0.0,1.0,0.0,1.0,0.0
50%,19.0,217.0,4.614236e+18,0.0,2.0,4.0,1.0,0.0
75%,60.0,611.0,6.914243e+18,0.0,3.0,7.0,1.0,0.0
max,3443.0,1632.0,9.223371e+18,829.0,10.0,16.0,1.0,1.0


unique values zone_id: 3444
unique values banner_id: 1633
unique values oaid_hash: 6510316
unique values campaign_clicks: 822
unique values os_id: 11
unique values country_id: 17
unique values impressions: 1
unique values clicks: 2


Признак impressions можно игнорировать: он принимает единственное значение. Время разобьём по отдельным колонкам.

In [3]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Add calendar and indicator features derived from existing columns.

    The frame is modified in place (new columns are added and ``date_time`` is
    converted to a datetime column) and the same object is returned.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``date_time`` (strings such as
        ``'2021-09-27 00:01:30.000000'`` or an already parsed datetime column)
        and ``campaign_clicks``.

    Returns
    -------
    pd.DataFrame
        ``data`` with the added columns ``month``, ``day``, ``hour``,
        ``minute``, ``second``, ``is_day_event``, ``is_weekend_event`` and
        ``is_campaign_successful``.
    """
    # Parse only while the column still holds strings, so running this function
    # again on an already processed frame does not fail.
    if not pd.api.types.is_datetime64_any_dtype(data['date_time']):
        # Everything after the first '.' (fractional seconds) is discarded before parsing.
        without_fraction = data['date_time'].str.split('.', n=1).str[0]
        data['date_time'] = pd.to_datetime(without_fraction, format='%Y-%m-%d %H:%M:%S')

    timestamps = data['date_time'].dt
    data['month'] = timestamps.month
    data['day'] = timestamps.day
    data['hour'] = timestamps.hour
    data['minute'] = timestamps.minute
    data['second'] = timestamps.second

    # Hours only range over 0..23, so the former upper bound of 24 never excluded anything.
    data['is_day_event'] = (data['hour'] >= 9).astype('int64')
    # weekday(): Monday == 0 ... Sunday == 6, hence > 4 means Saturday or Sunday.
    data['is_weekend_event'] = (timestamps.weekday > 4).astype('int64')
    # 1 when campaign_clicks is non-zero, 0 otherwise.
    data['is_campaign_successful'] = (data['campaign_clicks'] != 0).astype('int64')
    return data

# Add the derived time and indicator columns; `data` is rebound to the returned frame.
data = feature_engineering(data)

Тестовую выборку отсечём по последнему дню, валидационную — по предпоследнему.

In [4]:
# Chronological split: rows from the last calendar day form the test set, rows from
# the day before it form the validation set, everything earlier is training data.
last_day = data['date_time'].max().replace(hour=0, minute=0, second=0, microsecond=0)
penultimate_day = last_day - timedelta(days=1)
print(f'last_day {last_day}', f'penultimate_day {penultimate_day}')

train_data = data.loc[data['date_time'].lt(penultimate_day)]
print(f'train_data size {len(train_data)}')

validation_data = data.loc[
    data['date_time'].ge(penultimate_day) & data['date_time'].lt(last_day)
]
print(f'validation_data size {len(validation_data)}')

test_data = data.loc[data['date_time'].ge(last_day)]
print(f'test_data size {len(test_data)}')

last_day 2021-10-02 00:00:00 penultimate_day 2021-10-01 00:00:00
train_data size 12049046
validation_data size 1643448
test_data size 2128978


In [5]:
# Sanity check: the three splits together must account for every row exactly once.
assert sum(len(split) for split in (train_data, validation_data, test_data)) == len(data)

In [6]:
# The split is done, so remove the raw timestamp column from every split.
train_data, test_data, validation_data = (
    split.drop(columns=['date_time'])
    for split in (train_data, test_data, validation_data)
)

In [7]:
# Columns exported as libffm fields; a column's position in this list is its field id.
features_to_transform = ['zone_id', 'banner_id', 'oaid_hash' ,'os_id', 'country_id', 'month', 'day', 'hour', 'minute', 'second']
column_to_number = {col: i for i, col in enumerate(features_to_transform)}
# For every field, number its distinct values in order of first appearance across
# train, validation and test. Only the needed column of each split is concatenated;
# concatenating the three full frames once per feature repeated the same heavy copy.
category_maps = {
    col: {
        val: i
        for i, val in enumerate(
            pd.concat((train_data[col], validation_data[col], test_data[col])).unique()
        )
    }
    for col in features_to_transform
}

def to_libffm(df, target, save_path, chunk_size=1_000_000):
    """Write ``df`` to ``save_path`` in libffm text format.

    Every row becomes one line ``<label> <field>:<category>:1 ...`` where the
    fields are the columns listed in ``features_to_transform``, the field id
    comes from ``column_to_number`` and the category id from ``category_maps``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the ``target`` column and every column named in
        ``features_to_transform``.
    target : str
        Name of the label column; its value is written first on each line.
    save_path : str
        Destination file; it is overwritten.
    chunk_size : int, optional
        Number of rows converted and written at a time, which bounds the
        memory held by intermediate strings.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    KeyError
        If a feature value is absent from ``category_maps``.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # Lines are written directly instead of going through DataFrame.apply(axis=1)
    # and to_csv(sep='\n'): the row-wise apply builds a Series per row and upcasts
    # the whole row to one dtype, and the newline "separator" was only a trick to
    # get one unquoted string per line.
    with open(save_path, 'w') as ffm_file:
        for start in range(0, len(df), chunk_size):
            chunk = df.iloc[start:start + chunk_size]
            token_columns = [[str(label) for label in chunk[target].tolist()]]
            for col in features_to_transform:
                field_id = column_to_number[col]
                mapping = category_maps[col]
                token_columns.append(
                    [f"{field_id}:{mapping[value]}:1" for value in chunk[col].tolist()]
                )
            ffm_file.writelines(
                ' '.join(tokens) + '\n' for tokens in zip(*token_columns)
            )

In [12]:
# Export every split to its own libffm file, with 'clicks' as the label.
for split_frame, ffm_path in (
    (train_data, 'train.txt'),
    (validation_data, 'validation.txt'),
    (test_data, 'test.txt'),
):
    to_libffm(split_frame, 'clicks', ffm_path)

In [8]:
# Define the USER environment variable for this process.
# NOTE(review): presumably required by xlearn at run time — confirm.
os.environ.update(USER='test')

In [None]:
# Grid search over xlearn's 'lambda' and 'k' settings: each model is fitted on
# train.txt and scored by ROC AUC of its predictions for validation.txt.
# Every result is stored as [lambda, k, roc_auc].
roc_auc_values = []
lambda_grid = [1e-2, 1e-3, 1e-4, 1e-5]
k_grid = [2, 4, 8, 16, 32]
for l in lambda_grid:
    for k in k_grid:
        ffm_model = xl.create_ffm()
        ffm_model.setTrain("train.txt")
        ffm_model.setTest("validation.txt")
        ffm_model.fit(
            {'task':'binary', 'lr': 0.1, 'lambda': l, 'k': k, 'metric': 'auc'},
            "./model.out",
        )
        ffm_model.setSigmoid()
        ffm_model.predict("./model.out", "output.txt")

        # One prediction per non-empty line of the output file.
        with open("output.txt", 'r') as f:
            y_predicted = np.array([float(line) for line in f.read().split('\n') if line])
        validation_auc = roc_auc_score(validation_data['clicks'].values, y_predicted)
        roc_auc_values.append([l, k, validation_auc])


In [1]:
# Pick the [lambda, k, roc_auc] entry with the highest validation ROC AUC.
# The previous loop never updated max_val and stored the row itself in max_id,
# so indexing roc_auc_values with it could not work.
if not roc_auc_values:
    raise ValueError('roc_auc_values is empty; run the hyper-parameter search first')
# max() returns the first maximal index, i.e. ties keep the earliest grid point.
max_id = max(range(len(roc_auc_values)), key=lambda idx: roc_auc_values[idx][2])
max_val = roc_auc_values[max_id][2]
print(roc_auc_values[max_id])

[0.0001, 16, 0.741384392279736]


In [11]:
# Final model: fit on train.txt with lambda=0.0001 and k=16 (the combination shown
# as the best one in the recorded output of the selection cell) and score the
# predictions for test.txt.
ffm_model = xl.create_ffm()
ffm_model.setTrain("train.txt")
ffm_model.setTest("test.txt")
ffm_model.fit({'task':'binary', 'lr': 0.1, 'lambda': 0.0001, 'k': 16, 'metric': 'auc'}, "./model.out")
ffm_model.setSigmoid()
ffm_model.predict("./model.out", "output.txt")

# One prediction per non-empty line of the output file.
with open("output.txt", 'r') as f:
    y_predicted = np.array(list(map(float, filter(lambda s: len(s) > 0, f.read().split('\n')))))
roc_auc = roc_auc_score(test_data['clicks'].values, y_predicted)
# Keep the metric under its own name: rebinding `log_loss` would shadow the imported
# sklearn function and make this cell fail when it is run a second time.
test_log_loss = log_loss(test_data['clicks'].values, y_predicted)
display(f'roc_auc = {roc_auc}, log_loss = {test_log_loss}')


'roc_auc = 0.7788852190472766, log_loss = 0.13845943266724953'

Это намного лучше, чем в моём первом ДЗ: там был roc_auc=0.5975206813676645