### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [3]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

train_data = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')
train_data.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_12437096,0,DIPONTC,PRIIIKK,,3.0,OLNDYQY,0,YZEYLVA,FTPHMPQ,...,WWWEXSL,GTISJWW,4589.0,0.0,XWSKHMV,IRUDRFB,6.0,EPYCJGQ,0.0,PFDWETE
1,TRAIN_01658840,0,XRNFGPM,RNCJZPY,HUUWYSX,1.0,AWIGQQH,6,HAHGQUD,FTPHMPQ,...,SLXYBBG,GTISJWW,4307.0,0.0,LRUZVSS,IRUDRFB,5.0,KTCQCEU,0.0,WUPRYDQ
2,TRAIN_26836490,0,JCDGUTG,LKTOGQA,,16.0,FYQPUCR,18,STSQZOU,IGQGIJM,...,NZGEZLW,GTISJWW,321554.0,0.0,KLJWLXN,MNBSNJV,,PLLNZJI,0.0,WRVYDJH
3,TRAIN_25227395,0,JCDXFYU,PILDDJU,IAGJDOH,33.0,LFPUEOV,15,YRXECFU,FTPHMPQ,...,OACUGUM,KHZNEZF,50.0,18.0,QMOULXS,IRUDRFB,5.0,XBKBHCW,0.0,GTWSQIJ
4,TRAIN_02947799,0,VHLLSXG,DJDKEYH,,1.0,BJMUDNF,0,RWIXOGV,FTPHMPQ,...,MFPUCBU,GTISJWW,3173.0,0.0,VXXUNFA,IRUDRFB,1.0,BWNTSAV,0.0,FYALNBY


In [4]:
train_x = train_data.drop(columns=['ID', 'Click'])
train_y = train_data['Click']
test_x = test.drop(columns=['ID'])

In [5]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_x[col].fillna(0, inplace=True)
100%|██████████| 39/39 [00:01<00:00, 23.94it/s]


In [6]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [7]:
categorical_target=list(train_x.dtypes[train_x.dtypes == "float"].index)
print(categorical_target)
print(encoding_target)
print(X_train_encoded)

['F04', 'F11', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']
['F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28', 'F30', 'F31', 'F34', 'F35', 'F37', 'F39']
         F01    F02    F03   F04    F05  F06  F07    F08   F09    F10  ...  \
0          1      2  23223   3.0      1    0    1  22773     4      2  ...   
1          1   1821   1827   1.0      1    6    6  22773    39      4  ...   
2          1      4  23223  16.0      1   18   21   3193     1      1  ...   
3      11231  11264  22511  33.0  11231   15    1  22773  1990  11231  ...   
4        238   1213  23223   1.0    238    0   64  22773    68    942  ...   
...      ...    ...    ...   ...    ...  ...  ...    ...   ...    ...  ...   
59995      1    269  23223  47.0      1    3    2   2345    19    128  ...   
59996     13     18    407  13.0     13    2  266  22773    83     13  ...   
59997      3      3  22511   0.0     

In [8]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
label_encoders = {}
for cat_col in encoding_target:
    le = LabelEncoder()
    train_data[cat_col] = le.fit_transform(train_data[cat_col])
    label_encoders[cat_col] = le


In [9]:
# 연속형 특성 스케일링
scaler = MinMaxScaler()
train_data[categorical_target] = scaler.fit_transform(train_data[categorical_target])

In [10]:
train_data.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_12437096,0,2927,5508,51,5.1e-05,13365,0,12127,4,...,5031,4,0.002533,0.0,18249,0,0.011952,574,0.0,2803
1,TRAIN_01658840,0,20807,6177,13,0.0,768,6,3472,4,...,4060,4,0.002377,0.0,8994,0,0.00996,1344,0.0,4208
2,TRAIN_26836490,0,8036,4052,51,0.000386,5435,18,9109,9,...,3059,4,0.177477,0.0,8024,1,,1989,0.0,4184
3,TRAIN_25227395,0,8038,5381,14,0.000823,10388,15,12004,4,...,3066,7,2.8e-05,0.047493,12707,0,0.00996,3000,0.0,1227
4,TRAIN_02947799,0,18769,1181,51,0.0,1236,0,8672,4,...,2687,4,0.001751,0.0,16756,0,0.001992,240,0.0,1088


In [11]:
feature_sizes = [len(le.classes_) for le in label_encoders.values()]
print(feature_sizes)
print(len(categorical_target))
print(len(encoding_target))

[22916, 9215, 52, 24014, 12615, 39, 6496, 15334, 21863, 985, 4, 8062, 9, 7861, 33, 2355, 405, 5587, 1147, 28, 5651, 14, 19896, 3, 3405, 4792]
11
26


In [12]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
features = torch.tensor(train_data.drop(columns=['ID', 'Click']).values, dtype=torch.float32)
targets = torch.tensor(train_data['Click'].values, dtype=torch.float32).unsqueeze(1)

In [13]:
print(features)
print(targets)

tensor([[2.9270e+03, 5.5080e+03, 5.1000e+01,  ..., 5.7400e+02, 0.0000e+00,
         2.8030e+03],
        [2.0807e+04, 6.1770e+03, 1.3000e+01,  ..., 1.3440e+03, 0.0000e+00,
         4.2080e+03],
        [8.0360e+03, 4.0520e+03, 5.1000e+01,  ..., 1.9890e+03, 0.0000e+00,
         4.1840e+03],
        ...,
        [2.2868e+04, 4.2130e+03, 1.4000e+01,  ..., 1.6020e+03, 0.0000e+00,
         2.8800e+03],
        [4.1660e+03, 2.4190e+03, 1.4000e+01,  ..., 2.6250e+03, 0.0000e+00,
         4.1340e+03],
        [1.1750e+04, 4.1710e+03, 4.0000e+01,  ..., 5.4800e+02, 0.0000e+00,
         2.0000e+03]])
tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])


In [14]:
from torch.utils.data import TensorDataset
dataset = TensorDataset(features, targets)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [15]:
from DeepFM import DeepFM

#모델 정의 (DeepFM)
model = DeepFM(feature_sizes=feature_sizes, embedding_size=4, hidden_dims=[32, 32], num_classes=1, use_cuda=True)

In [16]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
import torch.nn.functional as F
model.fit(loader,optimizer)

ValueError: not enough values to unpack (expected 3, got 2)

In [29]:
for b in loader:
    print(b)
print(len(loader))

[tensor([[1.0442e+04, 6.4950e+03, 5.1000e+01,  ..., 1.9000e+01, 0.0000e+00,
         7.9000e+02],
        [1.2854e+04, 9.1760e+03, 5.1000e+01,  ..., 3.2000e+02, 0.0000e+00,
         2.7910e+03],
        [8.0380e+03, 5.3810e+03, 1.4000e+01,  ..., 2.7040e+03, 0.0000e+00,
         3.0520e+03],
        ...,
        [1.6943e+04, 4.6600e+02, 2.8000e+01,  ..., 1.0200e+02, 0.0000e+00,
         1.3400e+02],
        [7.0830e+03, 2.9950e+03, 1.4000e+01,  ..., 5.4600e+02,        nan,
         4.6690e+03],
        [1.7135e+04, 1.3330e+03, 1.4000e+01,  ..., 1.3400e+02, 0.0000e+00,
         3.6240e+03]]), tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1