### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [3]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

train_data = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')
train_data.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_18553563,0,GZVCLLD,KUWLNGU,IAGJDOH,,LXAADLB,2,DXTYPAO,FTPHMPQ,...,MFPUCBU,GTISJWW,3984.0,0.0,ERYFMVZ,IRUDRFB,1.0,QVXWGZA,0.0,IZVPXRU
1,TRAIN_08702536,0,RGVBXGF,IBGMFJN,,12.0,VTNUQUZ,7,DFWOPAR,FTPHMPQ,...,NZGEZLW,GTISJWW,34526.0,0.0,ECPKEZY,MNBSNJV,7.0,IEEJFFT,0.0,WQHLFPY
2,TRAIN_19471875,0,NGMLPUP,OTWSYZU,IAGJDOH,9.0,CEFCCIG,0,PQZBVMG,OFKQGTY,...,SLXYBBG,WHSRKIM,4449.0,0.0,TUBGNFF,IRUDRFB,,QFXNDDX,0.0,SVFIGME
3,TRAIN_03063117,0,DSZWSYL,JPTLTGK,,17.0,VOYTJWU,15,PQZBVMG,FTPHMPQ,...,OBOELBB,GTISJWW,1405.0,0.0,OJUUGZR,IRUDRFB,10.0,ULLNGKJ,0.0,QNTELED
4,TRAIN_07097479,0,JCDXFYU,PILDDJU,IAGJDOH,15.0,LFPUEOV,16,QSDCRVA,OFKQGTY,...,JSSZOFX,KHZNEZF,21.0,16.0,QMOULXS,IRUDRFB,18.0,DYXQJGB,0.0,HUAXGCU


In [4]:
train_x = train_data.drop(columns=['ID', 'Click'])
train_y = train_data['Click']
test_x = test.drop(columns=['ID'])

In [5]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_x[col].fillna(0, inplace=True)
100%|██████████| 39/39 [00:01<00:00, 21.45it/s]


In [6]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [7]:
categorical_target=list(train_x.dtypes[train_x.dtypes == "float"].index)
print(categorical_target)
print(encoding_target)
print(X_train_encoded)

['F04', 'F11', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']
['F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28', 'F30', 'F31', 'F34', 'F35', 'F37', 'F39']
         F01    F02    F03   F04    F05  F06    F07    F08  F09    F10  ...  \
0          4     54  22550   0.0      4    2      6  22782   20      7  ...   
1          1      7  23257  12.0      1    7      3  22782    7      7  ...   
2          5      5  22550   9.0      5    0  10257   2375    6      5  ...   
3          3      4  23257  17.0      3   15  10257  22782    1      3  ...   
4      11115  11161  22550  15.0  11115   16      2   2375  623  11115  ...   
...      ...    ...    ...   ...    ...  ...    ...    ...  ...    ...  ...   
59995      1      3   4298   1.0      1    1  10257   4582   35      1  ...   
59996   5652   6037  23257   1.0   5652    0     29  14198   75   5652  ...   
59997      2    680   4298  

In [15]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
label_encoders = {}
for cat_col in encoding_target:
    le = LabelEncoder()
    train_data[cat_col] = le.fit_transform(train_data[cat_col])
    label_encoders[cat_col] = le


In [16]:
# 연속형 특성 스케일링
scaler = MinMaxScaler()
train_data[categorical_target] = scaler.fit_transform(train_data[categorical_target])

In [17]:
train_data.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_18553563,0,6190,3843,13,1.0,11068,2,1996,4,...,2645,4,0.16105,0.0,3650,0,0.006623,2175,0.0,1628
1,TRAIN_08702536,0,15221,2890,48,0.020484,20077,7,1637,4,...,3024,4,0.739224,0.0,3153,1,0.046358,1051,0.0,4175
2,TRAIN_19471875,0,11707,5201,13,0.014898,1980,0,7672,19,...,4002,13,0.180281,0.0,15203,0,1.0,2093,0.0,3477
3,TRAIN_03063117,0,3350,3420,48,0.029795,19917,15,7672,4,...,3039,4,0.058109,0.0,11055,0,0.066225,2660,0.0,3053
4,TRAIN_07097479,0,8006,5386,13,0.026071,10483,16,8196,19,...,2082,7,0.000893,0.103896,12658,0,0.119205,502,0.0,1416


In [22]:
feature_sizes = [len(le.classes_) for le in label_encoders.values()]
print(feature_sizes)
print(len(categorical_target))
print(len(encoding_target))

[22887, 9232, 49, 23980, 12608, 37, 6574, 15297, 21835, 992, 4, 8016, 9, 7872, 31, 2359, 406, 5626, 1106, 25, 5558, 14, 19895, 2, 3419, 4793]
11
26


In [23]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
features = torch.tensor(train_data.drop(columns=['ID', 'Click']).values, dtype=torch.float32)
targets = torch.tensor(train_data['Click'].values, dtype=torch.float32).unsqueeze(1)

In [25]:
print(features)
print(targets)

tensor([[6.1900e+03, 3.8430e+03, 1.3000e+01,  ..., 2.1750e+03, 0.0000e+00,
         1.6280e+03],
        [1.5221e+04, 2.8900e+03, 4.8000e+01,  ..., 1.0510e+03, 0.0000e+00,
         4.1750e+03],
        [1.1707e+04, 5.2010e+03, 1.3000e+01,  ..., 2.0930e+03, 0.0000e+00,
         3.4770e+03],
        ...,
        [2.8700e+03, 4.1210e+03, 4.0000e+00,  ..., 2.0100e+02, 0.0000e+00,
         4.5520e+03],
        [1.5651e+04, 7.9630e+03, 3.4000e+01,  ..., 2.0830e+03, 0.0000e+00,
         1.6440e+03],
        [1.6279e+04, 8.0510e+03, 1.3000e+01,  ..., 1.3900e+02, 0.0000e+00,
         1.3860e+03]])
tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])


In [26]:
from torch.utils.data import TensorDataset
dataset = TensorDataset(features, targets)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [34]:
from DeepFM import DeepFM

#모델 정의 (DeepFM)
model = DeepFM(feature_sizes=feature_sizes, embedding_size=4, hidden_dims=[32, 32], num_classes=1, use_cuda=True)

In [35]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [41]:
import torch.nn.functional as F
DeepFM.fit(loader_train=loader,loader_val='_',optimizer=optimizer)

TypeError: DeepFM.fit() missing 1 required positional argument: 'self'