### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [3]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')
df.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_11680540,0,EMOXTTK,NIDBQLT,KVMAIVS,19.0,CBRFPWT,0,YBCNRUH,LPYPUNA,...,QRTVXSR,GTISJWW,261927.0,0.0,WQGPJOP,MNBSNJV,,BKXWPRP,0.0,OSCDYKV
1,TRAIN_22589211,0,LLKAVMO,EKJSVRG,,1.0,YKHABYT,0,NIXDORW,LPYPUNA,...,NZGEZLW,GTISJWW,1704.0,0.0,ABEHJLN,IRUDRFB,6.0,NOTFWKW,0.0,IBIXXAD
2,TRAIN_05131530,0,AMGEZKP,ASBBYOW,,2.0,TEUOJJC,201,SXHOQOS,VAWXMCR,...,LHVAIQP,WCMLPDK,13171.0,0.0,UXMOYJC,IRUDRFB,2.0,BDFRZYY,0.0,YINTNCW
3,TRAIN_08012251,0,PWUPWWR,BJENSNO,IAGJDOH,,WUEHXWO,991,HSDIXUN,LPYPUNA,...,IVIRTPR,GTISJWW,1405.0,0.0,PMHZWKN,IRUDRFB,,TYFMWZA,0.0,OKVIVIJ
4,TRAIN_15571049,0,JCDXFYU,PILDDJU,IAGJDOH,1.0,LFPUEOV,286,PQZBVMG,FTPHMPQ,...,GGRQCBP,KHZNEZF,132.0,2.0,QMOULXS,IRUDRFB,1.0,XLDEUVO,0.0,KINMMHQ


### EDA 1 : Sparse and Dense

In [4]:
train=df

### Data Preprocessing 1 : Select x, y

In [5]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

### Data Preprocessing 2 : Fill NaN

In [6]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_x[col].fillna(0, inplace=True)
100%|██████████| 39/39 [00:01<00:00, 21.09it/s]


### Data Preprocessing 3 : Count Encoding

In [7]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [44]:
print(X_train_encoded)
print('------'*10)
print(train_y)

         F01    F02    F03   F04    F05  F06    F07    F08   F09    F10  ...  \
0          1      1  22506   0.0      1    0      1  22659    10      1  ...   
1          1      1   1103  23.0      1   86      1   4530    23      1  ...   
2       5663   6085  23402   1.0   5663   22     14   2448   362   5663  ...   
3         29     84  23402   1.0     29  157      1  14313     8     70  ...   
4      11265  11291  22506   3.0  11265    0     55  22659  2048  11265  ...   
...      ...    ...    ...   ...    ...  ...    ...    ...   ...    ...  ...   
59995      1    831  22506   5.0      1   36     15  14313    11     64  ...   
59996    122    122  23402   2.0    122   29  10390  14313    69    122  ...   
59997      3      3  22506  70.0      3    0      6  22659     1      3  ...   
59998      1      6   4358   0.0      1   -1    236   4530   245      1  ...   
59999      1      1  22506   3.0      1   19     11  14313    21      1  ...   

         F30    F31      F32   F33    F

### Model Setting

In [None]:
#test model(DeepFM)
# -*- coding: utf-8 -*-

"""
A pytorch implementation of DeepFM for rates prediction problem.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from time import time


class DeepFM(nn.Module):

    def __init__(self, feature_sizes, embedding_size=4,
                 hidden_dims=[32, 32], num_classes=1, dropout=[0.5, 0.5], 
                 use_cuda=True, verbose=False):

        super().__init__()
        self.field_size = len(feature_sizes)
        self.feature_sizes = feature_sizes
        self.embedding_size = embedding_size
        self.hidden_dims = hidden_dims
        self.num_classes = num_classes
        self.dtype = torch.long
        self.bias = torch.nn.Parameter(torch.randn(1))
        """
            check if use cuda
        """
        if use_cuda and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        """
            init fm part
        """
        self.fm_first_order_embeddings = nn.ModuleList(
            [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes])
        self.fm_second_order_embeddings = nn.ModuleList(
            [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes])
        """
            init deep part
        """
        all_dims = [self.field_size * self.embedding_size] + \
            self.hidden_dims + [self.num_classes]
        for i in range(1, len(hidden_dims) + 1):
            setattr(self, 'linear_'+str(i),
                    nn.Linear(all_dims[i-1], all_dims[i]))
            # nn.init.kaiming_normal_(self.fc1.weight)
            setattr(self, 'batchNorm_' + str(i),
                    nn.BatchNorm1d(all_dims[i]))
            setattr(self, 'dropout_'+str(i),
                    nn.Dropout(dropout[i-1]))

    def forward(self, Xi, Xv):
        """
        Forward process of network. 

        Inputs:
        - Xi: A tensor of input's index, shape of (N, field_size, 1)
        - Xv: A tensor of input's value, shape of (N, field_size, 1)
        """
        """
            fm part
        """

        fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)]
        fm_first_order = torch.cat(fm_first_order_emb_arr, 1)
        fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)]
        fm_sum_second_order_emb = sum(fm_second_order_emb_arr)
        fm_sum_second_order_emb_square = fm_sum_second_order_emb * \
            fm_sum_second_order_emb  # (x+y)^2
        fm_second_order_emb_square = [
            item*item for item in fm_second_order_emb_arr]
        fm_second_order_emb_square_sum = sum(
            fm_second_order_emb_square)  # x^2+y^2
        fm_second_order = (fm_sum_second_order_emb_square -
                           fm_second_order_emb_square_sum) * 0.5
        """
            deep part
        """
        deep_emb = torch.cat(fm_second_order_emb_arr, 1)
        deep_out = deep_emb
        for i in range(1, len(self.hidden_dims) + 1):
            deep_out = getattr(self, 'linear_' + str(i))(deep_out)
            deep_out = getattr(self, 'batchNorm_' + str(i))(deep_out)
            deep_out = getattr(self, 'dropout_' + str(i))(deep_out)
        """
            sum
        """
        total_sum = torch.sum(fm_first_order, 1) + \
                    torch.sum(fm_second_order, 1) + torch.sum(deep_out, 1) + self.bias
        return total_sum

    def fit(self, loader_train, loader_val, optimizer, epochs=100, verbose=False, print_every=100):
        """
        Training a model and valid accuracy.

        Inputs:
        - loader_train: I
        - loader_val: .
        - optimizer: Abstraction of optimizer used in training process, e.g., "torch.optim.Adam()""torch.optim.SGD()".
        - epochs: Integer, number of epochs.
        - verbose: Bool, if print.
        - print_every: Integer, print after every number of iterations. 
        """
        """
            load input data
        """
        model = self.train().to(device=self.device)
        criterion = F.binary_cross_entropy_with_logits

        for _ in range(epochs):
            for t, (xi, xv, y) in enumerate(loader_train):
                xi = xi.to(device=self.device, dtype=self.dtype)
                xv = xv.to(device=self.device, dtype=torch.float)
                y = y.to(device=self.device, dtype=torch.float)
                
                total = model(xi, xv)
                loss = criterion(total, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if verbose and t % print_every == 0:
                    print('Iteration %d, loss = %.4f' % (t, loss.item()))
                    self.check_accuracy(loader_val, model)
                    print()
    
    def check_accuracy(self, loader, model):
        if loader.dataset.train:
            print('Checking accuracy on validation set')
        else:
            print('Checking accuracy on test set')   
        num_correct = 0
        num_samples = 0
        model.eval()  # set model to evaluation mode
        with torch.no_grad():
            for xi, xv, y in loader:
                xi = xi.to(device=self.device, dtype=self.dtype)  # move to device, e.g. GPU
                xv = xv.to(device=self.device, dtype=torch.float)
                y = y.to(device=self.device, dtype=torch.bool)
                total = model(xi, xv)
                preds = (F.sigmoid(total) > 0.5)
                num_correct += (preds == y).sum()
                num_samples += preds.size(0)
            acc = float(num_correct) / num_samples
            print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))




                        

In [38]:
model = AdaBoostClassifier()

### Model Train and Inference

In [15]:
model.fit(X_train_encoded, train_y)





In [16]:
pred = model.predict_proba(X_test_encoded)
display(model.classes_)
display(pred)

array([0, 1], dtype=int64)

array([[0.49527716, 0.50472284],
       [0.50054639, 0.49945361],
       [0.4960248 , 0.5039752 ],
       ...,
       [0.50429818, 0.49570182],
       [0.49968878, 0.50031122],
       [0.49917394, 0.50082606]])

### Submission

In [17]:
sample_submission = pd.read_csv('../ctr_data/sample_submission.csv')
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0
...,...,...
4538536,TEST_4538536,0
4538537,TEST_4538537,0
4538538,TEST_4538538,0
4538539,TEST_4538539,0


In [18]:
sample_submission['Click'] = pred[:,1]
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0.504723
1,TEST_0000001,0.499454
2,TEST_0000002,0.503975
3,TEST_0000003,0.499660
4,TEST_0000004,0.505143
...,...,...
4538536,TEST_4538536,0.496466
4538537,TEST_4538537,0.503691
4538538,TEST_4538538,0.495702
4538539,TEST_4538539,0.500311


In [20]:
sample_submission.to_csv('../ctr_data/baseline_submission.csv', index=False)