## Import

In [11]:
# 표준 라이브러리
import os
import random

# 데이터 처리 및 분석 라이브러리
import numpy as np
import pandas as pd
import polars as pl
import duckdb

# 진행상황 표시 라이브러리
from tqdm import tqdm

# 머신러닝 관련 라이브러리
from sklearn.model_selection import train_test_split



## Data Load

In [17]:
train_sample = duckdb.query("""
    SELECT * 
    FROM '../data/train.parquet'
    limit 5000000
""").to_df()
train_sample.head()

Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,seq,l_feat_1,l_feat_2,l_feat_3,l_feat_4,...,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30,clicked
0,1.0,7.0,36,5,13,"9,18,269,516,57,97,527,74,317,311,269,479,57,7...",1.0,2.0,1.0,23.0,...,0.070092,0.070092,0.011682,0.004673,0.087226,0.049843,0.015576,0.040498,0.051401,0
1,1.0,7.0,2,5,8,"9,144,269,57,516,97,527,74,315,317,311,269,479...",2.0,2.0,3.0,17.0,...,0.07299,0.07299,0.012165,0.004866,0.045416,0.051904,0.01622,0.042172,0.026763,0
2,1.0,7.0,36,5,11,"269,516,57,97,165,527,74,77,317,269,75,450,15,...",1.0,2.0,1.0,7.0,...,0.057177,0.057177,0.00953,0.003812,0.035577,0.081318,0.012706,0.033036,0.062898,0
3,1.0,8.0,37,5,11,"269,57,516,21,214,269,561,214,269,561,247,516,...",2.0,2.0,2.0,7.0,...,0.100449,0.100449,0.016741,0.006697,0.062502,0.07143,0.022322,0.058037,0.073659,0
4,2.0,7.0,37,5,7,"144,269,57,516,35,479,57,516,527,74,77,318,193...",2.0,2.0,3.0,24.0,...,0.064512,0.064512,0.010752,0.004301,0.040141,0.045875,0.014336,0.037274,0.023654,0


In [None]:
test = duckdb.query("""
    SELECT *
    FROM '../data/test.parquet'
""").to_df()
test = test.drop(columns='ID')
test.head()

Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,seq,l_feat_1,l_feat_2,l_feat_3,l_feat_4,...,history_b_21,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30
0,2.0,6.0,46,7,13,"321,269,57,516,479,516,57,479,35,57,516,403,45...",2.0,2.0,2.0,19.0,...,0.008702,0.071199,0.071199,0.011866,0.004747,0.044302,0.05063,0.015822,0.041137,0.104432
1,2.0,8.0,29,7,21,"57,35,479,57,463,212,193,151,463,193,74,77,207...",2.0,2.0,2.0,7.0,...,0.024553,0.200889,0.200889,0.033482,0.013393,0.124998,0.142854,0.044642,0.116069,0.073659
2,1.0,6.0,37,7,19,"57,516,97,74,527,77,318,315,317,311,269,479,57...",2.0,2.0,3.0,7.0,...,0.021739,0.177867,0.177867,0.029645,0.011858,0.110673,0.126483,0.039526,0.102768,0.065218
3,2.0,7.0,41,7,9,"144,321,57,479,57,479,35,57,516,165,74,527,318...",2.0,2.0,2.0,7.0,...,0.006614,0.108234,0.054117,0.009019,0.003608,0.033673,0.038483,0.012026,0.031268,0.039686
4,1.0,8.0,2,7,18,"269,57,516,342,516,403,173,457,343,403,457,173...",2.0,2.0,3.0,8.0,...,0.006532,0.053442,0.053442,0.008907,0.003563,0.033253,0.038003,0.011876,0.030878,0.039191


In [18]:
print(train_sample.shape)
print(test.shape)

(5000000, 119)
(1527298, 118)


# EDA

In [19]:
print(train_sample.isna().sum())
print(test.isna().sum())

gender          7627
age_group       7627
inventory_id       0
day_of_week        0
hour               0
                ... 
history_b_27    7627
history_b_28    7627
history_b_29    7627
history_b_30    7627
clicked            0
Length: 119, dtype: int64
gender          936
age_group       936
inventory_id      0
day_of_week       0
hour              0
               ... 
history_b_26    936
history_b_27    936
history_b_28    936
history_b_29    936
history_b_30    936
Length: 118, dtype: int64


In [20]:
train_clean = train_sample.dropna()
test_clean = test.dropna()

In [22]:
print(train_clean.isna().sum())
print(test_clean.isna().sum())
print(train_clean.shape)
print(test_clean.shape)

gender          0
age_group       0
inventory_id    0
day_of_week     0
hour            0
               ..
history_b_27    0
history_b_28    0
history_b_29    0
history_b_30    0
clicked         0
Length: 119, dtype: int64
gender          0
age_group       0
inventory_id    0
day_of_week     0
hour            0
               ..
history_b_26    0
history_b_27    0
history_b_28    0
history_b_29    0
history_b_30    0
Length: 118, dtype: int64
(4473236, 119)
(1434828, 118)


In [23]:
train_clean['clicked'].value_counts()
# 클래스 불균형 크다. 샘플링 해줘야할듯 

clicked
0    4388559
1      84677
Name: count, dtype: int64

## 모델링

In [24]:
from sklearn.model_selection import train_test_split
train_clean_x = train_clean.drop(columns='clicked')
train_clean_y = train_clean['clicked']
print(train_clean_x)
print(train_clean_y)


        gender age_group inventory_id day_of_week hour  \
0          1.0       7.0           36           5   13   
1          1.0       7.0            2           5   08   
2          1.0       7.0           36           5   11   
3          1.0       8.0           37           5   11   
4          2.0       7.0           37           5   07   
...        ...       ...          ...         ...  ...   
4999992    2.0       6.0           46           7   07   
4999993    2.0       4.0           46           7   11   
4999994    2.0       4.0           88           7   15   
4999995    1.0       3.0           46           7   12   
4999998    1.0       6.0           31           7   12   

                                                       seq  l_feat_1  \
0        9,18,269,516,57,97,527,74,317,311,269,479,57,7...       1.0   
1        9,144,269,57,516,97,527,74,315,317,311,269,479...       2.0   
2        269,516,57,97,165,527,74,77,317,269,75,450,15,...       1.0   
3        269,57

In [25]:
X_train, X_val, Y_train, Y_val = train_test_split(
    train_clean_x,train_clean_y,
    test_size=0.1,
    random_state=42,
    stratify = train_clean_y
)

In [26]:
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)

(4025912, 118)
(4025912,)
(447324, 118)
(447324,)


In [27]:
# 딥러닝 관련 라이브러리
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split