### Import

In [11]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [12]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [13]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')
df.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_08512141,0,ESNJDTQ,BTBABXD,,48.0,TPQFONI,4,ESNLNXJ,VAWXMCR,...,CAQLQNO,GTISJWW,7.0,1.0,ZQLPFXE,IRUDRFB,6.0,IZMSJKY,0.0,RIMJWQY
1,TRAIN_18233289,0,AUTHYLP,BZHTQNB,NOENOSO,14.0,PPVVQDT,2,AUFSHTC,FTPHMPQ,...,OACUGUM,GTISJWW,987.0,0.0,DYKYQDI,IRUDRFB,6.0,UGWXMOV,0.0,UUPPZAU
2,TRAIN_18633490,0,YNPOZCY,EEGSMJR,IAGJDOH,13.0,DSHJNJK,212,DYBZBQS,HFWFEGT,...,FWFAWLF,GTISJWW,82.0,10.0,IHOYDZQ,IRUDRFB,8.0,DIFTUXH,0.0,AKXMWGH
3,TRAIN_03030943,0,RVMYLQZ,HTZNQPW,,3.0,CHAJYKQ,60,QRJFJSN,OFKQGTY,...,SLXYBBG,GTISJWW,1939.0,0.0,TIZLZJH,IRUDRFB,7.0,OFHONDV,0.0,UJVQVLH
4,TRAIN_06643575,0,CIDEMZA,RLFLWTQ,IAGJDOH,,FPKXCHJ,-1,HDBIDPR,FTPHMPQ,...,XGCLAGJ,ENBEWZP,17195.0,,FJJCMEA,IRUDRFB,,QSULIHT,0.0,PXCBWJA


### EDA 1 : Sparse and Dense

In [14]:
train=df

### Data Preprocessing 1 : Select x, y

In [15]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

### Data Preprocessing 2 : Fill NaN

In [16]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_x[col].fillna(0, inplace=True)
100%|██████████| 39/39 [00:01<00:00, 20.98it/s]


In [17]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [25]:
categorical_target=list(train_x.dtypes[train_x.dtypes == "float"].index)
print(categorical_target)
print(encoding_target)
print(X_train_encoded)

['F04', 'F11', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']
['F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28', 'F30', 'F31', 'F34', 'F35', 'F37', 'F39']
         F01    F02    F03   F04    F05   F06  F07    F08  F09    F10  ...  \
0         57     96  23154  48.0     57     4   42   7501   57     76  ...   
1          1      1   1083  14.0      1     2    8  22710  192      1  ...   
2          1      1  22645  13.0      1   212   45    432  188      1  ...   
3        175    179  23154   3.0    175    60   40   2457    1    179  ...   
4          1      1  22645   0.0      1    -1   11  22710    8      1  ...   
...      ...    ...    ...   ...    ...   ...  ...    ...  ...    ...  ...   
59995      1      1    431   5.0      1   547   34    581   22      1  ...   
59996     49    122  23154   2.0     49     0  240  14381   42     49  ...   
59997      1     70   4378   1.0     

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train_encoded, train_y, test_size=0.15, random_state=9608)

cat = CatBoostClassifier(iterations=10000,
                         learning_rate=0.01,
                         l2_leaf_reg=5e-3,
                         max_depth=16,
                         task_type="GPU",
                         devices='0')

cat.fit(X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=50,
        verbose=True)

0:	learn: 0.6801029	test: 0.6907429	best: 0.6907429 (0)	total: 745ms	remaining: 2h 4m 9s
1:	learn: 0.6682862	test: 0.6887711	best: 0.6887711 (1)	total: 1.46s	remaining: 2h 1m 59s
2:	learn: 0.6568287	test: 0.6869480	best: 0.6869480 (2)	total: 2.19s	remaining: 2h 1m 31s
3:	learn: 0.6476350	test: 0.6851586	best: 0.6851586 (3)	total: 2.9s	remaining: 2h 48s
4:	learn: 0.6377076	test: 0.6832777	best: 0.6832777 (4)	total: 3.61s	remaining: 2h 24s
5:	learn: 0.6311814	test: 0.6815406	best: 0.6815406 (5)	total: 4.31s	remaining: 1h 59m 43s
6:	learn: 0.6215843	test: 0.6797521	best: 0.6797521 (6)	total: 5.03s	remaining: 1h 59m 33s
7:	learn: 0.6110514	test: 0.6782569	best: 0.6782569 (7)	total: 5.75s	remaining: 1h 59m 37s
8:	learn: 0.6002699	test: 0.6763793	best: 0.6763793 (8)	total: 6.47s	remaining: 1h 59m 41s
9:	learn: 0.5903505	test: 0.6746532	best: 0.6746532 (9)	total: 7.2s	remaining: 1h 59m 49s
10:	learn: 0.5792736	test: 0.6734039	best: 0.6734039 (10)	total: 7.92s	remaining: 1h 59m 54s
11:	learn: 

<catboost.core.CatBoostClassifier at 0x1e582c10790>

In [19]:
preds = cat.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(acc_score)

score = cat.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, score, pos_label=1)
auc_score = metrics.auc(fpr, tpr)
print(auc_score)

0.6423333333333333
0.6943037672440925


In [20]:
test_pred = cat.predict_proba(X_test_encoded)

sample_submission = pd.read_csv('../ctr_data/sample_submission.csv')
sample_submission['Click'] = test_pred[:,1]
sample_submission.to_csv('../ctr_data/catboost_submission.csv', index=False)