### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [3]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 50000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 50000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')

### EDA 1 : Sparse and Dense

In [4]:
train=df

### Data Preprocessing 1 : Select x, y

In [5]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

### Data Preprocessing 2 : Fill NaN

In [6]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

  0%|          | 0/39 [00:00<?, ?it/s]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_x[col].fillna(0, inplace=True)
100%|██████████| 39/39 [00:01<00:00, 20.25it/s]


In [7]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [8]:
categorical_target=list(train_x.dtypes[train_x.dtypes == "float"].index)
# print(categorical_target)
# print(encoding_target)
# print(X_train_encoded)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_train_encoded, train_y, test_size=0.15, random_state=9608)

cat = CatBoostClassifier(iterations=10000,
                         learning_rate=0.01,
                         l2_leaf_reg=5e-3,
                         max_depth=16,
                         task_type="GPU",
                         devices='0')

cat.fit(X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=50,
        verbose=True)

0:	learn: 0.6857584	test: 0.6911395	best: 0.6911395 (0)	total: 732ms	remaining: 2h 2m 3s
1:	learn: 0.6782159	test: 0.6890951	best: 0.6890951 (1)	total: 1.45s	remaining: 2h 52s
2:	learn: 0.6692149	test: 0.6873042	best: 0.6873042 (2)	total: 2.18s	remaining: 2h 1m 4s
3:	learn: 0.6609018	test: 0.6853452	best: 0.6853452 (3)	total: 2.9s	remaining: 2h 1m 1s
4:	learn: 0.6538723	test: 0.6836779	best: 0.6836779 (4)	total: 3.62s	remaining: 2h 36s
5:	learn: 0.6485888	test: 0.6820352	best: 0.6820352 (5)	total: 4.32s	remaining: 2h
6:	learn: 0.6406497	test: 0.6804003	best: 0.6804003 (6)	total: 5.04s	remaining: 1h 59m 57s
7:	learn: 0.6338496	test: 0.6790317	best: 0.6790317 (7)	total: 5.76s	remaining: 1h 59m 49s
8:	learn: 0.6279350	test: 0.6776992	best: 0.6776992 (8)	total: 6.46s	remaining: 1h 59m 36s
9:	learn: 0.6220269	test: 0.6762759	best: 0.6762759 (9)	total: 7.18s	remaining: 1h 59m 29s
10:	learn: 0.6159938	test: 0.6748253	best: 0.6748253 (10)	total: 7.89s	remaining: 1h 59m 25s
11:	learn: 0.6087422

<catboost.core.CatBoostClassifier at 0x20341554710>

In [10]:
preds = cat.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(acc_score)

score = cat.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, score, pos_label=1)
auc_score = metrics.auc(fpr, tpr)
print(auc_score)

0.6477333333333334
0.6984777890718509


In [11]:
test_pred = cat.predict_proba(X_test_encoded)

sample_submission = pd.read_csv('../ctr_data/sample_submission.csv')
sample_submission['Click'] = test_pred[:,1]
sample_submission.to_csv('../ctr_data/catboost_submission.csv', index=False)