### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [9]:
train_path = '../ctr_data/train.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()
test = pd.read_csv('../ctr_data/test.csv')
df.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_10175700,0,LLKAVMO,EKJSVRG,,22.0,YKHABYT,37,TDHZHOJ,LPYPUNA,...,FGOVFJM,CNEYHMW,191.0,27.0,ABEHJLN,IRUDRFB,14.0,QSDLVTN,0.0,ZMXDDNB
1,TRAIN_13243952,0,LWKZZCN,LESKWXZ,,,ESTDBCQ,0,DXRALFP,FTPHMPQ,...,TTUOGIK,AZZYLIF,38515.0,0.0,LFGAQTQ,IRUDRFB,,ACKQKGP,0.0,XPNEEUX
2,TRAIN_24238112,0,SPNTBWW,DXHJHRP,IAGJDOH,8.0,PJXKTHM,156,CAESBUA,ZTCWAXO,...,NZGEZLW,GTISJWW,197913.0,0.0,GYSHAAZ,MNBSNJV,,UGWXMOV,0.0,UUPPZAU
3,TRAIN_00039770,0,QSVESDG,RKJYNLT,,7.0,CMXXRSC,59,VZKNATZ,LPYPUNA,...,OACUGUM,AZZYLIF,1343.0,0.0,FEBVXFF,IRUDRFB,7.0,ECOIAXE,0.0,ZSZKXIR
4,TRAIN_24849068,0,VBRKXXM,RNCJZPY,HUUWYSX,51.0,EEDVPZR,130,TYIVAOY,FTPHMPQ,...,LLMKLSJ,GTISJWW,99.0,6.0,QUZHAXC,IRUDRFB,4.0,KTCQCEU,0.0,GVRZOHG


In [3]:
# 제공된 train 데이터와 test 데이터를 불러옵니다
train = pd.read_csv('../ctr_data/train.csv')
test = pd.read_csv('../ctr_data/test.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

### EDA 1 : Sparse and Dense

In [10]:
train=df
train.head()


Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_10175700,0,LLKAVMO,EKJSVRG,,22.0,YKHABYT,37,TDHZHOJ,LPYPUNA,...,FGOVFJM,CNEYHMW,191.0,27.0,ABEHJLN,IRUDRFB,14.0,QSDLVTN,0.0,ZMXDDNB
1,TRAIN_13243952,0,LWKZZCN,LESKWXZ,,,ESTDBCQ,0,DXRALFP,FTPHMPQ,...,TTUOGIK,AZZYLIF,38515.0,0.0,LFGAQTQ,IRUDRFB,,ACKQKGP,0.0,XPNEEUX
2,TRAIN_24238112,0,SPNTBWW,DXHJHRP,IAGJDOH,8.0,PJXKTHM,156,CAESBUA,ZTCWAXO,...,NZGEZLW,GTISJWW,197913.0,0.0,GYSHAAZ,MNBSNJV,,UGWXMOV,0.0,UUPPZAU
3,TRAIN_00039770,0,QSVESDG,RKJYNLT,,7.0,CMXXRSC,59,VZKNATZ,LPYPUNA,...,OACUGUM,AZZYLIF,1343.0,0.0,FEBVXFF,IRUDRFB,7.0,ECOIAXE,0.0,ZSZKXIR
4,TRAIN_24849068,0,VBRKXXM,RNCJZPY,HUUWYSX,51.0,EEDVPZR,130,TYIVAOY,FTPHMPQ,...,LLMKLSJ,GTISJWW,99.0,6.0,QUZHAXC,IRUDRFB,4.0,KTCQCEU,0.0,GVRZOHG


### EDA 2 : Imbalance

In [7]:
click = train['Click'].value_counts(normalize=True)

click_figure = px.bar(click,
             x=['Not Clicked : 0', 'Clicked : 1'],
             y=click.values.tolist(),
             labels={'x': 'Value', 'y': 'Percentage'},
             width = 450,
             height = 500
            )

# 그래프 표시
click_figure.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

### Data Preprocessing 1 : Select x, y

In [11]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

### Data Preprocessing 2 : Fill NaN

In [12]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



10

### Data Preprocessing 3 : Count Encoding

In [13]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

### Model Setting

In [14]:
model = AdaBoostClassifier()

### Model Train and Inference

In [15]:
model.fit(X_train_encoded, train_y)





In [16]:
pred = model.predict_proba(X_test_encoded)
display(model.classes_)
display(pred)

array([0, 1], dtype=int64)

array([[0.49527716, 0.50472284],
       [0.50054639, 0.49945361],
       [0.4960248 , 0.5039752 ],
       ...,
       [0.50429818, 0.49570182],
       [0.49968878, 0.50031122],
       [0.49917394, 0.50082606]])

### Submission

In [17]:
sample_submission = pd.read_csv('../ctr_data/sample_submission.csv')
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0
...,...,...
4538536,TEST_4538536,0
4538537,TEST_4538537,0
4538538,TEST_4538538,0
4538539,TEST_4538539,0


In [18]:
sample_submission['Click'] = pred[:,1]
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0.504723
1,TEST_0000001,0.499454
2,TEST_0000002,0.503975
3,TEST_0000003,0.499660
4,TEST_0000004,0.505143
...,...,...
4538536,TEST_4538536,0.496466
4538537,TEST_4538537,0.503691
4538538,TEST_4538538,0.495702
4538539,TEST_4538539,0.500311


In [20]:
sample_submission.to_csv('../ctr_data/baseline_submission.csv', index=False)