In [1]:
import pandas as pd
import numpy as np
import os
data_path = 'D:\\criteo_ctr_mlops\\data'

- train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies.
- test - Test set. 1 day of ads to for testing your model predictions. 

In [15]:
with open("D:\\criteo_ctr_mlops\\data\\train.csv", "r", encoding="utf-8") as f_in, open("train_sample.csv", "w", encoding="utf-8") as f_out:
    for i in range(100_001):
        line = f_in.readline()
        if not line:
            break
        f_out.write(line)


In [None]:
chunk_size = 100_000  # 10만 행씩 나눠서
chunks = pd.read_csv(os.path.join(data_path, "train.csv"), chunksize=chunk_size)

for i, chunk in enumerate(chunks):
    # 각 chunk에 대해 전처리나 EDA
    print(f"Chunk {i} shape: {chunk.shape}")
    # 예: target 값 비율 확인
    print(chunk['click'].value_counts(normalize=True))
    if i == 5:
        break  # 처음 6개만 샘플로 봄
    


In [16]:
df = pd.read_csv('train_sample.csv')

In [18]:
df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [8]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np

def preprocess_and_save(csv_path, output_dir, chunk_size=100_000):
    os.makedirs(output_dir, exist_ok=True)

    reader = pd.read_csv(csv_path, chunksize=chunk_size)
    label_encoders = {}
    scaler = MinMaxScaler()

    # 첫 번째 chunk에서 수치형 스케일링과 범주형 인코더 학습
    first_chunk = next(reader)
    num_cols = [col for col in first_chunk.columns if col.startswith('I')]
    cat_cols = [col for col in first_chunk.columns if col.startswith('C')]

    # 수치형 null 처리
    first_chunk[num_cols] = first_chunk[num_cols].fillna(0)
    scaler.fit(first_chunk[num_cols])

    # 범주형 인코딩 학습
    for col in cat_cols:
        le = LabelEncoder()
        le.fit(first_chunk[col].astype(str).fillna(''))
        label_encoders[col] = le

    # 첫 chunk 다시 포함해서 반복
    for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size)):
        chunk[num_cols] = chunk[num_cols].fillna(0)
        chunk[num_cols] = scaler.transform(chunk[num_cols])

        for col in cat_cols:
            chunk[col] = label_encoders[col].transform(chunk[col].astype(str).fillna(''))

        # 저장
        output_path = os.path.join(output_dir, f"chunk_{i:03}.npz")
        np.savez_compressed(
            output_path,
            X=chunk[num_cols + cat_cols].values.astype(np.float32),
            y=chunk["click"].values.astype(np.float32)
        )

        print(f"Saved {output_path}")


### Retain (useful features):
- click (target)
- hour (extract day/hour from this)
- C1, banner_pos (user context)
- site_id, site_domain, site_category (site features)
- app_id, app_domain, app_category (app features)
- device_id, device_ip, device_model (device/user ID — may want to hash)
- device_type, device_conn_type
- C14~C21 (engineered categorical features)

### isposable or to transform:
- id: unique row identifier. Not predictive → drop
- device_id, device_ip: may leak user info, and are high-cardinality. Either:
- hash them into fixed buckets or drop them if not using high-cardinality methods (e.g., LightGBM can handle, but not basic DNNs)

In [4]:
df = train_df.drop('id', axis=1)

In [5]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

label_encoder = LabelEncoder()
categorical_cols = df.columns

for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])
    
corr1 = df.corr()['flag'].sort_values()

plt.figure(figsize=(20,2))
sns.heatmap(data = pd.DataFrame(corr1).T, annot=True, annot_kws={"size": 12}, fmt = '.2f', linewidths=0.5, cmap='coolwarm') # .T
plt.title('Correlation with Flag Variable', fontsize=20)
plt.yticks(rotation=90)
plt.tick_params(axis="x", labelsize=13)
plt.tick_params(axis="y", labelsize=16)
plt.show()

KeyError: 'flag'

In [None]:
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [None]:
test_df.head()