<a href="https://colab.research.google.com/github/sunghyunjun/big-data-analysis-dataq-cert/blob/main/dataq_type2_5folds_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [None]:
def preprocessing(df):
    """Clean the raw customer table and add derived / one-hot encoded features.

    Steps, in order:
      1. Fill missing values in "환불금액" with 0.
      2. Clamp negative values of "총구매액" and "최대구매액" to 0, then apply
         log(1 + x) to both columns.
      3. Add "refund" (1 when "환불금액" > 0, else 0) and
         "period" (1 / ("구매주기" + 1)).
      4. One-hot encode "주구매상품" and "주구매지점" and drop the originals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "환불금액", "총구매액", "최대구매액",
        "구매주기", "주구매상품" and "주구매지점". Other columns are passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()

    # Fill missing values. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # does not update the parent frame under pandas Copy-on-Write.
    df["환불금액"] = df["환불금액"].fillna(0)

    # Clamp negative amounts to 0 so the log transform is defined, then
    # apply log(1 + x).
    for col in ("총구매액", "최대구매액"):
        df[col] = np.log1p(df[col].clip(lower=0))

    # Derived features.
    df["refund"] = (df["환불금액"] > 0).astype(int)
    df["period"] = 1 / (df["구매주기"] + 1)

    # One-hot encoding: dummy columns are appended after the remaining
    # columns and the original categorical columns are removed.
    cat_cols = ["주구매상품", "주구매지점"]
    df_dum = pd.get_dummies(df[cat_cols])
    df = pd.concat([df.drop(columns=cat_cols), df_dum], axis=1)

    return df

In [None]:
# Download the training features, test features and training labels.
# All three CSV files are EUC-KR encoded.
_csv_urls = (
    'https://raw.githubusercontent.com/Datamanim/dataq/main/X_train.csv',
    'https://raw.githubusercontent.com/Datamanim/dataq/main/X_test.csv',
    'https://raw.githubusercontent.com/Datamanim/dataq/main/y_train.csv',
)
df_X, df_X_test, df_y = (pd.read_csv(url, encoding='euc-kr') for url in _csv_urls)

In [None]:
# Apply the same preprocessing to the train and test features.
df_X, df_X_test = map(preprocessing, (df_X, df_X_test))

# One-hot encoding can yield different column sets for train and test.
# Align test to the train columns (join="left"); columns missing on the
# test side are created and filled with 0.
df_X, df_X_test = df_X.align(df_X_test, join="left", axis=1, fill_value=0)

In [None]:
# Scale the numeric columns only. The scaler is fitted on the training
# data and then applied unchanged to both train and test.
# (StandardScaler can be swapped in here as an alternative.)
scale_cols = ["총구매액", "최대구매액", "환불금액", "내점일수", "내점당구매건수", "주말방문비율", "구매주기"]
scaler = MinMaxScaler().fit(df_X[scale_cols])
df_X[scale_cols] = scaler.transform(df_X[scale_cols])
df_X_test[scale_cols] = scaler.transform(df_X_test[scale_cols])

In [None]:
# Convert the DataFrames to NumPy arrays for the estimators.
# cust_id is dropped by name rather than by position, so the result does not
# depend on it being the first column.
# dtype=float is forced because a frame mixing float columns with boolean
# dummy columns would otherwise be exported as an object-dtype array.
X = df_X.drop(columns="cust_id").to_numpy(dtype=float)
X_test = df_X_test.drop(columns="cust_id").to_numpy(dtype=float)
# Cast the target to int (some estimators do not accept non-integer labels).
y = df_y["gender"].to_numpy().astype(int)

In [None]:
# Prepare the 5-fold ensemble: generate the fold indices once up front.
NFOLDS = 5
SEED = 0
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# skf.split yields (train_idx, valid_idx) pairs; transpose them into two
# parallel lists, one entry per fold.
train_folds, valid_folds = map(list, zip(*skf.split(X, y)))

In [None]:
# 5-fold ensemble training: fit one classifier per fold and record its
# validation AUC.
# Alternative estimators that can be swapped in:
#   RandomForestClassifier(random_state=SEED, max_depth=5, criterion="entropy")
#   XGBClassifier(random_state=SEED, learning_rate=0.01, max_depth=5)
clf_list, auc_list = [], []
for fit_idx, eval_idx in zip(train_folds, valid_folds):
    clf = LogisticRegression(C=0.25, class_weight="balanced")
    clf.fit(X[fit_idx], y[fit_idx])

    # predict_proba returns one column per class; column 1 is taken as the
    # probability of class 1, which is what the AUC needs.
    fold_proba = clf.predict_proba(X[eval_idx])[:, 1]
    auc = roc_auc_score(y[eval_idx], fold_proba)

    clf_list.append(clf)
    auc_list.append(auc)
    print(auc)

0.6742088731499769
0.7016122717108526
0.6931376217034569
0.6785114546989064
0.6650072977481235


In [None]:
# Average of the per-fold validation AUC scores.
mean_auc = np.mean(auc_list)
print(f"mean_auc: {mean_auc}")

mean_auc: 0.6824955038022633


In [None]:
# Predict: class-1 probability on the test set from every fold model.
pred_list = [clf.predict_proba(X_test)[:, 1] for clf in clf_list]

# 5-fold ensemble by soft voting: simple average across the fold models.
preds = np.mean(pred_list, axis=0)

In [None]:
# Save the result: one row per test customer with the averaged probability.
df_result = pd.DataFrame({"cust_id": df_X_test["cust_id"], "gender": preds})
df_result.to_csv("0000.csv", index=False)

# Read the file back to check what was written.
df_check = pd.read_csv("0000.csv", index_col="cust_id")
print(df_check.head())

           gender
cust_id          
3500     0.526786
3501     0.241209
3502     0.322448
3503     0.472974
3504     0.538400
