In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [3]:
# Read ./data/merge.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/merge.csv')

In [5]:
# Remove the `eval_set` column from `df` in place.
df.drop(columns=['eval_set'], inplace=True)

In [7]:
# Build `df2` from an independent copy of `df`, leaving out the descriptive and
# identifier columns listed below. `df` itself is not modified.
df2 = df.copy().drop(
    columns=[
        'product_name',
        'aisle',
        'department',
        'order_dow',
        'order_hour_of_day',
        'product_id',
        'aisle_id',
        'department_id',
    ]
)

In [9]:
# Display (row count, column count) of the reduced frame.
len(df2.index), len(df2.columns)

(32434489, 11)

In [17]:
# Integer codes for the customer grade labels
# ('최우수' = top excellent, '우수' = excellent, '일반' = regular, '이탈 위험' = churn risk).
grade_mapping = {'VIP': 4, '최우수': 3, '우수': 2, '일반': 1, '이탈 위험':0}

# Encode only while the column still holds the raw labels. Re-running this cell on an
# already-encoded (numeric) column would look the integers up in the mapping, find no
# match, and silently overwrite every grade with NaN.
if not pd.api.types.is_numeric_dtype(df2['grade']):
    df2['grade'] = df2['grade'].map(grade_mapping)

In [19]:
# Separate the target column `reordered_x` from the remaining feature columns.
y = df2['reordered_x']
X = df2.drop(columns='reordered_x')

# NOTE: SMOTE oversampling of the target classes was drafted for this step but is
# disabled, so the split below is performed on the original (unresampled) data:
#   smote = SMOTE(random_state=42)
#   X_resampled, y_resampled = smote.fit_resample(X, y)

# Hold out 30% of the rows for testing and train on the remaining 70%
# (test_size=0.3); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score

# Single-step pipeline wrapping a LightGBM classifier. class_weight='balanced'
# asks LightGBM to reweight the classes during training.
pipeline = Pipeline(
    steps=[
        (
            'classifier',
            LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
        ),
    ]
)

# Fit on the training split, then predict labels for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for label, metric in (
    ("F1 Score:", f1_score),
    ("Accuracy:", accuracy_score),
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("Classification Report:\n", classification_report),
):
    print(label, metric(y_test, y_pred))

[LightGBM] [Info] Number of positive: 13392089, number of negative: 9312053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1196
[LightGBM] [Info] Number of data points in the train set: 22704142, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
F1 Score: 0.7809504015625088
Accuracy: 0.7411242373987279
precision_score: 0.7788804456125646
recall_score: 0.7830313890772729
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.68      0.68   3995900
           1       0.78      0.78      0.78   5734447

    accuracy                           0.74   9730347
   macro avg       0.73      0.73      0.73   9730347
weighted avg 

In [23]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

# Single-step pipeline containing only a CatBoost classifier (training log silenced).
pipeline = Pipeline(
    steps=[
        ('classifier', CatBoostClassifier(random_state=42, verbose=0)),
    ]
)

# Train on the training split and predict labels for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Print every evaluation metric for the held-out predictions, in a fixed order.
for label, metric in (
    ("F1 Score:", f1_score),
    ("Accuracy:", accuracy_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("Classification Report:\n", classification_report),
):
    print(label, metric(y_test, y_pred))

F1 Score: 0.8036290734785234
Accuracy: 0.7506624378349508
precision score: 0.7498553721941829
recall score: 0.8657110267127763
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.59      0.66   3995900
           1       0.75      0.87      0.80   5734447

    accuracy                           0.75   9730347
   macro avg       0.75      0.73      0.73   9730347
weighted avg       0.75      0.75      0.74   9730347



In [25]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import numpy as np


# Single-step pipeline around an XGBoost classifier with trees limited to depth 10.
pipeline = Pipeline(
    steps=[
        ('classifier', XGBClassifier(random_state=42, n_jobs=-1, max_depth=10)),
    ]
)

# Train on the training split and predict labels for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Print every evaluation metric for the held-out predictions, in a fixed order.
for label, metric in (
    ("F1 Score:", f1_score),
    ("Accuracy:", accuracy_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("Classification Report:\n", classification_report),
):
    print(label, metric(y_test, y_pred))

F1 Score: 0.8036718852102668
Accuracy: 0.7512846150296593
precision score: 0.7513811568041503
recall score: 0.8637851217388529
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.59      0.66   3995900
           1       0.75      0.86      0.80   5734447

    accuracy                           0.75   9730347
   macro avg       0.75      0.73      0.73   9730347
weighted avg       0.75      0.75      0.74   9730347

