In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import f1_score, mean_squared_error
import random, os
from lightgbm import LGBMClassifier, LGBMRegressor
import warnings
warnings.filterwarnings('ignore')
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
import xgboost

import optuna

In [2]:
def seed_everything(seed):
    """Seed the random sources this notebook touches.

    Writes ``str(seed)`` to the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the samplers, splitters and models below.
SEED = 25
seed_everything(SEED)

## Data Load

In [13]:
# Load the raw train / test tables.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# These two columns are not used as features.
train = train.drop(columns=['TIMESTAMP', 'PRODUCT_ID'])
test = test.drop(columns=['TIMESTAMP', 'PRODUCT_ID'])

cat_features = ['LINE', 'PRODUCT_CODE']
# Everything in the test table that is not categorical is treated as numeric.
num_features = [i for i in test.columns if i not in cat_features]

# Both target columns live in `train`; they must never end up in the features.
target_cols = ['Y_Class', 'Y_Quality']

#y = train['Y_Class']
y = train['Y_Quality']

# Impute missing values with the TRAIN median (computed once per column) so
# that no statistic is derived from the test table.
for col in num_features:
    col_median = train[col].median()
    train[col] = train[col].fillna(col_median)
    test[col] = test[col].fillna(col_median)

# Standardize using statistics fitted on the train table only.
scaler = StandardScaler()
#scaler = QuantileTransformer()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

use_cat = True
if use_cat:
    # Drop both targets explicitly instead of relying on the later column
    # selection to remove Y_Quality.
    X = train.drop(columns=target_cols)
    X_test = test
else:
    X = train[num_features]
    X_test = test[num_features]


# NOTE(review): nothing in this cell uses a name from `math`; the star import
# is kept only because cells outside this view may depend on it.
from math import *

corr = pd.read_csv('correlation/correlation.csv')
# Remove the last row (the Y_Quality entry).
corr = corr.iloc[:-1,:]
# Keep features whose absolute correlation is at least 0.1 ...
important = list(corr[abs(corr['correlation'])>=0.1]['feature'])
# ... and make sure a target can never slip through, whatever the row order
# of the correlation file is.
important = [feature for feature in important if feature not in target_cols]
X = X[important]
X_test = X_test[important]
X

Unnamed: 0,X_368,X_367,X_335,X_2779,X_1849,X_2467,X_2466,X_1833,X_2780,X_2841,...,X_2401,X_189,X_1012,X_1010,X_699,X_318,X_1525,X_1524,X_1523,X_1407
0,-0.065183,-0.063973,-0.025369,0.182827,-1.526917,-1.526917,0.011794,0.011794,-1.265229,-0.046216,...,-0.195467,2.254073,2.268425,2.268425,-0.057301,0.054701,0.057928,0.057928,0.057928,7.516550
1,-0.065183,-0.063973,-0.025369,-5.745796,1.200372,1.200372,-6.136643,-6.136643,1.257635,-6.322596,...,4.215454,-0.402527,-0.508954,-0.508954,-0.057301,0.054701,0.057928,0.057928,0.057928,0.026347
2,-0.065183,-0.063973,-0.025369,-0.340286,-2.345104,-2.345104,-0.349879,-0.349879,-2.274374,-0.064047,...,-0.195467,2.254073,2.268425,2.268425,-0.057301,0.054701,0.057928,0.057928,0.057928,0.026347
3,-0.065183,-0.063973,-0.025369,0.008456,0.654914,0.654914,-0.169043,-0.169043,0.753062,-0.099708,...,4.215454,-0.402527,-0.508954,-0.508954,-0.057301,0.054701,0.057928,0.057928,0.057928,0.026347
4,-0.065183,-0.063973,-0.025369,-0.165915,-2.617833,-2.617833,-0.349879,-0.349879,-2.274374,-0.028386,...,-0.195467,2.254073,2.268425,2.268425,-0.057301,0.054701,0.057928,0.057928,0.057928,0.026347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,-0.065183,-0.063973,-0.025369,0.008456,0.109456,0.109456,0.011794,0.011794,0.122346,0.007275,...,-0.195467,-0.171518,-0.161782,-0.161782,-0.057301,0.054701,0.057928,0.057928,0.057928,0.026347
594,-0.065183,-0.063973,-0.025369,1.229055,0.109456,0.109456,1.096812,1.096812,0.248489,0.898807,...,-0.195467,0.290499,0.416839,0.416839,-0.057301,0.054701,-17.262677,-17.262677,-17.262677,-15.470627
595,-0.065183,-0.063973,-0.025369,1.054684,0.109456,0.109456,0.915976,0.915976,0.248489,0.809653,...,-0.195467,0.290499,0.416839,0.416839,-0.057301,0.054701,-17.262677,-17.262677,-17.262677,-15.470627
596,-0.065183,-0.063973,-0.025369,0.008456,0.109456,0.109456,0.011794,0.011794,0.122346,0.007275,...,-0.195467,-0.171518,-0.161782,-0.161782,14.537464,0.054701,0.057928,0.057928,0.057928,0.026347


In [21]:
def RMSE(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Args:
        y: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y``.

    Returns:
        Square root of ``mean_squared_error(y, y_pred)``.
    """
    return mean_squared_error(y, y_pred) ** 0.5

## Modeling & Ensemble

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold CV RMSE of an ``LGBMRegressor``.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean RMSE over the validation folds (the study minimizes this value).
    """
    params = {
        'objective': 'regression',  # regression task
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        # and matches the suggest_float calls used by the other objectives.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 25, 200),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        # NOTE(review): the LightGBM docs say bagging is only enabled when
        # subsample_freq > 0, which is not set here -- confirm that tuning
        # `subsample` has any effect.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
        'random_state': SEED
    }

    # Same seed as everywhere else, so every trial sees identical folds.
    kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X.values[train_index], X.values[test_index]
        y_train, y_valid = y.values[train_index], y.values[test_index]
        model = LGBMRegressor(verbose=-1, **params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        pred = model.predict(X_valid)
        output = RMSE(y_valid, pred)
        scores.append(output)

    return np.mean(scores)

# NOTE(review): `objective` never calls trial.report()/trial.should_prune(),
# so the pruner configured here has nothing to act on.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=SEED), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=1000)

In [9]:
def objective(trial):
    """Score one XGBoost hyperparameter draw by its mean 10-fold CV RMSE.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.

    Returns:
        Average validation RMSE across the folds.
    """
    # Continuous parameters are drawn first, then the integer ones; the order
    # of the suggest_* calls is part of the sampling sequence.
    float_ranges = (
        ('lambda', 1e-3, 0.1),
        ('alpha', 1e-3, 1.0),
        ('colsample_bytree', 0.4, 1),
        ('subsample', 0.4, 1),
        ('learning_rate', 0.0001, 0.1),
    )
    int_ranges = (
        ('n_estimators', 100, 1000),
        ('max_depth', 4, 8),
        ('min_child_weight', 2, 50),
    )
    sampled = {}
    for name, low, high in float_ranges:
        sampled[name] = trial.suggest_float(name, low, high)
    for name, low, high in int_ranges:
        sampled[name] = trial.suggest_int(name, low, high)

    splitter = KFold(n_splits=10, shuffle=True, random_state=25)
    fold_rmse = []
    for fit_idx, holdout_idx in splitter.split(X):
        fit_X, holdout_X = X.values[fit_idx], X.values[holdout_idx]
        fit_y, holdout_y = y.values[fit_idx], y.values[holdout_idx]

        regressor = xgboost.XGBRegressor(tree_method='gpu_hist', gpu_id=0, **sampled)
        regressor.fit(fit_X, fit_y, eval_set=[(holdout_X, holdout_y)], verbose=0)
        fold_rmse.append(RMSE(holdout_y, regressor.predict(holdout_X)))

    return np.mean(fold_rmse)

# Minimize the CV RMSE with a seeded TPE sampler.
study = optuna.create_study(
    direction='minimize',
    sampler=TPESampler(seed=SEED),
    pruner=SuccessiveHalvingPruner(),
)
study.optimize(objective, n_trials=1000)

[32m[I 2023-02-11 21:10:45,252][0m A new study created in memory with name: no-name-442687ef-4015-4d92-bc51-04035b2b9357[0m
[32m[I 2023-02-11 21:10:49,146][0m Trial 0 finished with value: 0.0070904886971407555 and parameters: {'lambda': 0.08714228952609399, 'alpha': 0.5826946517438872, 'colsample_bytree': 0.5673033644206414, 'subsample': 0.5115467392541075, 'learning_rate': 0.04116890277971881, 'n_estimators': 205, 'max_depth': 7, 'min_child_weight': 23}. Best is trial 0 with value: 0.0070904886971407555.[0m
[32m[I 2023-02-11 21:10:58,508][0m Trial 1 finished with value: 0.006801560866994902 and parameters: {'lambda': 0.05606670319202505, 'alpha': 0.3677132412977046, 'colsample_bytree': 0.6414194372885217, 'subsample': 0.4678244204166378, 'learning_rate': 0.04475838154212219, 'n_estimators': 627, 'max_depth': 4, 'min_child_weight': 27}. Best is trial 1 with value: 0.006801560866994902.[0m
[32m[I 2023-02-11 21:11:05,960][0m Trial 2 finished with value: 0.006983565565053732 an

In [21]:
def objective(trial):
    """Optuna objective: mean 10-fold CV RMSE of a ``GradientBoostingRegressor``.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean RMSE over the validation folds (the study minimizes this value).
    """
    gb_regressor_param = {
        # NOTE(review): scikit-learn documents `alpha` as used only with
        # loss='huber' or loss='quantile'; no loss is set here, so this draw
        # probably has no effect -- confirm.
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3,8),
    }

    kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X.values[train_index], X.values[test_index]
        y_train, y_valid = y.values[train_index], y.values[test_index]

        # subsample < 1 makes the fit stochastic; seed it explicitly so a
        # trial's score does not depend on the global NumPy RNG state.
        model = GradientBoostingRegressor(verbose=0, random_state=SEED, **gb_regressor_param)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        output = RMSE(y_valid, pred)
        scores.append(output)

    return np.mean(scores)

# NOTE(review): `objective` never calls trial.report()/trial.should_prune(),
# so the pruner configured here has nothing to act on.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=SEED), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=1000)

[32m[I 2023-02-11 23:35:12,630][0m A new study created in memory with name: no-name-5a61bc81-60f5-4644-96df-71ca76412e11[0m
[32m[I 2023-02-11 23:36:28,897][0m Trial 0 finished with value: 0.004489763647131324 and parameters: {'alpha': 0.8702540124905846, 'subsample': 0.7493661572035359, 'learning_rate': 0.0279560101760368, 'n_estimators': 267, 'max_depth': 5}. Best is trial 0 with value: 0.004489763647131324.[0m
[32m[I 2023-02-11 23:39:33,067][0m Trial 1 finished with value: 0.0043932902482001724 and parameters: {'alpha': 0.11825817158758083, 'subsample': 0.810981246624481, 'learning_rate': 0.04381734485999908, 'n_estimators': 601, 'max_depth': 5}. Best is trial 1 with value: 0.0043932902482001724.[0m
[32m[I 2023-02-11 23:40:42,253][0m Trial 2 finished with value: 0.0045117455833115225 and parameters: {'alpha': 0.40296336308538866, 'subsample': 0.4678244204166378, 'learning_rate': 0.04475838154212219, 'n_estimators': 627, 'max_depth': 3}. Best is trial 1 with value: 0.004393

In [22]:
from catboost import Pool,CatBoostClassifier, CatBoostRegressor
def objective(trial):
    """Optuna objective: mean 10-fold CV RMSE of a ``CatBoostRegressor``.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean RMSE over the validation folds (the study minimizes this value).
    """
    # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform
    # calls; ranges, names and sampling order are unchanged.
    param = {
        'iterations': trial.suggest_int("iterations", 100, 500),
        'od_wait': trial.suggest_int('od_wait', 10, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        # NOTE(review): the lower bound of 0 looks too permissive for a
        # sampling fraction -- confirm the valid range against the CatBoost docs.
        'subsample': trial.suggest_float('subsample', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        'random_state': SEED,
        'verbose': 0,
    }

    kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X.values[train_index], X.values[test_index]
        y_train, y_valid = y.values[train_index], y.values[test_index]
        # Build a fresh model for this fold.
        model = CatBoostRegressor(
            task_type="CPU",
            thread_count=16,
            **param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        pred = model.predict(X_valid)
        output = RMSE(y_valid, pred)
        scores.append(output)

    return np.mean(scores)

# NOTE(review): `objective` never calls trial.report()/trial.should_prune(),
# so the pruner configured here has nothing to act on.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=SEED), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=1000)

[32m[I 2023-02-13 00:28:50,141][0m A new study created in memory with name: no-name-da507c31-07ab-4b76-9a4e-6b000254a14b[0m
[32m[I 2023-02-13 00:30:29,338][0m Trial 0 finished with value: 0.004951309917780495 and parameters: {'iterations': 448, 'od_wait': 92, 'learning_rate': 0.2860505512940584, 'reg_lambda': 18.591131349905602, 'subsample': 0.4111001279251132, 'random_strength': 14.695021885388622, 'depth': 11, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 9, 'bagging_temperature': 0.2939823698260364, 'colsample_bylevel': 0.6414194372885217}. Best is trial 0 with value: 0.004951309917780495.[0m
[32m[I 2023-02-13 00:31:32,705][0m Trial 1 finished with value: 0.005035330862261673 and parameters: {'iterations': 145, 'od_wait': 73, 'learning_rate': 0.5895906653459253, 'reg_lambda': 16.198518764642785, 'subsample': 0.5207187880526836, 'random_strength': 23.04204504551302, 'depth': 11, 'min_data_in_leaf': 11, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.8421157

KeyboardInterrupt: 

In [5]:
# Hyperparameters hard-coded for the final ensemble, one mapping per model.
lgbm_regressor_param = dict(
    max_depth=3,
    learning_rate=0.15878472521108428,
    n_estimators=194,
    min_child_samples=7,
    subsample=0.5455804149064393,
)
# 'lambda' is a Python keyword, so this mapping has to stay a dict literal.
xgb_regressor_param = {
    'lambda': 0.04474041412782344,
    'alpha': 0.0014239665651524195,
    'colsample_bytree': 0.5004904955911537,
    'subsample': 0.5138883122217653,
    'learning_rate': 0.03323051964794906,
    'n_estimators': 784,
    'max_depth': 5,
    'min_child_weight': 2,
}
gb_regressor_param = dict(
    alpha=0.10975438125543445,
    subsample=0.6191653692823086,
    learning_rate=0.015528406328128127,
    n_estimators=873,
    max_depth=7,
)
# No overrides for CatBoost: the mapping is intentionally empty here.
cat_regressor_param = dict()

In [6]:
accuracy_history = []
kf = KFold(n_splits=10, shuffle = True, random_state = SEED)
scores = []
models = []
# Fit one equally weighted voting ensemble per fold; each is trained on that
# fold's training indices only and stored in `models` in fold order.
for train_index, valid_index in kf.split(X):
    # Only the training slice is needed here. The fold slices deliberately do
    # NOT reuse the names X_test / y_test, which would overwrite the real
    # test feature matrix.
    X_train = X.values[train_index]
    y_train = y.values[train_index]
    estimators = [
        ('lgbm', LGBMRegressor(device="gpu", verbose=-1, random_state=SEED, **lgbm_regressor_param)),
        ('xgb', xgboost.XGBRegressor(tree_method='gpu_hist', gpu_id=0, **xgb_regressor_param)),
        # Seeded like the other estimators: subsample < 1 makes this fit stochastic.
        ('gb', GradientBoostingRegressor(verbose=0, random_state=SEED, **gb_regressor_param)),
        ('cat', CatBoostRegressor(verbose=0, random_state=SEED, task_type="CPU", thread_count=16, **cat_regressor_param)),
    ]

    model = VotingRegressor(estimators, weights=[1,1,1,1])
    model.fit(X_train, y_train)
    models.append(model)

In [8]:
# Read dataset/train.csv and dataset/test.csv into tmp_train / tmp_test.
tmp_train, tmp_test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', "dataset/test.csv")
)


In [9]:
# Overwrite tmp_train's Y_Quality, fold by fold, with the prediction of the
# model stored at the same position in `models`. This relies on `kf`
# (seeded, shuffled) yielding the same folds on every call to split().
assert len(models) >= kf.get_n_splits(), "need one fitted model per fold"
for model, (train_index, valid_index) in zip(models, kf.split(X)):
    X_valid = X.values[valid_index]
    pred = model.predict(X_valid)
    tmp_train.loc[valid_index, ["Y_Quality"]] = pred


In [15]:
# Predict Y_Quality for the test table and store it as a new column.
# NOTE(review): `model` is whatever ensemble was bound last (a single fold's
# model), not an average over `models` -- confirm that this is intended.
pred = model.predict(X_test)
# Guard against X_test having been rebound to something other than the test
# feature matrix, instead of hard-coding the expected row count.
assert len(pred) == len(tmp_test), (
    f"got {len(pred)} predictions for {len(tmp_test)} test rows; "
    "X_test must be the test feature matrix"
)
tmp_test["Y_Quality"] = pred

In [19]:
# Write both augmented tables to disk without the index column.
for out_frame, out_path in (
    (tmp_train, "dataset/train_add.csv"),
    (tmp_test, "dataset/test_add.csv"),
):
    out_frame.to_csv(out_path, index=False)

In [16]:
# Correlate every numeric column with Y_Class directly rather than building
# the full pairwise correlation matrix just to read one column of it.
# Restricting to numeric columns keeps this working on pandas versions where
# corr() no longer skips non-numeric columns silently.
corr = (
    tmp_train
    .select_dtypes(include='number')
    .corrwith(tmp_train['Y_Class'])
    .rename('Y_Class')
)
# Keep only the columns whose correlation with Y_Class exceeds 0.6.
idx = corr > 0.6
corr[idx]

Y_Class      1.000000
Y_Quality    0.665576
X_130        0.664402
X_131        0.664402
X_1407       0.841727
X_1523       0.866025
X_1524       0.866025
X_1525       0.866025
X_1969       0.683307
X_1971       0.712272
X_1975       0.694126
X_1977       0.710391
X_1981       0.697230
X_1983       0.724330
X_1987       0.697060
X_1989       0.719800
X_1993       0.683307
X_1995       0.712272
X_1999       0.694126
X_2001       0.710391
X_2005       0.683307
X_2007       0.712272
X_2011       0.694126
X_2013       0.710391
X_2029       0.697226
X_2031       0.724372
X_2035       0.697060
X_2037       0.720189
Name: Y_Class, dtype: float64

Y_Class      1.000000
Y_Quality    0.667246
X_130        0.664402
X_131        0.664402
X_1407       0.841727
X_1523       0.866025
X_1524       0.866025
X_1525       0.866025
X_1969       0.683307
X_1971       0.712272
X_1975       0.694126
X_1977       0.710391
X_1981       0.697230
X_1983       0.724330
X_1987       0.697060
X_1989       0.719800
X_1993       0.683307
X_1995       0.712272
X_1999       0.694126
X_2001       0.710391
X_2005       0.683307
X_2007       0.712272
X_2011       0.694126
X_2013       0.710391
X_2029       0.697226
X_2031       0.724372
X_2035       0.697060
X_2037       0.720189
Name: Y_Class, dtype: float64