In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

import multiprocessing
import matplotlib.pyplot as plt
import optuna.integration.lightgbm as lgb
import optuna

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Matplotlib text configuration: disable the Unicode minus glyph on axes and
# select the 'Malgun Gothic' font family (presumably so the Korean column
# names render in figures -- confirm on the target machine).
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'Malgun Gothic',
})

In [3]:
# Read the raw table from the Excel workbook; later cells filter it per product.
data = pd.read_excel(io='./data/data.xlsx')

In [4]:
def preprocess(data, product_num, product_size = False):
    """Build an 80/20 train/test split for one product from the raw table.

    Rows are restricted to ``제품종류 == 'Product_<product_num>'`` (product
    type) and, when ``product_size`` is truthy, to ``제품Size == product_size``.
    The target is the ``선재사상압연모터전류`` column. The features are the
    remaining columns after the identifier, target, timestamp and product-type
    columns are dropped, plus a derived ``압연시간`` column: the elapsed time
    between ``압연시작일시`` and ``압연완료일시`` in seconds.

    The caller's frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing every column named above plus ``CoilNO``. The two
        timestamp columns must be datetime-like so that their difference is a
        timedelta.
    product_num : int or str
        Suffix of the ``Product_<n>`` label to keep.
    product_size : scalar, optional
        When truthy, keep only rows with this ``제품Size`` and drop that (now
        constant) column from the features. The default ``False`` disables the
        filter and keeps ``제품Size`` as a feature.

    Returns
    -------
    train_x, test_x : pandas.DataFrame
        Feature frames carrying the feature column names.
    train_y, test_y : numpy.ndarray
        Matching target values.

    Raises
    ------
    ValueError
        If no row matches the requested product / size.
    """
    selected = data[data['제품종류'] == 'Product_' + str(product_num)]

    if product_size:
        selected = selected[selected['제품Size'] == product_size]

    if selected.empty:
        raise ValueError(
            'no rows for Product_' + str(product_num)
            + (' with size ' + str(product_size) if product_size else '')
        )

    # Work on an explicit copy: adding a column to a filtered slice is chained
    # assignment, which pandas only tolerates with a SettingWithCopyWarning.
    selected = selected.copy()

    y = selected['선재사상압연모터전류'].values

    # total_seconds() is the full duration. The .dt.seconds accessor is only
    # the seconds *component* (0..86399), so it drops whole days and misreports
    # negative durations.
    selected['압연시간'] = (selected['압연완료일시'] - selected['압연시작일시']).dt.total_seconds()

    dropped_columns = ['CoilNO', '선재사상압연모터전류', '압연완료일시', '압연시작일시', '제품종류']
    if product_size:
        dropped_columns.append('제품Size')
    features = selected.drop(columns=dropped_columns)

    train_x, test_x, train_y, test_y = train_test_split(
        features.values, y, test_size=0.2, random_state=42
    )

    # train_test_split received plain arrays, so restore the column labels.
    train_x = pd.DataFrame(train_x, columns=features.columns)
    test_x = pd.DataFrame(test_x, columns=features.columns)

    return train_x, test_x, train_y, test_y

In [5]:
# Train/test splits for the three subsets used in this notebook:
# all Product_17 rows, Product_17 rows with 제품Size == 5.5, and all Product_9 rows.
train_x_17, test_x_17, train_y_17, test_y_17 = preprocess(data, product_num=17)
train_x_17_55, test_x_17_55, train_y_17_55, test_y_17_55 = preprocess(
    data, product_num=17, product_size=5.5
)
train_x_9, test_x_9, train_y_9, test_y_9 = preprocess(data, product_num=9)

In [6]:
# Wrap every train/test split in a LightGBM Dataset.
categorical_features = []  # add '제품Size' here if it should be treated as categorical
training_rounds = 10000

train_ds_17, test_ds_17 = (
    lgb.Dataset(frame, label=target, categorical_feature=categorical_features)
    for frame, target in ((train_x_17, train_y_17), (test_x_17, test_y_17))
)

train_ds_17_55, test_ds_17_55 = (
    lgb.Dataset(frame, label=target, categorical_feature=categorical_features)
    for frame, target in ((train_x_17_55, train_y_17_55), (test_x_17_55, test_y_17_55))
)

train_ds_9, test_ds_9 = (
    lgb.Dataset(frame, label=target, categorical_feature=categorical_features)
    for frame, target in ((train_x_9, train_y_9), (test_x_9, test_y_9))
)

In [7]:
# Plain 5-fold splitter (no shuffling requested); reused by later CV runs.
kf = KFold(n_splits=5)

# Fixed LightGBM settings; the tuner searches the remaining hyperparameters.
params = dict(
    objective='regression',
    metric='mse',
    verbosity=-1,
    boosting_type="gbdt",
    seed=42,
)

study_tuner = optuna.create_study(direction='minimize')

# Only show warnings and above from here on; optuna's per-trial info logging
# is useful but takes up a lot of space.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Stepwise LightGBMTunerCV search on the Product_17 training data.
tuner = lgb.LightGBMTunerCV(
    params,
    train_ds_17,
    num_boost_round=10000,
    folds=kf,
    early_stopping_rounds=250,
    verbose_eval=False,
    seed=42,
    time_budget=19800,  # seconds, i.e. 5.5 hours; an upper bound only
    study=study_tuner,
    categorical_feature=categorical_features,
    # Per-round learning rate: 0.005 for the first 200 rounds, 0.001 for the
    # remaining 9800 (200 + 9800 matches num_boost_round).
    callbacks=[lgb.reset_parameter(learning_rate=[0.005] * 200 + [0.001] * 9800)],
)

tuner.run()

[32m[I 2021-09-05 12:50:46,336][0m A new study created in memory with name: no-name-c349d69c-a38a-4782-8804-69da240ee0e6[0m
feature_fraction, val_score: 144.812422: 100%|###########################################| 7/7 [06:49<00:00, 58.48s/it]
num_leaves, val_score: 138.359880: 100%|##############################################| 20/20 [36:22<00:00, 109.12s/it]
bagging, val_score: 138.359880: 100%|#################################################| 10/10 [18:12<00:00, 109.30s/it]
feature_fraction_stage2, val_score: 138.311914: 100%|###################################| 6/6 [12:05<00:00, 120.98s/it]
regularization_factors, val_score: 138.246293: 100%|##################################| 20/20 [44:18<00:00, 132.90s/it]
min_data_in_leaf, val_score: 138.246293: 100%|##########################################| 5/5 [11:35<00:00, 139.18s/it]


In [8]:
# Best parameter set, then the tuner's best cross-validated score. The metric
# is regression MSE / l2, so the score is a squared error, not a
# classification error.
print(tuner.best_params, tuner.best_score, sep='\n')

{'objective': 'regression', 'metric': 'l2', 'verbosity': -1, 'boosting_type': 'gbdt', 'seed': 42, 'feature_pre_filter': False, 'lambda_l1': 2.8696095424127307e-05, 'lambda_l2': 1.0443431377721304e-06, 'num_leaves': 126, 'feature_fraction': 0.552, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20}
138.24629319224178


In [9]:
# Starting point for the follow-up search: the tuner's best parameters, with
# the two sampling fractions nudged by 1e-9 away from exactly 0 or 1
# (presumably so the enqueued values sit strictly inside the later search
# ranges -- confirm against the optuna version in use).
tmp_best_params = tuner.best_params
tmp_best_params.update({
    name: (
        1.0 - 1e-9 if tmp_best_params[name] == 1
        else 1e-9 if tmp_best_params[name] == 0
        else tmp_best_params[name]
    )
    for name in ('feature_fraction', 'bagging_fraction')
})

In [None]:
# Hard-coded copy of the tuner result printed earlier in the notebook, so the
# follow-up search can start without re-running the tuner.
tmp_best_params = {
    'objective': 'regression',
    'metric': 'l2',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42,
    'feature_pre_filter': False,
    'lambda_l1': 2.8696095424127307e-05,
    'lambda_l2': 1.0443431377721304e-06,
    'num_leaves': 126,
    'feature_fraction': 0.552,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 20,
}

# Nudge the two sampling fractions by 1e-9 away from exactly 0 or 1.
tmp_best_params.update({
    name: (
        1.0 - 1e-9 if tmp_best_params[name] == 1
        else 1e-9 if tmp_best_params[name] == 0
        else tmp_best_params[name]
    )
    for name in ('feature_fraction', 'bagging_fraction')
})

In [None]:
# NOTE: from this cell on `lgb` is the plain lightgbm package, replacing the
# `optuna.integration.lightgbm` alias bound in the import cell. Re-running the
# LightGBMTunerCV cell after this one requires re-running the import cell first.
import lightgbm as lgb

# Trackers for the follow-up search: the best (lowest) score seen so far and
# the number of boosting rounds to use for the final fit.
# Start from +inf rather than an arbitrary constant so that the first score
# always counts as an improvement, whatever the scale of the metric.
best_score = float('inf')
training_rounds = 10000

# Declare how we evaluate how good a set of hyperparameters are, i.e.
# declare an objective function.
def objective(trial):
    """Optuna objective: cross-validated LightGBM score for one sampled configuration.

    Samples regularisation, tree-size and sub-sampling hyperparameters from
    ``trial``, runs ``lgb.cv`` on ``train_ds_17`` with the shared ``kf`` folds,
    and scores the configuration as ``l2-mean + l2-stdv`` at the last recorded
    boosting round (lower is better).

    Side effect: when the score improves on the module-level ``best_score``,
    both ``best_score`` and ``training_rounds`` (the number of rounds recorded
    by that CV run) are updated so the final fit can reuse the round count.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean plus standard deviation of the fold l2 scores.
    """
    # Without this declaration the assignments below would create function
    # locals and the module-level trackers would never change.
    global best_score, training_rounds

    # Specify a search space using distributions across plausible values of hyperparameters.
    param = {
        "objective": 'regression',
        "metric": 'mse',
        "verbosity": -1,
        "boosting_type": "gbdt",
        # A single seed entry, equal to the one used by the tuner and final-fit
        # parameter dicts. A dict literal silently keeps only the last value
        # of a repeated key.
        "seed": 42,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
        # `min_child_samples` is an alias of LightGBM's `min_data_in_leaf`.
        # Do not also pin `min_data_in_leaf` here: when both are given the
        # primary name wins and the sampled value would be ignored.
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'num_threads': multiprocessing.cpu_count(),
    }

    # Run LightGBM cross-validation for the sampled hyperparameter values.
    lgbcv = lgb.cv(param,
                   train_ds_17,
                   categorical_feature=categorical_features,
                   folds=kf,
                   verbose_eval=False,
                   early_stopping_rounds=250,
                   num_boost_round=10000,
                   # Per-round learning rate: 0.005 for 200 rounds, then 0.001.
                   callbacks=[lgb.reset_parameter(learning_rate = [0.005]*200 + [0.001]*9800) ]
                  )

    # Penalise unstable configurations by adding the fold standard deviation.
    cv_score = lgbcv['l2-mean'][-1] + lgbcv['l2-stdv'][-1]
    if cv_score < best_score:
        best_score = cv_score
        training_rounds = len(lgbcv['l2-mean'])

    # Return metric of interest
    return cv_score

# Only show warnings and above; optuna's per-trial info logging is useful but
# takes up a lot of space.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# New minimisation study. The parameters in `tmp_best_params` are queued so the
# first trial evaluates them; after that optuna samples on its own until the
# 12-hour timeout (43200 s) is reached. Alternatives: pass e.g. n_trials=1000
# to cap the number of trials instead, or give neither timeout nor n_trials to
# keep searching until the run is interrupted by the user.
study = optuna.create_study(direction='minimize')
study.enqueue_trial(tmp_best_params)
study.optimize(objective, timeout=12 * 60 * 60)

In [None]:
# Optimization history of the study: objective value across trials.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Slice plot of the study: objective value against each sampled hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
# Hyperparameter importance plot for the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Sampled hyperparameters of the study's best trial.
print(study.best_params)

In [None]:
# Best (lowest) objective value of the study, i.e. the smallest
# `l2-mean + l2-stdv` returned by `objective`. This is a regression score,
# not a classification error.
print(study.best_value)

In [None]:
# Final training configuration: the fixed settings first, then the values of
# the study's best trial layered on top (tuned values win on any key clash).
best_params = {
    **dict(
        objective='regression',
        metric='mse',
        verbosity=-1,
        boosting_type="gbdt",
        seed=42,
    ),
    **study.best_params,
}
best_params

In [None]:
# Fit the final model on the Product_17 training data with the assembled
# parameters, for `training_rounds` boosting rounds.
lgbfit = lgb.train(
    params=best_params,
    train_set=train_ds_17,
    num_boost_round=training_rounds,
    categorical_feature=categorical_features,
    verbose_eval=False,
)

In [None]:
# Predictions of the fitted model on the Product_17 train and test features.
predict_train, predict_test = map(lgbfit.predict, (train_x_17, test_x_17))

In [None]:
# Hold-out metrics on the Product_17 test split.
mse = mean_squared_error(y_true=test_y_17, y_pred=predict_test)
r2 = r2_score(y_true=test_y_17, y_pred=predict_test)

In [None]:
# Report the hold-out metrics for the Product_17 test split.
print('Mean squared error: ', mse)
print('R2 score: ', r2)

In [None]:
# Place the true test targets and the test predictions side by side, one
# column each. Both frames are built from plain arrays here, so the rows pair
# up by position.
final_result = pd.concat(
    [pd.DataFrame(values) for values in (test_y_17, predict_test)],
    axis='columns',
)

In [None]:
# Name the two columns: first the true value, then the model prediction.
final_result.columns = ['label','predict']

In [None]:
# Scatter of predicted against true values with a fitted regression line.
sns.regplot(data=final_result, x='label', y='predict')