In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the fonts-nanum package and rebuild the fontconfig cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Delete matplotlib's cache directory (presumably so the new fonts are picked up — TODO confirm).
!rm ~/.cache/matplotlib -rf

# Install

In [None]:
# Install third-party packages into the runtime.
# NOTE(review): versions are not pinned, so a later run may resolve different releases.
!pip install pycaret
!pip install openpyxl
!pip install lightgbm
!pip install xgboost
!pip install catboost
!pip install shap

# Package

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
import copy
import shap
from pycaret.regression import *
import json
import joblib
import warnings
warnings.simplefilter('ignore')

In [None]:
# Run date rendered as a compact YYYYMMDD string.
today = format(datetime.today(), "%Y%m%d")
# Seed handed to pycaret's setup() as session_id.
random_seed = 42

# Function

In [None]:
def timeline(data:pd.DataFrame, start_tuple:tuple, last_tuple:tuple, time:bool)->pd.DataFrame :
    """Build an evenly spaced timestamp column between two cells of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that holds the first and the last timestamp.
    start_tuple, last_tuple : tuple
        ``(row, column)`` integer positions of the first and last timestamp.
        Negative positions count from the end.
    time : bool
        Truthy -> 1-minute step, falsy -> 2-hour step.

    Returns
    -------
    pd.DataFrame
        One column named '날짜' containing every step from the start value to
        the last value (inclusive).
    """
    # Read the two cells positionally instead of materialising the whole frame twice.
    start = data.iat[start_tuple[0], start_tuple[1]]
    last = data.iat[last_tuple[0], last_tuple[1]]

    try :
        start_date = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
        last_date = datetime.strptime(last, "%Y-%m-%d %H:%M:%S")
    except (TypeError, ValueError) :
        # The cells are not "%Y-%m-%d %H:%M:%S" strings (e.g. they already are
        # datetime-like objects): fall back to the raw values for both ends.
        start_date = start
        last_date = last

    step = timedelta(minutes=1) if time else timedelta(hours=2)

    stamps = []
    current = start_date
    while current <= last_date :
        stamps.append(current)
        current = current + step

    return pd.DataFrame(stamps, columns=['날짜'])

In [None]:
def data_build(root:str, start_tuple:tuple, last_tuple:tuple, column_number:int, time:bool)->dict :
    """Load every sheet of a process workbook and attach a generated timeline.

    Parameters
    ----------
    root : str
        Path of the .xlsx workbook.
    start_tuple, last_tuple : tuple
        ``(row, column)`` positions of the first/last timestamp inside each raw
        sheet; forwarded to ``timeline``.
    column_number : int
        Row position (in the frame returned by ``read_excel``) holding the
        column names. That row and every row above it are removed.
    time : bool
        Forwarded to ``timeline`` (truthy -> 1-minute step, falsy -> 2-hour step).

    Returns
    -------
    dict
        ``{sheet position: DataFrame}``. Each frame starts with the generated
        '날짜' column followed by the sheet's columns (first column removed).
        Cell values are left as strings.

    Raises
    ------
    Any error raised by ``pd.read_excel`` (missing file, unreadable workbook)
    now propagates instead of being swallowed and yielding an empty or
    truncated dict.
    """
    # sheet_name=None parses the workbook once and returns all sheets in workbook order,
    # replacing the "read sheet N until it fails" loop.
    sheets = pd.read_excel(root, engine='openpyxl', dtype=str, sheet_name=None)

    data_dict = {}
    for cnt, data in enumerate(sheets.values()) :
        fe_data = data.copy()
        # Promote the header row to column names, then drop it together with the rows above it.
        fe_data.columns = fe_data.values[column_number].tolist()
        fe_data = fe_data.drop(list(range(column_number+1)), axis=0)
        # Drop the first column by label.
        fe_data = fe_data.drop(fe_data.columns[0], axis=1)
        fe_data = fe_data.reset_index(drop=True)

        # Put the generated timestamps side by side with the measurements (aligned by row position)
        # and keep only rows that received a timestamp.
        fe_result = pd.concat([timeline(data, start_tuple, last_tuple, time=time), fe_data], axis=1)
        fe_result = fe_result.dropna(subset=['날짜'], axis=0)

        data_dict[cnt] = fe_result

    return data_dict

In [None]:
def data_build_2(root:str, column_number:int)->dict :
    """Load every sheet of the material-property workbook.

    Parameters
    ----------
    root : str
        Path of the .xlsx workbook.
    column_number : int
        Row position (in the frame returned by ``read_excel``) holding the
        column names.

    Returns
    -------
    dict
        ``{sheet position: DataFrame}``. In each frame the first column is
        renamed to '날짜', rows labelled 0 and 1 are removed (independently of
        ``column_number``), the last three columns are dropped and every column
        from the third one onward is converted to float64.

    Raises
    ------
    Any error raised by ``pd.read_excel`` (missing file, unreadable workbook)
    now propagates instead of being swallowed and yielding an empty or
    truncated dict.
    """
    # sheet_name=None parses the workbook once and returns all sheets in workbook order,
    # replacing the "read sheet N until it fails" loop.
    sheets = pd.read_excel(root, engine='openpyxl', dtype=str, sheet_name=None)

    data_dict = {}
    for cnt, data in enumerate(sheets.values()) :
        fe_data = data.copy()
        fe_data.columns = fe_data.values[column_number].tolist()
        fe_data = fe_data.drop([0, 1], axis=0).reset_index(drop=True)
        fe_data.columns = ['날짜'] + fe_data.columns[1:].tolist()

        # Drop the trailing three columns, then make everything after the second column numeric.
        fe_data = fe_data.drop(fe_data.columns[-3:], axis=1)
        value_cols = fe_data.columns[2:]
        fe_data[value_cols] = fe_data[value_cols].astype('float64')

        data_dict[cnt] = fe_data

    return data_dict

# Dataset

In [None]:
# Project folder on the mounted Drive; the 'input' and 'output' sub-folders used below live under it.
data_path = '/content/drive/My Drive/Colab Notebooks/롯데케미칼'

In [None]:
# The three process workbooks share the same layout, so they are read with identical settings.
process_layout = dict(start_tuple=(0, 1), last_tuple=(0, 2), column_number=2, time=True)

raw_1 = data_build(root=f'{data_path}/input/공정(23.02~23.04).xlsx', **process_layout)
raw_2 = data_build(root=f'{data_path}/input/공정(22.05~22.10).xlsx', **process_layout)
raw_3 = data_build(root=f'{data_path}/input/공정(21.04~22.03).xlsx', **process_layout)

# Material-property workbook: column names are taken from row position 0.
mat = data_build_2(
    root=f'{data_path}/input/230731_물성데이터_전처리.xlsx',
    column_number=0,
)

## Choose data

In [None]:
# Selected process sheets, re-keyed 0..6 in the order listed.
raw_data = dict(enumerate([
    raw_1[0], raw_1[1],
    raw_2[0], raw_2[1], raw_2[2],
    raw_3[0], raw_3[1],
]))

In [None]:
# Parse the timestamps of every material sheet, cut each sheet at the start of its last day
# and discard rows whose 'RPS_YN' equals 0.
first_date = []

for item in list(mat) :
    sheet = mat[item]
    sheet['날짜'] = pd.to_datetime(sheet['날짜'])
    stamps = sheet['날짜']

    first_date.append(stamps[0])
    last_date = stamps[len(stamps)-1]

    # Only the hour component is subtracted; minutes/seconds of last_date are kept as they are.
    shutdown = last_date - timedelta(hours=last_date.hour)
    # .index[0] raises IndexError when no row carries exactly that timestamp.
    shutdown_idx = stamps[stamps == shutdown].index[0]

    # Remove the shutdown row and everything after it (in place), then filter on 'RPS_YN'.
    sheet.drop(sheet[shutdown_idx:].index.tolist(), axis=0, inplace=True)
    mat[item] = sheet[sheet['RPS_YN']!=0]

# Timeline Setting

In [None]:
# For every material sheet: build a 2-hour grid, left-join the material rows and the matching
# process sheet onto it, and add a running 'product_time' counter.
main = {}
for item, sheet in mat.items() :
    grid = timeline(sheet, start_tuple = (0, 0), last_tuple = (-1, 0), time=False)
    merged = pd.merge(grid, sheet, on=['날짜'], how='left')
    merged = pd.merge(merged, raw_data[item], on=['날짜'], how='left')

    # Offset between the first grid timestamp and the first process timestamp.
    # NOTE(review): days are weighted by 12 while hours are added as-is and rows advance by 2;
    # confirm the intended unit before changing, since 'product_time' is used as a model feature.
    offset = merged['날짜'][0] - raw_data[item]['날짜'][0]
    start = offset.components[0]*12 + offset.components[1]

    merged['product_time'] = [start + 2 * i for i in range(len(merged))]
    main[item] = merged

# Imputation

In [None]:
# Korean year/month labels (e.g. "23년 4월" = April 2023). Not referenced in the visible part of the
# notebook; presumably one label per dataset in `main` — TODO confirm the ordering.
col_list = ["23년 4월", "23년 2월", "22년 10월", "22년 8월", "22년 5월", "22년 3월", "22년 1월"]

In [None]:
# Columns kept for modeling. The order matters: the lagging step creates '<name>_before_<k>' columns
# for every entry except the last one ('product_time'), and 'RPS' is the source of the targets.
fe_list = ['TDDM FEED', '산방제 FEED', '개시제 FEED', 'RUBBER SOLUTION FEED',
       'PL FEED RATE', 'R-1 중량', 'R-1 압력', 'R-1 온도', 'R-1 REFLUX',
       'R-1 AGITATOR RPM', 'R-2 중량', 'R-2 압력', 'R-2 온도', 'R-2 REFLUX',
       'R-2 AGITATOR RPM', 'DV-1 온도', 'ZAPPER 온도',
       'DV-1 JACKET 온도', 'DV-1 진공도', 'RECYCLE 후단 압력', 'DV-2 PREHEATER 온도',
       'DV-2 진공도', 'DV-2 JACKET 온도', 'RPS', 'product_time']

In [None]:
# Everything except the first column ('날짜') becomes float64.
for item, frame in main.items() :
    value_cols = frame.columns[1:]
    frame[value_cols] = frame[value_cols].astype('float64')

In [None]:
# Work on a deep copy so `main` keeps the un-imputed values.
inp_data = copy.deepcopy(main)

# Fill missing values of every non-timestamp column with their interpolated counterparts.
for item in main.keys() :
    frame = inp_data[item]
    value_cols = frame.columns[1:]
    values = frame[value_cols]
    frame[value_cols] = values.fillna(values.interpolate())

# Lagging

In [None]:
# Columns removed from the model inputs: the timestamp, the current 'RPS' value and the six
# future 'RPS_after_*' target columns.
drop_list = ['날짜', 'RPS', 'RPS_after_2', 'RPS_after_4', 'RPS_after_6', 'RPS_after_8', 'RPS_after_10', 'RPS_after_12']

In [None]:
# One frame per dataset, reduced to the timestamp plus the modeling columns.
en_data = [frame[['날짜']+fe_list] for frame in inp_data.values()]

In [None]:
# Add lagged inputs ('<col>_before_2' .. '<col>_before_12') and future targets
# ('RPS_after_2' .. 'RPS_after_12'); one shift step corresponds to the suffix divided by 2.
lagged_columns = fe_list[:-1]   # every modeling column except the last one ('product_time')

for sheet, frame in enumerate(en_data) :
    # frame.columns is read once here, so columns added inside the loop are not revisited.
    for column in frame.columns :
        if column not in lagged_columns :
            continue
        for i in range(1, 7) :
            frame[f'{column}_before_{i*2}'] = frame[column].shift(i)

    for i in range(1, 7) :
        frame[f'RPS_after_{i*2}'] = frame['RPS'].shift(-i)

    # Drop rows lacking the deepest lag or the furthest target.
    frame.dropna(subset=['TDDM FEED_before_12'], axis=0, inplace=True)
    frame.dropna(subset=['RPS_after_12'], axis=0, inplace=True)
    en_data[sheet] = frame.reset_index(drop=True)

# Modeling

In [None]:
# Stack the datasets used for training in the order 6, 1, 2, 3, 4, 5.
# NOTE(review): en_data[0] is not part of the stack — confirm this is intentional.
# `main` is rebound here from the per-dataset dict to a single DataFrame.
main = pd.concat([en_data[i] for i in (6, 1, 2, 3, 4, 5)], axis=0).reset_index(drop=True)
train = main.drop(columns=drop_list)

In [None]:
def _importance_label(column):
    """Return the ``shap_info`` row label that a model input column belongs to.

    'product_time' and the 'RPS_before_*' lags keep their own name; every other
    lagged column ('<feature>_before_<k>') is reported under '<feature>'.
    """
    if column == 'product_time' or column.startswith('RPS_before') :
        return column
    return column.rsplit('_before_', 1)[0]


metrics_list = []
cnt = 0
# Row labels of the importance table: the first 24 columns of `train` plus its last 6 columns
# (with the columns built above: the un-lagged features, 'product_time' and the 'RPS_before_*' lags).
idx_list = train.columns[0:24].tolist() + train.columns[-6:].tolist()
shap_info = pd.DataFrame(index=idx_list)

for target in ['RPS_after_2', 'RPS_after_4', 'RPS_after_6', 'RPS_after_8', 'RPS_after_10', 'RPS_after_12'] :
    # The first three targets use the '_12' lag columns, the remaining ones the '_6' lag columns;
    # every 'RPS_before_*' lag is always included.
    lag_tag = '_12' if cnt <= 2 else '_6'
    cols = train.columns[train.columns.str.contains(lag_tag) | train.columns.str.contains('RPS_before')]

    temp = train[cols].copy()
    temp['product_time'] = train['product_time']

    # NOTE(review): the scaler is fitted on all rows before pycaret splits off its hold-out set,
    # so hold-out statistics leak into the scaling. Left unchanged because the saved scaler is an
    # output artifact of this notebook.
    scaler = StandardScaler()
    temp_scaled = scaler.fit_transform(temp)
    joblib.dump(scaler, f'{data_path}/output/{target}_scaler.pkl')
    temp = pd.DataFrame(temp_scaled, index = temp.index, columns=temp.columns)

    temp[target] = main[target]
    params = setup(data=temp, target=target, train_size=0.8, fold=5, session_id=random_seed, preprocess=True,
                   numeric_features=temp.drop([target], axis=1).columns.tolist(), html=False, system_log=False, verbose=False)

    top1 = compare_models(n_select=1, sort='r2', verbose=False, exclude=['knn', 'huber'])
    end_model = tune_model(top1, optimize='r2', verbose=False)
    save_model(end_model, f'{data_path}/output/{target}_model', verbose=False)

    # predict_model() must run before pull(): pull() returns the metrics table it just produced.
    predict = predict_model(end_model, verbose=False)
    metrics = pull()
    first_row = metrics.head(1)
    metrics_list.append([target] + [first_row[name].tolist()[0] for name in ('MAE', 'RMSE', 'MAPE', 'R2')])

    # NOTE(review): TreeExplainer presumably requires a tree-based model; compare_models() is not
    # restricted to tree models here — confirm.
    train_pipe = temp.drop([target], axis=1)
    explainer = shap.TreeExplainer(end_model)
    shap_values = explainer.shap_values(train_pipe)
    if isinstance(shap_values, list) and len(shap_values) == 2 :
        # Two-element list output: use the second element.
        importances = np.absolute(shap_values[1]).sum(axis=0) / shap_values[1].shape[0]
    else :
        importances = np.absolute(shap_values).sum(axis=0) / shap_values.shape[0]

    # Label the importances by the columns actually fed to the explainer. Assigning idx_list
    # positionally mislabels entries: the model inputs are ordered
    # [lagged features, RPS_before_*, product_time] while idx_list is
    # [features, product_time, RPS_before_*].
    feature_importance = pd.Series(importances / np.sum(importances),
                                   index=[_importance_label(column) for column in train_pipe.columns])
    unknown = set(feature_importance.index) - set(idx_list)
    if unknown :
        raise ValueError(f'SHAP labels missing from shap_info index: {sorted(unknown)}')

    # Column assignment aligns on the index labels, so each value lands on its own row.
    shap_info[target] = feature_importance
    cnt += 1

# Export

In [None]:
# One row per target with its hold-out metrics.
result = pd.DataFrame(metrics_list, columns=['LABEL', 'MAE', 'RMSE', 'MAPE', 'R2'])

# Average only the numeric metric columns: 'LABEL' holds strings, and DataFrame.mean() over a
# string column raises TypeError on pandas >= 2.0. The mean is also computed once instead of four times.
metric_names = ['MAE', 'RMSE', 'MAPE', 'R2']
metric_means = result[metric_names].mean()
metrics_dict = {name: metric_means[name] for name in metric_names}

In [None]:
# Average the per-target importances and rank the features from most to least important.
shap_info['mean'] = shap_info.mean(axis=1)
fe_dict = shap_info['mean'].sort_values(ascending=False).to_dict()
# Re-key 'product_time' under its Korean display name; pop + insert moves the entry to the end of the dict.
fe_dict['누적 가동시간'] = fe_dict.pop('product_time')

In [None]:
# Payload written to the JSON report.
main_dict = {
    'feature importance': fe_dict,
    'metrics': metrics_dict,
}

In [None]:
# Serialize the report as UTF-8 JSON, keeping non-ASCII keys readable.
file_path = f'{data_path}/output/model_data_rps.json'
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(main_dict, file, ensure_ascii=False, indent=2)