## 기본설정

### 한글설정

In [None]:
!apt-get install fonts-nanum* > /dev/null 2>&1
!fc-cache -fv > /dev/null 2>&1

In [None]:
import os
import matplotlib as mpl

file_dir = os.path.split(mpl.__file__)[0]
font_dir = os.path.join(file_dir, 'mpl-data/fonts/ttf')
!cp /usr/share/fonts/truetype/nanum/Nanum* {font_dir}
!rm -rf ~/.cache/matplotlib/*

In [None]:
!pip install statsmodels --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive (used at the end to store the submission file).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


> Runtime Restart

In [None]:
import matplotlib.pyplot as plt

import matplotlib as mpl
# Do not use the Unicode minus glyph for negative tick labels
# (presumably because the Korean font lacks it - TODO confirm).
mpl.rcParams['axes.unicode_minus'] = False
# Use the NanumGothicCoding font family for all text in figures.
plt.rc('font', family='NanumGothicCoding')

### 데이터 가져오기

In [None]:
import os
import gdown
import glob
import pandas as pd
from collections import defaultdict

# Google Drive file id of the dataset archive, the local archive path and the
# directory it is unpacked into. (`file_id` avoids shadowing the builtin `id`.)
file_id = "10Hpa4YM0KX_Ig0W9w7DbTdq62nF2UThA"
output = "./open.zip"
dataset_dir = "./datasets"

# Download and unpack only when the dataset directory does not exist yet.
if not os.path.isdir(dataset_dir):
  gdown.download(id=file_id, output=output)
  gdown.extractall(path=output, to=dataset_dir)

# Maps each CSV's base file name (without extension) to its DataFrame.
# Note: being a defaultdict, an unknown key silently yields an empty DataFrame.
raw_data = defaultdict(pd.DataFrame)

# Search the same directory the archive was extracted to, so the cell does not
# depend on the working directory being /content.
for fname in glob.glob(os.path.join(dataset_dir, '**', '*.csv'), recursive=True):
    df_name = os.path.splitext(os.path.basename(fname))[0]
    raw_data[df_name] = pd.read_csv(fname)

### 라이브러리 불러오기

In [None]:
import seaborn as sns
import numpy as np

from collections import defaultdict

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

from statsmodels.tsa.holtwinters import ExponentialSmoothing

## 함수 선언

In [None]:
# Evaluation function provided by DACON
def lg_nrmse(gt, preds):
    """Weighted sum of the per-target normalized RMSE over 14 target columns.

    For each of the first 14 columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute value of the ground truth. The first eight
    columns (Y_01 ~ Y_08) are weighted by 1.2, the remaining six by 1.0.

    The RMSE is computed directly with numpy instead of
    ``mean_squared_error(..., squared=False)``; that keyword is deprecated in
    recent scikit-learn releases.

    Args:
        gt: 2-D array-like of ground-truth values with at least 14 columns
            (the 'ID' column must already be removed).
        preds: 2-D array-like of predictions with the same shape as ``gt``.

    Returns:
        The weighted NRMSE score as a float.
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    all_nrmse = []
    for idx in range(0, 14):  # 'ID' is not part of the arrays
        rmse = np.sqrt(np.mean((gt[:, idx] - preds[:, idx]) ** 2))
        nrmse = rmse / np.mean(np.abs(gt[:, idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

# Modified variant: NRMSE of a single target
def single_nrmse(gt, preds):
    """Normalized RMSE: RMSE divided by the mean absolute ground-truth value.

    The RMSE is computed with numpy instead of
    ``mean_squared_error(..., squared=False)``; that keyword is deprecated in
    recent scikit-learn releases. For 2-D input the RMSE is taken per column and
    then averaged, which is how scikit-learn's uniform multi-output RMSE works.

    Args:
        gt: array-like of ground-truth values.
        preds: array-like of predictions with the same shape as ``gt``.

    Returns:
        The normalized RMSE as a float.
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    rmse = np.mean(np.sqrt(np.mean((gt - preds) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(gt))
    return nrmse

def split_patten(data):
  """Return the lengths of the consecutive segments found in ``data``.

  A new segment starts at row ``i`` when either
    * the five 'X_30' values before ``i`` are all below 1.425 and the five
      values from ``i`` on are all above it, or
    * the five 'X_26' values from ``i`` on all equal 2.03, the five 'X_27'
      values all equal 2.07, at least five segments were already recorded and
      more than 1000 rows have passed since the last segment start.
  Rows with ``i < 10`` or ``i > len(data) - 10`` are never tested.

  The last element is the length of the trailing segment, so the returned
  lengths always add up to ``len(data)``.
  """
  data_size = len(data)
  segment_lengths = []
  start_idx = 0

  # range(10, data_size - 9) covers exactly 10 <= i <= data_size - 10.
  for i in range(10, data_size - 9):
    is_boundary = (data['X_30'][i-5:i] < 1.425).all() and (data['X_30'][i:i+5] > 1.425).all()
    if not is_boundary:
      is_boundary = (data['X_26'][i:i+5] == 2.03).all() and (data['X_27'][i:i+5] == 2.07).all() \
          and len(segment_lengths) >= 5 and (i - start_idx > 1000)
    if not is_boundary:
      continue
    segment_lengths.append(i - start_idx)
    start_idx = i

  # Whatever remains after the last boundary forms the final segment.
  segment_lengths.append(data_size - start_idx)
  return segment_lengths

def time_data(data):
  """Build cyclic position features for every row of ``data``.

  For each segment length returned by ``split_patten`` the position ``i``
  inside the segment is encoded as ``sin(2*pi*i/size)`` and
  ``cos(2*pi*i/size)``, so every segment starts at (sin, cos) = (0, 1).

  Returns:
      A DataFrame with the columns 'T_SIN' and 'T_COS', one row per row of
      ``data``, on a default integer index.
  """
  sin_values, cos_values = [], []

  for size in split_patten(data):
    for i in range(size):
      sin_values.append(np.sin(2 * np.pi * i/size))
      cos_values.append(np.cos(2 * np.pi * i/size))

  features = pd.DataFrame({'T_SIN': sin_values, 'T_COS': cos_values})
  return features.copy()

def outlier_index(data, sigma=3):
  """Return the index values of points that deviate too far from a smoothed fit.

  An exponential-smoothing model with an additive trend is fitted to the
  absolute values of ``data``. A point is reported when its absolute value is
  not strictly inside ``prediction +/- sigma * std``, where ``std`` is the
  standard deviation of the absolute values.

  Args:
      data: pandas Series to check.
      sigma: width of the accepted band in standard deviations.

  Returns:
      List of ``data.index[0] + position`` for every rejected point
      (this equals the index label when the index is a contiguous integer range).
  """
  data = data.abs().copy()
  min_index = data.index[0]
  SF = data.values.std() * sigma

  # NOTE(review): no `seasonal` component is requested, so `seasonal_periods`
  # presumably has no effect here - confirm against statsmodels.
  fitted = ExponentialSmoothing(data, 
                                trend='additive', 
                                seasonal_periods=len(data)//6,
                                initialization_method=None).fit()

  # In-sample prediction over the full range of the series.
  prediction = fitted.predict(start=data.index[0], end=data.index[-1])

  return [
      min_index + idx
      for idx, (actual, predicted) in enumerate(zip(data.values, prediction))
      if not (predicted - SF < actual < predicted + SF)
  ]

## 데이터 전처리

In [None]:
def preprocessing_raw(data_type):
  """Load one raw table, add the cyclic time features and split it by period.

  Args:
      data_type: key into ``raw_data`` (e.g. 'train' or 'test').

  Returns:
      A list of DataFrames, one per period, in original row order. A period
      starts at each row where 'T_SIN' is exactly 0. The list is empty when no
      such row exists.
  """
  # Work on a copy of the loaded CSV data.
  data = raw_data[data_type].copy()

  # Drop the columns that are not used.
  data = data.drop(columns=['ID', 'X_04', 'X_23', 'X_47', 'X_48'])
  data = data.drop(columns=['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56'])

  # Add the features that represent the periodicity (T_SIN / T_COS).
  Ts = time_data(data)
  data = data.join(Ts)

  # Split the data by period. The start labels are used as positions, which
  # assumes the default 0..n-1 index of a freshly read CSV.
  start_idx = data[data['T_SIN']==0].index.to_list()

  # Pair every start with the next start (or the end of the data). Unlike a
  # for/else that reads the loop variable afterwards, this also works when only
  # a single period is found.
  boundaries = start_idx + [len(data)]
  return [data.iloc[begin:end] for begin, end in zip(boundaries, boundaries[1:])]

In [None]:
# Feature columns used for each target, one dict per data part:
# selected_columns[part][target] -> ordered list of feature names.
selected_columns = [
    # Part 1
    {
        'Y_01': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13', 'X_19', 'X_20', 'X_21', 'X_22', 'X_35', 'X_36', 'X_43'],
        'Y_02': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13', 'X_19', 'X_20', 'X_21', 'X_22', 'X_29', 'X_36', 'X_43'],
        'Y_03': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13', 'X_19', 'X_21', 'X_22', 'X_35', 'X_36', 'X_43'],
        'Y_04': ['X_07', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_31'],
        'Y_05': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13'],
        'Y_06': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_07': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13', 'X_21'],
        'Y_08': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19'],
        'Y_09': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19'],
        'Y_10': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_01', 'X_03', 'X_05', 'X_20'],
        'Y_11': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_01', 'X_03', 'X_05'],
        'Y_12': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19'],
        'Y_13': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_14': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19'],
    },
    # Part 2
    {
        'Y_01': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19', 'X_21', 'X_22'],
        'Y_02': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19', 'X_22', 'X_43'],
        'Y_03': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19', 'X_20', 'X_22'],
        'Y_04': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_26'],
        'Y_05': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_19'],
        'Y_06': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_22'],
        'Y_07': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05', 'X_13', 'X_19', 'X_22'],
        'Y_08': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_09': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_10': ['X_07', 'X_08', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_11': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_12': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_13': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
        'Y_14': ['X_07', 'X_08', 'X_09', 'X_49', 'T_COS', 'T_SIN', 'X_03', 'X_05'],
    },
]

#### Outlier

In [None]:
# Per-target sigma passed to outlier_index, one dict per data part:
# outlier_weight[part][target] -> width of the accepted band in standard deviations.
outlier_weight = [
    # Part 1
    {
        'Y_01': 2.1, 'Y_02': 3.1, 'Y_03': 2.1, 'Y_04': 4.0, 'Y_05': 4.0,
        'Y_06': 2.7, 'Y_07': 3.9, 'Y_08': 4.0, 'Y_09': 4.0, 'Y_10': 3.6,
        'Y_11': 4.0, 'Y_12': 4.0, 'Y_13': 4.0, 'Y_14': 4.0,
    },
    # Part 2
    {
        'Y_01': 2.7, 'Y_02': 2.5, 'Y_03': 2.2, 'Y_04': 4.0, 'Y_05': 4.0,
        'Y_06': 4.0, 'Y_07': 3.3, 'Y_08': 4.0, 'Y_09': 4.0, 'Y_10': 4.0,
        'Y_11': 4.0, 'Y_12': 4.0, 'Y_13': 4.0, 'Y_14': 4.0,
    },
]

In [None]:
# Preprocess the raw train and test tables; each call returns a list of per-period DataFrames.
train_raw_list = preprocessing_raw('train')
test_raw_list = preprocessing_raw('test')

In [None]:
# Sanity check: total number of rows across all periods for train and test.
len(pd.concat(train_raw_list)), len(pd.concat(test_raw_list))

(39607, 39608)

## 학습 및 예측(평가)

In [None]:
result_list = []

# The data is handled as two parts of six periods each (split by its periodic
# behaviour); every part gets its own set of per-target models.
for patten_i in range(2):
  print('Part{} Start...'.format(patten_i+1))
  # Predictions of this part, keyed by target column.
  result = defaultdict(list)

  # Train / test rows of this part. They are the same for every target, so they
  # are assembled once per part instead of once per target.
  train = pd.concat(train_raw_list[6*patten_i:6*(patten_i+1)])
  train = train.reset_index(drop=True)
  test = pd.concat(test_raw_list[6*patten_i:6*(patten_i+1)])

  # One model per target column, trained after removing that target's outliers.
  for cname in ['Y_%02d' % (i+1) for i in range(14)]: 
    print('\t{} Preprocess...'.format(cname), end='')

    # Remove the outliers of the current target.
    filter_data = train.drop(
        index=outlier_index(data=train[cname], sigma=outlier_weight[patten_i][cname])
        )

    # Training features and target.
    X_train = filter_data.filter(items=selected_columns[patten_i][cname])
    y_train = filter_data[[cname]].values.ravel()

    # Columns that get log1p applied before scaling vs. columns passed through.
    lst_cols = [ c for c in ['X_07', 'X_08', 'X_09', 'X_49'] if c in selected_columns[patten_i][cname]]
    std_cols = sorted(list(set(X_train.columns) - set(lst_cols)))

    # Scaling pipeline: log1p on the selected columns, then min-max on everything.
    log_transformer = Pipeline(steps=[
        ("log1p", FunctionTransformer(np.log1p))
    ])

    splitrans = ColumnTransformer(transformers=[
          ("lst", log_transformer, lst_cols),
          ("none", 'passthrough', std_cols)
    ])

    preprocessing = Pipeline(steps=[    
          ("split", splitrans),
          ('minmax', MinMaxScaler())
    ])

    # Fit the scaler on the training data only.
    Xt_train = preprocessing.fit_transform(X_train)
    Xt_train = pd.DataFrame(data=Xt_train, columns=lst_cols + std_cols)

    # Estimator pipeline.
    pipe = Pipeline(steps=[
        ('rf_reg', RandomForestRegressor(criterion="squared_error"
                        , random_state=13, n_jobs=4
                        , n_estimators=600
                        , max_depth=80
                        , min_samples_leaf=32
                        , min_samples_split=16
                        )
        )
    ])
    print(' Training...', end='')
    # Fit on the training data.
    pipe.fit(Xt_train, y_train)

    # Test features of the current target.
    X_test = test.filter(items=selected_columns[patten_i][cname])

    # Apply the scaler that was fitted on the training data. Re-fitting it on the
    # test set (fit_transform) would map the test features onto a different scale
    # than the one the forest's split thresholds were learned on.
    Xt_test = preprocessing.transform(X_test)
    Xt_test = pd.DataFrame(data=Xt_test, columns=lst_cols + std_cols)

    # Predict.
    print(' Predict...')
    y_pred = pipe.predict(Xt_test)

    # Store the predictions of this target.
    result[cname] = y_pred

  # Collect the predictions of this part, aligned with the test rows.
  result_list.append(pd.DataFrame(result, index=test.index))

preds = pd.concat(result_list)
preds.head()

Part1 Start...
	Y_01 Preprocess... Training... Predict...
	Y_02 Preprocess... Training... Predict...
	Y_03 Preprocess... Training... Predict...
	Y_04 Preprocess... Training... Predict...
	Y_05 Preprocess... Training... Predict...
	Y_06 Preprocess... Training... Predict...
	Y_07 Preprocess... Training... Predict...
	Y_08 Preprocess... Training... Predict...
	Y_09 Preprocess... Training... Predict...
	Y_10 Preprocess... Training... Predict...
	Y_11 Preprocess... Training... Predict...
	Y_12 Preprocess... Training... Predict...
	Y_13 Preprocess... Training... Predict...
	Y_14 Preprocess... Training... Predict...
Part2 Start...
	Y_01 Preprocess... Training... Predict...
	Y_02 Preprocess... Training... Predict...
	Y_03 Preprocess... Training... Predict...
	Y_04 Preprocess... Training... Predict...
	Y_05 Preprocess... Training... Predict...
	Y_06 Preprocess... Training... Predict...
	Y_07 Preprocess... Training... Predict...
	Y_08 Preprocess... Training... Predict...
	Y_09 Preprocess... Trai

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.38224,1.112253,0.985119,14.816871,31.765264,17.003042,3.13257,-25.653803,-25.683031,-21.921966,24.807331,-25.595499,-25.626921,-25.640482
1,1.408119,1.119884,0.974271,14.612658,31.915018,17.064136,3.119649,-25.561284,-25.588819,-21.967932,24.861635,-25.527432,-25.529373,-25.542042
2,1.368801,1.138783,1.005271,14.819768,32.312165,17.113146,3.027712,-25.555625,-25.581001,-21.873327,24.887025,-25.511718,-25.521607,-25.523868
3,1.34875,1.06734,0.94648,14.394441,32.266371,17.131393,3.016165,-25.576281,-25.610179,-21.889642,24.884955,-25.524547,-25.552493,-25.554324
4,1.413176,1.032844,1.049042,14.903065,31.998501,17.08603,3.098095,-25.726926,-25.766438,-22.050342,24.79746,-25.672172,-25.718561,-25.678442


In [None]:
# Show the last rows of the predictions.
preds.tail()

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
39603,1.314425,0.93336,0.963305,13.520705,31.234867,16.837297,3.200632,-26.374732,-26.387753,-22.80356,24.481768,-26.335311,-26.336796,-26.3263
39604,1.317228,0.905539,0.954004,13.735586,31.281218,16.777101,3.189037,-26.417349,-26.407353,-22.840761,24.471309,-26.373097,-26.355808,-26.34303
39605,1.318415,0.916672,0.961393,13.308986,31.326077,16.784266,3.170473,-26.346376,-26.37438,-22.791332,24.456257,-26.321734,-26.312061,-26.317203
39606,1.373663,0.955382,1.019963,13.652659,31.346164,16.652528,3.150442,-26.419875,-26.404452,-22.87605,24.422659,-26.358192,-26.336591,-26.333521
39607,1.346809,0.966905,1.006946,13.634912,31.382102,16.682922,3.125317,-26.413346,-26.384548,-22.745575,24.44512,-26.349678,-26.322606,-26.32357


In [None]:
# Check column dtypes and non-null counts of the predictions.
preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39608 entries, 0 to 39607
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y_01    39608 non-null  float64
 1   Y_02    39608 non-null  float64
 2   Y_03    39608 non-null  float64
 3   Y_04    39608 non-null  float64
 4   Y_05    39608 non-null  float64
 5   Y_06    39608 non-null  float64
 6   Y_07    39608 non-null  float64
 7   Y_08    39608 non-null  float64
 8   Y_09    39608 non-null  float64
 9   Y_10    39608 non-null  float64
 10  Y_11    39608 non-null  float64
 11  Y_12    39608 non-null  float64
 12  Y_13    39608 non-null  float64
 13  Y_14    39608 non-null  float64
dtypes: float64(14)
memory usage: 4.2 MB


In [None]:
# Number of predicted rows.
len(preds)

39608

## 결과파일 작성

In [None]:
# Fill the sample submission with the predictions. Columns are matched by name
# and `preds` is not rebound, so this cell can be re-run safely.
submit = raw_data['sample_submission'].copy()
for col in submit.columns:
  if col=='ID':
      continue
  # .to_numpy() assigns by row position, independent of the two frames' indexes.
  submit[col] = preds[col].to_numpy()
print('Done.')

submit.to_csv('/content/submit.csv', index=False)
print('Done.')

Done.
Done.


In [None]:
!cp './submit.csv' '/content/drive/MyDrive/DeepLearning_Datas'