# 한글설정

In [None]:
!apt-get install fonts-nanum* > /dev/null 2>&1
!fc-cache -fv > /dev/null 2>&1

In [None]:
import os
import matplotlib as mpl

file_dir = os.path.split(mpl.__file__)[0]
font_dir = os.path.join(file_dir, 'mpl-data/fonts/ttf')
!cp /usr/share/fonts/truetype/nanum/Nanum* {font_dir}
!rm -rf ~/.cache/matplotlib/*

> Runtime Restart

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Disable the Unicode minus glyph for axis labels and switch the default
# font family to NanumGothicCoding.
mpl.rc('axes', unicode_minus=False)
mpl.rc('font', family='NanumGothicCoding')

# 데이터 가져오기

In [2]:
import os
import gdown

# Google Drive file id of the archive to download. Named `file_id` rather than
# `id` so the built-in `id()` is not shadowed for the rest of the notebook.
file_id = "10Hpa4YM0KX_Ig0W9w7DbTdq62nF2UThA"
output = "./open.zip"

# Download and unpack only when ./datasets does not exist yet, so re-running
# this cell is cheap.
if not os.path.isdir('./datasets'):
  downloaded = gdown.download(id=file_id, output=output)
  if downloaded is None:
    # Some gdown versions report a failed download by returning None; stop
    # here instead of failing later inside extractall with a confusing error.
    raise RuntimeError("gdown could not download file id %s to %s" % (file_id, output))
  gdown.extractall(path=output, to='./datasets')

# 데이터프레임 만들기

In [3]:
import os
import glob
import pandas as pd
from collections import defaultdict

# Map every CSV found under the working directory (searched recursively) to a
# DataFrame, keyed by the file's base name without extension. Looking up an
# unknown key yields an empty DataFrame via the defaultdict factory.
raw_data = defaultdict(pd.DataFrame)
raw_data.update(
    (os.path.splitext(os.path.basename(csv_path))[0], pd.read_csv(csv_path))
    for csv_path in glob.glob('./**/*.csv', recursive=True)
)

# Weights & Biases 설정

In [None]:
!pip install pathlib ruamel-yaml

In [None]:
!pip install wandb -Uq

In [4]:
import wandb

# Log in to Weights & Biases before any sweep or run is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzbooster[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Grid sweep: one run per candidate value of `add_column`.
sweep_config = {
    'method': 'grid'
}

# The metric name must match a key that the training function actually logs.
# run_train logs `LG_NRMSE_MEAN` and `LG_NRMSE_BEST` (there is no plain
# `LG_NRMSE` key), so track the cross-validated mean, where lower is better.
metric = {
    'name': 'LG_NRMSE_MEAN',
    'goal': 'minimize'
}

sweep_config['metric'] = metric

# NOTE(review): `ex1` (and the alternative `ex_mean`) are not defined in the
# visible notebook; they must be defined before this cell is run.
parameters_dict = {
    'add_column': {
        # 'values': ex_mean
        'values': ex1
    },
}

sweep_config['parameters'] = parameters_dict

In [None]:
# Register the sweep under the "Antenna_Scaler" project; the returned id is
# later passed to wandb.agent.
sweep_id = wandb.sweep(sweep_config, project="Antenna_Scaler")

# Feature Engineering

## 컬럼 선택

In [5]:
# Selected input features, built from their two-digit numeric suffixes
# (e.g. 1 -> 'X_01').
columns = [
    'X_%02d' % feature_no
    for feature_no in (
        1, 3, 5, 6, 7, 8, 9, 10,
        11, 14, 15, 16, 17, 18, 19, 20,
        22, 26, 28, 29,
        31, 32, 33, 38,
        42, 44, 45, 46, 49,
    )
]

## Scaler

In [9]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer

# (label, transformer) pairs to compare; the 'Unscaled' entry carries no
# transformer at all.
scaler_list = [
    ('Unscaled', None),
    ('Standard', StandardScaler()),
    ('log1p', FunctionTransformer(np.log1p)),
    ('Min-Max', MinMaxScaler()),
    # Label corrected from 'Min-Abs': this entry is MaxAbsScaler.
    ('Max-Abs', MaxAbsScaler()),
    ('Robust', RobustScaler(quantile_range=(25, 75))),
    # QuantileTransformer may subsample large inputs at random, so it is seeded
    # with the same seed (13) the notebook uses for KFold and the forest.
    ('uniform pdf', QuantileTransformer(output_distribution="uniform", random_state=13)),
    ('gaussian pdf', QuantileTransformer(output_distribution="normal", random_state=13)),
    ('L2 normalizing', Normalizer())
]

In [10]:
# Work on a copy so the frame stored in raw_data is left untouched.
# NOTE(review): raw_data is a defaultdict(pd.DataFrame), so if no file named
# train.csv was loaded this is silently an empty DataFrame.
train = raw_data['train'].copy()

# 평가함수

In [6]:
import numpy as np
from sklearn import metrics

# Evaluation function provided by Dacon
def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE over the first 14 target columns.

    For each column the RMSE is divided by the mean absolute ground-truth
    value. The first 8 columns (Y_01 ~ Y_08) are weighted 1.2 and the
    remaining 6 are weighted 1.0.

    RMSE is computed directly with NumPy instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``, because the
    ``squared`` argument is no longer accepted by recent scikit-learn
    releases.

    Args:
        gt: 2-D array-like of true target values with at least 14 columns.
        preds: 2-D array-like of predictions with the same number of rows and
            at least 14 columns.

    Returns:
        tuple: ``(score, all_nrmse)`` where ``score`` is the weighted total
        and ``all_nrmse`` is a list of the 14 per-column NRMSE values.

    Raises:
        ValueError: if an input is not 2-D, has fewer than 14 columns, has no
            rows, or the two inputs have different row counts.
    """
    n_targets = 14
    n_weighted = 8  # leading columns that receive the extra 20% weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    if gt.ndim != 2 or preds.ndim != 2:
        raise ValueError("gt and preds must both be 2-D")
    if gt.shape[1] < n_targets or preds.shape[1] < n_targets:
        raise ValueError("gt and preds need at least %d columns" % n_targets)
    if gt.shape[0] != preds.shape[0]:
        # Checked explicitly so a single-row input is never broadcast silently.
        raise ValueError("gt and preds have different numbers of rows")
    if gt.shape[0] == 0:
        raise ValueError("gt and preds must contain at least one row")

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # Column-wise RMSE, normalised by the mean magnitude of the ground truth.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(gt), axis=0)

    all_nrmse = list(nrmse)
    score = 1.2 * np.sum(nrmse[:n_weighted]) + 1.0 * np.sum(nrmse[n_weighted:n_targets])
    return score, all_nrmse

# 학습하기

In [None]:
import time
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from collections import defaultdict

# Fresh copy of the training table, read by run_train below as a global.
# NOTE(review): raw_data is a defaultdict(pd.DataFrame), so if no file named
# train.csv was loaded this is silently an empty DataFrame.
train = raw_data['train'].copy()

def run_train(config=None):
  """Run one W&B run: cross-validate a random forest and log LG-NRMSE metrics.

  Reads the module-level `train` DataFrame, the `columns` feature list and
  the `lg_nrmse` scoring function.

  The sweep parameter `add_column` is now consumed: its value (one column
  name, or a list of names) is appended to `columns` for this run. The
  parameter was previously ignored, so every run of the grid trained the
  identical model. When `add_column` is absent the base `columns` are used
  unchanged.
  NOTE(review): assumes `add_column` values are names of columns present in
  `train` - confirm against the definition of the sweep values.

  Args:
    config: optional default configuration forwarded to `wandb.init`.

  Logs (single `wandb.log` call):
    LG_NRMSE_MEAN: mean score across folds.
    LG_NRMSE_BEST: lowest fold score.
    Y_01_NRMSE ... Y_14_NRMSE: per-target NRMSE averaged across folds.
  """
  with wandb.init(config=config):

    config = wandb.config

    # Resolve the swept extra feature(s) into a list of column names.
    extra = config.get('add_column')
    if extra is None:
      extra_cols = []
    elif isinstance(extra, str):
      extra_cols = [extra]
    else:
      extra_cols = list(extra)
    feature_cols = list(columns) + [c for c in extra_cols if c not in columns]

    kf = KFold(shuffle=True, random_state=13)
    result = defaultdict(list)

    # Split features / targets (targets are the columns whose name contains 'Y')
    X = train[feature_cols].values
    y = train.filter(regex='Y').values
    for train_index, test_index in kf.split(X):

      X_train, X_test = X[train_index], X[test_index]
      y_train, y_test = y[train_index], y[test_index]

      # Build the pipeline (model only; no scaling step)
      pipe = make_pipeline(
        RandomForestRegressor(criterion="squared_error", random_state=13, n_jobs=-1
                            , n_estimators=400
                            , max_depth=80
                            , min_samples_leaf=2
                            , min_samples_split=2)
      )

      # Fit on the training fold
      pipe.fit(X_train, y_train)

      # Predict on the held-out fold
      y_pred = pipe.predict(X_test)

      # Score the fold and keep the per-target values
      score, all_nrmse = lg_nrmse(y_test, y_pred)
      result['LG_NRMSE'].append(score)
      for i, v in enumerate(all_nrmse, start=1):
        result['Y_%02d_NRMSE' % i].append(v)

    # Aggregate across folds and log everything in a single step.
    log_dict = {
      'LG_NRMSE_MEAN': np.mean(result['LG_NRMSE']),
      'LG_NRMSE_BEST': min(result['LG_NRMSE']),
    }
    for i in range(1, 15):
      key = 'Y_%02d_NRMSE' % i
      log_dict[key] = np.mean(result[key])
    wandb.log(log_dict)

In [None]:
# Start a sweep agent for the sweep created above, with run_train as the
# function it executes for each run.
wandb.agent(sweep_id, run_train)