# IMPORT DATA & LIBRARY

In [6]:
import pandas as pd
import cupy as cp
import numpy as np
import os, sys

import random
import math
import scipy
from scipy.optimize import curve_fit

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Report the versions of the core libraries used in this notebook,
# in the order pandas, cupy, numpy, scipy.
for _lib in (pd, cp, np, scipy):
    print(_lib.__version__)

1.2.2
8.4.0
1.19.2
1.6.1


In [30]:
# Local data locations: the folder holding the test CSV files and the
# sample submission file. Both live under the same base directory.
DATA_DIR = 'C:\\Users\\Wyatt\\wyatt37\\Data\\solarpanel\\'
TEST_PATH = DATA_DIR + 'test\\'
SUB_PATH = DATA_DIR + 'sample_submission.csv'

In [36]:
# Load the training set from disk, then display it as the cell output.
TRAIN_PATH = 'C:\\Users\\Wyatt\\wyatt37\\Data\\solarpanel\\train\\train.csv'
train = pd.read_csv(TRAIN_PATH)
train

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0
4,0,2,0,0,0,1.6,75.20,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21,30,0,0,2.4,70.70,-4,0.0
52556,1094,22,0,0,0,2.4,66.79,-4,0.0
52557,1094,22,30,0,0,2.2,66.78,-4,0.0
52558,1094,23,0,0,0,2.1,67.72,-4,0.0


In [34]:
# Load the sample submission file, then display it as the cell output.
sub = pd.read_csv(filepath_or_buffer=SUB_PATH)
sub

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# DATA PREPROCESSING

고라파덕님이 진행한 전처리와 피쳐 엔지니어링은 다음과 같다.

 - Hour와 Minute를 하나의 float 항목으로 병합하고, Day 항목 제거
 - sin cos 함수를 이용한 시간의 연속적 표현 (cyclical time encoding)
 - 가장 마지막 3, 5일 동시간 Target값의 평균
 - 기온과 상대습도를 이용한 이슬점 산출
 - DHI>0 여부에 따른 일출·일몰 시간 추출
 - 일출·일몰 시간에 따른 연간·일별 계절성(seasonality)을 고려한 일별 2차 함수 근사를 통해 zenith angle 산출
 - zenith angle과 DNI DHI를 이용한 GHI 산출
 - (solar zenith angle) + (solar altitude angle) = 90 degrees

In [37]:
def obj_curve(x, a, b, c):
    """Evaluate the vertex-form parabola ``a * (x - b)**2 + c``.

    ``b`` is the position of the vertex, ``c`` the value at the vertex and
    ``a`` the curvature. ``x`` may be a scalar or anything that supports
    element-wise arithmetic (e.g. a NumPy array or pandas Series).
    """
    offset = x - b
    return a * offset ** 2 + c

def round_half(number):
    """Round ``number`` to the nearest multiple of 0.5.

    The value is doubled, rounded with the built-in ``round`` and halved
    again, so exact ties on the doubled value follow ``round``'s
    round-half-to-even rule (e.g. 0.25 -> 0.0, 0.75 -> 1.0).
    """
    doubled = number * 2
    return round(doubled) / 2

def _approximate_zenith(hours, dhi, rows_per_day=48):
    """Approximate the solar zenith angle (degrees) for every row.

    A "day" is a run of ``rows_per_day`` consecutive rows whose first row
    has ``hours == 0``. For each such day a parabola (``obj_curve``) is
    fitted through three points: zenith 90 half an hour before the first
    and half an hour after the last row with ``dhi > 0``, and a heuristic
    minimum at the midpoint between them.

    Parameters
    ----------
    hours : numpy.ndarray
        Fractional hour of day per row.
    dhi : numpy.ndarray
        DHI value per row; ``dhi > 0`` marks daylight rows.
    rows_per_day : int
        Number of rows that make up one day.

    Returns
    -------
    numpy.ndarray
        Zenith angle per row. Rows that are not part of a complete day, or
        that belong to a day without any ``dhi > 0`` row, are NaN.
    """
    n_rows = len(hours)
    zenith = np.full(n_rows, np.nan)

    for start in np.flatnonzero(hours == 0):
        stop = start + rows_per_day
        if stop > n_rows:
            # Incomplete trailing day: nothing to fit, rows stay NaN.
            continue

        day_hours = hours[start:stop]
        lit = np.flatnonzero(dhi[start:stop] > 0)
        if lit.size == 0:
            # No daylight rows: sunrise/sunset are undefined, rows stay NaN.
            continue

        sunrise = day_hours[lit[0]]
        sunset = day_hours[lit[-1]]
        peak = (sunrise + sunset) / 2

        # The midday zenith target grows with the sunrise hour (later
        # sunrise -> lower sun); the constants are the author's heuristic.
        param, _ = curve_fit(
            obj_curve,
            [sunrise - 0.5, peak, sunset + 0.5],
            [90, (sunrise - 6.5) / 1.5 * 25 + 35, 90],
            p0=[0.5, peak, 36],
            bounds=([0.01, peak - 1, 10], [1.2, peak + 1, 65]),
        )
        zenith[start:stop] = obj_curve(day_hours, *param)

    return zenith


def preprocess_data(data, is_train=True):
    """Build the model feature frame from raw solar-panel records.

    The input is expected to contain the columns ``Day``, ``Hour``,
    ``Minute``, ``DHI``, ``DNI``, ``WS``, ``RH``, ``T`` and ``TARGET``,
    ordered in time with 48 rows per day. The input frame is not modified.

    Steps:
      1. Merge ``Hour`` and ``Minute`` into a fractional ``Hour`` and drop
         ``Minute`` and ``Day``.
      2. Add cyclical time features ``cos_time`` / ``sin_time``.
      3. Add ``avg3`` / ``avg5``: mean ``TARGET`` at the same time of day
         over the current and the previous 2 / 4 days.
      4. Add the dew point ``Td`` from ``T`` and ``RH`` (Magnus formula).
      5. Approximate the zenith angle per day and derive ``altitude``
         (``90 - zenith``) and ``GHI`` (``DHI + DNI * cos(zenith)``).
      6. When ``is_train`` is truthy, add ``Target1`` / ``Target2``: the
         ``TARGET`` value 1 and 2 days ahead.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records as described above.
    is_train : bool, default True
        Whether to append the look-ahead label columns.

    Returns
    -------
    pandas.DataFrame
        Feature frame with every row containing a NaN removed. That drops
        the first 4 days (incomplete lag averages), any row without a
        fitted zenith, and for training data the last 2 days as well.
    """
    rows_per_day = 48
    temp = data.copy()

    ## merge hour and minute
    temp['Hour'] = temp['Hour'] + temp['Minute'] / 60
    temp = temp.drop(['Minute', 'Day'], axis=1)

    ## add cyclical encoded time feature (continuous across midnight)
    day_angle = 2 * np.pi * (temp['Hour'] / 24)
    temp['cos_time'] = np.cos(day_angle)
    temp['sin_time'] = np.sin(day_angle)

    ## add 3-day & 5-day mean of TARGET at the same time of day
    ## (current day plus the 2 / 4 preceding days)
    for lag in range(1, 5):
        temp[f'shft{lag}'] = temp['TARGET'].shift(rows_per_day * lag)

    temp['avg3'] = np.mean(temp[['TARGET', 'shft1', 'shft2']].values, axis=1)
    temp['avg5'] = np.mean(
        temp[['TARGET', 'shft1', 'shft2', 'shft3', 'shft4']].values, axis=1)

    ## add dewpoint feature from T and RH column (Magnus formula)
    magnus_c = 243.12
    magnus_b = 17.62
    gamma = (magnus_b * temp['T'] / (magnus_c + temp['T'])) + np.log(temp['RH'] / 100)
    temp['Td'] = (magnus_c * gamma) / (magnus_b - gamma)

    ## add GHI feature by approximation of zenith angle
    ## 1. sunrise and sunset hour are extracted from DHI feature
    ## 2. zenith angle is approximated daily from known yearly and daily
    ##    seasonality of zenith angle
    ## 3. GHI is calculated from DNI, DHI and zenith angle
    temp['zenith'] = _approximate_zenith(
        temp['Hour'].to_numpy(), temp['DHI'].to_numpy(), rows_per_day)

    ## altitude of sun is added
    temp['altitude'] = 90 - temp['zenith']
    temp['GHI'] = temp['DHI'] + temp['DNI'] * np.cos(temp['zenith'] * np.pi / 180)

    # Explicit copy so the label columns below are written to an
    # independent frame rather than to a slice of the wider one.
    temp = temp[['Hour', 'cos_time', 'sin_time', 'altitude', 'GHI', 'DHI',
                 'DNI', 'WS', 'RH', 'T', 'Td', 'avg3', 'avg5', 'TARGET']].copy()

    ## if train_data, add target values as the last 2 columns
    if is_train:
        temp['Target1'] = temp['TARGET'].shift(-rows_per_day)
        temp['Target2'] = temp['TARGET'].shift(-rows_per_day * 2)

    ## first 4 days are dropped because the avg columns contain NaN there
    ## for train_data, last 2 days are dropped additionally
    return temp.dropna()

In [38]:
# Build the training feature frame, including the look-ahead label columns.
df_train = preprocess_data(train, is_train=True)

In [39]:
# Display the preprocessed training frame as the cell output.
df_train

Unnamed: 0,Hour,cos_time,sin_time,altitude,GHI,DHI,DNI,WS,RH,T,Td,avg3,avg5,TARGET,Target1,Target2
192,0.0,1.000000,0.000000,-152.025,0.0,0,0,2.1,72.10,-5,-9.252469,0.0,0.0,0.0,0.0,0.0
193,0.5,0.991445,0.130526,-137.625,0.0,0,0,2.0,72.10,-5,-9.252469,0.0,0.0,0.0,0.0,0.0
194,1.0,0.965926,0.258819,-123.825,0.0,0,0,2.0,72.14,-5,-9.245388,0.0,0.0,0.0,0.0,0.0
195,1.5,0.923880,0.382683,-110.625,0.0,0,0,2.0,72.11,-5,-9.250699,0.0,0.0,0.0,0.0,0.0
196,2.0,0.866025,0.500000,-98.025,0.0,0,0,2.0,72.94,-5,-9.104485,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52459,21.5,0.793353,-0.608761,-74.625,0.0,0,0,3.5,55.97,-1,-8.689729,0.0,0.0,0.0,0.0,0.0
52460,22.0,0.866025,-0.500000,-86.025,0.0,0,0,3.9,54.23,-2,-10.028583,0.0,0.0,0.0,0.0,0.0
52461,22.5,0.923880,-0.382683,-98.025,0.0,0,0,4.1,54.21,-2,-10.033261,0.0,0.0,0.0,0.0,0.0
52462,23.0,0.965926,-0.258819,-110.625,0.0,0,0,4.3,56.46,-2,-9.516353,0.0,0.0,0.0,0.0,0.0


# FEATURE SELECTION & MODELING

In [None]:
## prep train and test data
## group data by day (48 rows)
## Td and WS features are dropped

# Drop the unused columns once, then slice whole days straight out of the
# value matrix. A day is 48 consecutive rows whose first row has Hour == 0;
# a trailing run shorter than 48 rows is ignored.
_rows_per_day = 48
_train_features = df_train.drop(['Td', 'WS'], axis=1)
_train_values = _train_features.values
_train_hours = _train_features['Hour'].values

tf_train = np.asarray([
    _train_values[_start:_start + _rows_per_day]
    for _start in np.flatnonzero(_train_hours == 0)
    if _start + _rows_per_day <= len(_train_values)
])

