In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

def seed_all(SEED=42):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED in the environment and seeds Python's `random`,
    NumPy's global generator and TensorFlow's global generator.

    Args:
        SEED: Integer seed applied to all generators (default 42).
    """
    # `tf` is imported in a later cell; it only has to exist at call time.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(SEED)

In [None]:
# Load the training set and choose the columns used as model inputs.
data = pd.read_csv('./data/train/train.csv')
# 'WS' and 'RH' are commented out of the feature list, leaving 4 columns.
valid_cols = ['DHI', 'DNI', 'T', 'TARGET'] #  'WS', 'RH',
data

In [None]:
# Hours of the day (0-23) whose TARGET values sum to exactly zero over the
# whole training set; post_processing() forces these hours to 0 in forecasts.
zero_hour = [
    hour for hour in range(24)
    if data.loc[data['Hour'] == hour, 'TARGET'].sum() == 0
]
zero_hour

In [None]:
# Preview only the selected feature columns.
data[valid_cols]

In [None]:
# Work on a copy so the raw `data` frame is left untouched.
train = data[valid_cols].copy()
# TARGET is both an input feature (it is listed in valid_cols) and the label.
target = train['TARGET']

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Two separate scalers: `tr_sc` rescales all input features, `tar_sc` rescales
# TARGET alone so that predictions can be inverse-transformed on their own.
# sc = StandardScaler()
tr_sc  = MinMaxScaler()
tar_sc = MinMaxScaler()
tr_sc.fit(train)
# reshape(-1, 1): the target scaler is fitted on a single-column 2-D array.
tar_sc.fit(target.values.reshape(-1, 1))

In [None]:
# Scale straight from `data` instead of from the `train`/`target` names that
# this cell overwrites. Re-running the cell now yields the same arrays rather
# than scaling already-scaled values (and failing on `target.values`, which a
# NumPy array does not have).
train = tr_sc.transform(data[valid_cols])
target = tar_sc.transform(data['TARGET'].values.reshape(-1, 1))

In [None]:
# Sanity check: display the shapes of the scaled feature and target arrays.
train.shape, target.shape

In [None]:
# Length of the input history: `days` days at 48 rows per day.
days = 3
window = 48 * days
# valid_min = max(48*2, window)

# Start offsets of the training samples: each sample needs `window` input
# rows followed by 48*2 target rows.
available_idx = np.arange(train.shape[0] - window - 48*2)

In [None]:
def tr_generator():
    """Yield (inputs, targets) training pairs, one per start offset.

    For each offset in `available_idx`, the inputs are the `window` rows of
    `train` beginning at that offset and the targets are the 48*2 rows of
    `target` that immediately follow them.
    """
    horizon = 48*2
    for start in available_idx:
        split = start + window
        yield train[start:split], target[split:split + horizon]

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Conv1D, MaxPooling1D, Conv1DTranspose,\
concatenate, Input, UpSampling1D, GlobalMaxPooling1D, Permute, LSTM, GRU
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.optimizers import Adam, SGD
import tensorflow_addons as tfa

In [None]:
def mish(x):
    """Mish activation: x * tanh(softplus(x)), built from TensorFlow ops."""
    gate = tf.math.tanh(tf.math.softplus(x))
    return x * gate

def decay(epochs):
    """Step-decay learning-rate schedule.

    Starts at 1e-3 and is multiplied by 0.9 once every 10 epochs, never
    dropping below 5e-5.

    Args:
        epochs: Epoch index supplied by the scheduler callback.

    Returns:
        The learning rate to use for that epoch.
    """
    floor, init, ratio, drop = 5e-5, 1e-3, 0.9, 10
    scheduled = init * (ratio ** (epochs // drop))
    return scheduled if scheduled > floor else floor

# Early stopping (patience 10, restoring the best weights) and the step-decay
# learning-rate schedule defined by `decay`.
# NOTE(review): neither callback is passed to the fit() calls visible in this
# notebook, and the name `es` is rebound by a later `for es in ...` loop.
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True)
lrs = callbacks.LearningRateScheduler(decay, verbose=0)

In [None]:
# Wrap the Python generator in a tf.data pipeline: both yielded arrays are
# float32, samples pass through a 1024-element shuffle buffer (reshuffled each
# epoch), are grouped into batches of 128, and 16 batches are prefetched.
tr_ds = tf.data.Dataset.from_generator(
    tr_generator,
    output_types=(tf.float32, tf.float32),
)
tr_loader = (
    tr_ds
    .shuffle(buffer_size=1024, reshuffle_each_iteration=True)
    .batch(batch_size=128)
    .prefetch(buffer_size=16)
)

In [None]:
# Peek at the first generated sample to check the input and target shapes.
for d,y in tr_generator():
    print(d.shape)
    print(y.shape)
    break

In [None]:
def build_model():
    """Build the stacked-LSTM forecaster.

    The network reads `window` time steps with one channel per column in
    `valid_cols` and emits 48*2 values, one per forecast step.

    Returns:
        An uncompiled Keras `Model`.
    """
    # The feature dimension follows the selected columns. The previous
    # hard-coded 6 no longer matched the data once 'WS' and 'RH' were dropped
    # from `valid_cols`.
    inputs = Input(shape=(window, len(valid_cols)))

    h = LSTM(128, return_sequences=True)(inputs)
    # Only the final state of the second LSTM feeds the output layer.
    h = LSTM(128, return_sequences=False)(h)

    outputs = Dense(48*2)(h)

    return Model(inputs, outputs)

In [None]:
# Instantiate the network and print its layer/parameter summary.
nn = build_model()
nn.summary()

In [None]:
# Location where the best weights seen during training are written.
checkpoint_filepath = './tmp/checkpoint'
# monitor='loss' with save_best_only: weights are saved only when the
# training loss improves.
ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='loss',
    save_best_only=True)

# Short 5-epoch trial run with AdamW on MSE, then reload the checkpointed
# weights into the model.
nn.compile(loss='mse', optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4))
nn.fit(tr_loader,
      epochs=5,
      callbacks=[ckpt]
      )
nn.load_weights(checkpoint_filepath)

In [None]:
import glob
# Collect every test CSV; glob does not guarantee any particular order.
test_files = glob.glob('./data/test/*.csv')
test_files

In [None]:
# Submission template; its q_* columns are filled in by the cells below.
sub = pd.read_csv('./data/sample_submission.csv')
sub

In [None]:
def post_processing(pred):
    """Clean a two-day forecast without modifying the input.

    Entries belonging to the hours listed in `zero_hour` are set to 0 on both
    forecast days (two slots per hour, 48 slots per day), negative values are
    clipped to 0, and `smoothing` then zeroes isolated non-zero entries.

    Args:
        pred: Forecast array indexed by slot; it is copied before editing.

    Returns:
        The cleaned forecast.
    """
    cleaned = pred.copy()
    for hour in zero_hour:
        first_slot = 2*hour
        # Same two slots on day one (offset 0) and day two (offset 48).
        for day_offset in (0, 48):
            cleaned[day_offset + first_slot] = 0
            cleaned[day_offset + first_slot + 1] = 0

    cleaned = np.clip(cleaned, 0, float('inf'))
    return smoothing(cleaned)

def smoothing(x):
    """Zero out, in place, every interior entry whose two neighbours are 0.

    The scan runs left to right, so an entry zeroed earlier counts as a zero
    neighbour for the entries that follow it.

    Args:
        x: Mutable sequence of values; it is modified in place.

    Returns:
        The same object `x`.
    """
    last = len(x) - 1
    pos = 1
    while pos < last:
        if not (x[pos-1] != 0 or x[pos+1] != 0):
            x[pos] = 0
        pos += 1
    return x
        
# Smoke-test on a synthetic two-day forecast. `pred` is not assigned until a
# later cell, so referencing it here raised NameError on a fresh kernel.
post_processing(np.ones((48*2, 1)))

In [None]:
# Single-file inference check. `test` is loaded explicitly here; it was
# otherwise only created inside the ensemble loop further down, so this cell
# failed on a fresh kernel.
test = pd.read_csv(test_files[0])
# Feed the same history length the model is trained on (`window` rows).
# Assumes each test file has at least `window` rows; TODO confirm.
test_X = test[valid_cols].values[-window:]
test_X = tr_sc.transform(test_X)
# Add the leading batch dimension expected by predict().
test_X = np.expand_dims(test_X, 0)
pred = nn.predict(test_X)
# Back to the original TARGET scale, one row per forecast step.
pred = tar_sc.inverse_transform(pred.flatten().reshape(-1, 1))
pred = post_processing(pred)
pred

In [None]:
# NOTE(review): `pred` has already been passed through post_processing() in
# the previous cell, so both curves are drawn from post-processed data.
plt.plot(pred)
plt.plot(post_processing(pred))

In [None]:
# sklearn.metrics provides these metrics as snake_case functions; the
# CamelCase names are not part of sklearn.metrics and made the import fail.
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Train `iteration` independently seeded LSTMs, collect each model's forecast
# for every test file, then write per-step quantiles of those forecasts into
# the submission frame.
iteration  = 100
preds = {}
for seed in tqdm(range(iteration)):
    seed_all(seed)
    nn = build_model()
    nn.compile(loss='mse', optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4))
    # Fresh checkpoint callback per seed: a ModelCheckpoint remembers its best
    # monitored value across fit() calls, so a shared instance would skip
    # saving for seeds that do not beat an earlier run, and load_weights()
    # would then restore another seed's weights.
    seed_ckpt = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='loss',
        save_best_only=True)
    nn.fit(tr_loader,
          epochs=10,
          verbose = 0,
          callbacks=[seed_ckpt]
          )
    nn.load_weights(checkpoint_filepath)

    for test_file in test_files:
        test = pd.read_csv(test_file)
        # Use the same history length as the training samples (`window` rows).
        # Assumes each test file has at least `window` rows; TODO confirm.
        test_X = test[valid_cols].values[-window:]
        test_X = tr_sc.transform(test_X)
        test_X = np.expand_dims(test_X, 0)
        pred = nn.predict(test_X)
        # Back to the original TARGET scale, one row per forecast step.
        pred = tar_sc.inverse_transform(pred.flatten().reshape(-1, 1))
        pred = post_processing(pred)
        preds.setdefault(test_file, []).append(pred)

# Stack each file's forecasts into one array with the seed on axis 0.
for test_file in preds:
    preds[test_file] = np.array(preds[test_file])

print('train done')
# File-name prefix of every submission id, computed once for all files.
sub_file_ids = sub['id'].map(lambda x: x.split('_')[0])
for key in preds.keys():
    # basename() works with both '/' and '\\' separators, unlike split('\\').
    file = os.path.basename(key)
    idx = sub.index[sub_file_ids == file]
    for i in range(1, 10):
        q = i/10
        # Single .loc call so the assignment always reaches `sub` itself
        # (chained indexing may write to a temporary copy).
        sub.loc[idx, f'q_{q}'] = np.quantile(preds[key], q, axis=0).ravel()

In [None]:
# Random-forest variant of the seed ensemble: fit 100 forests with different
# seeds and write per-step quantiles of their forecasts into the submission.
# NOTE(review): `X` and `y` are not defined anywhere in the visible notebook;
# they must be built before this cell can run.
from sklearn.ensemble import RandomForestRegressor

preds = {}
for seed in tqdm(range(100)):
    seed_all(seed)
    rf = RandomForestRegressor(n_jobs=-1, random_state=seed, n_estimators=100, min_samples_split=10)
    rf.fit(X, y)

    for test_file in test_files:
        test = pd.read_csv(test_file)
        # Rows with Day == 6, flattened into a single feature row. One combined
        # mask replaces the chained boolean filters.
        last_day = test[test['Day'].between(6, 6)]
        pred = rf.predict(last_day[valid_cols].values.flatten().reshape(1, -1))
        preds.setdefault(test_file, []).append(pred)

# Stack each file's forecasts into one array with the seed on axis 0.
for test_file in preds:
    preds[test_file] = np.array(preds[test_file])

print('train done')
# File-name prefix of every submission id, computed once for all files.
sub_file_ids = sub['id'].map(lambda x: x.split('_')[0])
for key in preds.keys():
    # basename() works with both '/' and '\\' separators, unlike split('\\').
    file = os.path.basename(key)
    idx = sub.index[sub_file_ids == file]
    for i in range(1, 10):
        q = i/10
        # Single .loc call so the assignment always reaches `sub` itself
        # (chained indexing may write to a temporary copy).
        sub.loc[idx, f'q_{q}'] = np.quantile(preds[key], q, axis=0).ravel()

In [None]:
# Quantiles across the individual trees of the last fitted forest `rf`.
# NOTE(review): this cell feeds rows with Day 5..6, while the seed-ensemble
# cell above feeds Day 6 only; confirm which feature length `rf` was fitted on.
# File-name prefix of every submission id, computed once for all files.
sub_file_ids = sub['id'].map(lambda x: x.split('_')[0])
for test_file in tqdm(test_files):
    test = pd.read_csv(test_file)
    # Build the flattened feature row once per file instead of once per tree.
    recent = test[test['Day'].between(5, 6)]
    features = recent[valid_cols].values.flatten().reshape(1, -1)
    # The loop variable is `tree` so the `es` EarlyStopping callback defined
    # earlier is not overwritten.
    preds = np.array([tree.predict(features) for tree in rf.estimators_])

    # basename() works with both '/' and '\\' separators, unlike split('\\').
    file = os.path.basename(test_file)
    idx = sub.index[sub_file_ids == file]
    for i in range(1, 10):
        q = i/10
        # Single .loc call so the assignment always reaches `sub` itself
        # (chained indexing may write to a temporary copy).
        sub.loc[idx, f'q_{q}'] = np.quantile(preds, q, axis=0).ravel()

In [None]:
# Spot-check a slice of the filled-in submission rows.
sub.iloc[30:50]