In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

def seed_all(SEED=42):
    """Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Parameters
    ----------
    SEED : int, default 42
        Value used for every seed.

    NOTE(review): ``PYTHONHASHSEED`` is only written to ``os.environ`` here;
    whether that influences the already-running interpreter should be
    confirmed against the Python documentation.
    """
    seed_text = str(SEED)
    os.environ['PYTHONHASHSEED'] = seed_text
    random.seed(SEED)
    np.random.seed(SEED)

In [None]:
# Columns fed to the model as features; 'TARGET' is both a feature and the
# quantity being predicted. (The trailing "WS" note presumably marks a column
# that was deliberately left out -- TODO confirm.)
valid_cols = ['DHI', 'DNI', 'RH', 'T', 'TARGET'] # WS

# Load the full training table and display it as the cell output.
data = pd.read_csv('./data/train/train.csv')
data

In [None]:
# Build the training set with a sliding window over the 'Day' column:
#   X[i] = the `valid_cols` values of `term` consecutive days starting at day i,
#          flattened into a single feature vector;
#   y[i] = the 'TARGET' values of the `horizon` days that follow that window.
#
# The target window previously used the fixed range (i, i+2], which is only
# correct for term == 1. For any larger `term` it overlapped the input window,
# so target values that are also features leaked into the labels. The target
# window now starts right after the input window.
term = 2      # number of input days per sample
horizon = 2   # number of days predicted per sample
last_day = 1094  # presumably the largest 'Day' value in train.csv -- TODO confirm

X = []
y = []
day = data['Day']  # hoisted: the column lookup does not change inside the loop
# With horizon == 2 this equals the original `1094 - term` iteration count.
for i in range(last_day - term - horizon + 2):
    input_days = (day >= i) & (day < i + term)
    target_days = (day >= i + term) & (day < i + term + horizon)
    # One combined mask instead of chained boolean indexing (df[m1][m2]),
    # which depends on pandas reindexing the second mask.
    X.append(data.loc[input_days, valid_cols].values.flatten())
    y.append(data.loc[target_days, 'TARGET'].values)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a single random forest on the windowed training set with a fixed seed.
SEED = 42
seed_all(SEED)

rf = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=10,
    random_state=SEED,
    n_jobs=-1,
)
rf.fit(X, y)

In [None]:
import glob


def file_number(path):
    """Return the integer N for a test file path whose file name is 'N.csv'.

    Uses ``os.path`` so the path separator of the current platform is handled;
    splitting on a literal backslash only worked on Windows and raised
    ``IndexError`` for '/'-separated paths.
    """
    stem, _ext = os.path.splitext(os.path.basename(path))
    return int(stem)


# Sort numerically (0, 1, 2, ..., 10, ...) rather than lexicographically.
test_files = sorted(glob.glob('./data/test/*.csv'), key=file_number)
test_files

In [None]:
# Load the submission template; the quantile columns are filled in later.
submission_template_path = './data/sample_submission.csv'
sub = pd.read_csv(submission_template_path)
sub

In [None]:
# Train one random forest per seed and collect, for every test file, the
# prediction of each forest. The spread across seeds is later turned into
# quantiles.
n_seeds = 100
last_test_day = 6  # the original filter selected Day == 6 only

# Read and featurise every test file once, up front, instead of re-reading
# each CSV for every seed.
test_features = {}
for test_file in test_files:
    test = pd.read_csv(test_file)
    # Use the last `term` days so the feature vector has the same length as
    # the training rows (which span `term` days). Selecting a single day only
    # matched the training layout when term == 1.
    in_window = (test['Day'] > last_test_day - term) & (test['Day'] <= last_test_day)
    test_features[test_file] = test.loc[in_window, valid_cols].values.flatten().reshape(1, -1)

preds = {test_file: [] for test_file in test_files}
for seed in tqdm(range(n_seeds)):
    seed_all(seed)
    rf = RandomForestRegressor(n_jobs=-1, random_state=seed, n_estimators=100, min_samples_split=10)
    rf.fit(X, y)

    for test_file in test_files:
        preds[test_file].append(rf.predict(test_features[test_file]))

# Stack the per-seed predictions: one array per file with the seed on axis 0.
preds = {test_file: np.array(file_preds) for test_file, file_preds in preds.items()}

print('train done')

# Fill the submission's q_0.1 ... q_0.9 columns with the across-seed quantiles
# of the ensemble predictions for each test file.
# The part of each submission id before the first '_' is compared against the
# test file name (e.g. '0.csv'); computed once, outside the loop.
sub_file_names = sub['id'].map(lambda x: x.split('_')[0])

for key, file_preds in preds.items():
    # os.path.basename copes with the platform's path separator; splitting on
    # a literal backslash only worked on Windows.
    file = os.path.basename(key)
    rows = sub_file_names == file
    for i in range(1, 10):
        q = i/10
        # Single .loc assignment on `sub` itself. The previous chained form
        # (sub.loc[a:b][col] = ...) may write to a temporary copy and leave
        # `sub` unchanged.
        sub.loc[rows, f'q_{q}'] = np.quantile(file_preds, q, axis=0).ravel()

In [None]:
# Create the output directory if it is missing so the write cannot fail after
# the long training run; then save the filled-in submission without the index.
os.makedirs('./sub', exist_ok=True)
sub.to_csv('./sub/sample.csv', index=False)