In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from IPython.display import FileLink
# from tqdm import tqdm_notebook 
from tqdm import tqdm
import lightgbm as lgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
import warnings

from joblib import Parallel, delayed
from sklearn.model_selection import PredefinedSplit

from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
warnings.filterwarnings('ignore')

In [None]:
# Path template for the per-stock input CSV; `{}` is filled with the stock number via str.format.
train_file = '../input/df-exploded/result_{}.csv'
# Path of the CSV that supplies the *_hat columns joined onto each per-stock frame.
super_train = '../input/df-exploded/super_train.csv'


In [None]:
# Load the frame for a single stock, indexed by ID, with Date parsed as datetimes.
stock = 1
df = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
    
# Display the loaded frame.
df

In [None]:
# Load the frame holding the *_hat columns, indexed by ID so it can be joined onto per-stock frames.
tdf = pd.read_csv(super_train, index_col='ID')

# Display the loaded frame.
tdf

In [None]:
def expand_df(dframe):
    """Return a copy of ``dframe`` with calendar features derived from ``Date``.

    All features are computed with vectorised datetime accessors rather than
    per-row Python lambdas; in particular ``days_so_far_skipped`` is obtained
    from a min-rank instead of re-filtering the whole frame once per row.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame with a datetime64 ``Date`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with these additional columns:

        * ``day``, ``month``, ``year``, ``dayofweek``, ``dayofyear``
        * ``weekofyear`` - ISO week number
        * ``year_diff`` - ``year - 2017``
        * ``days_so_far_skipped`` - number of rows whose ``Date`` is strictly
          earlier than this row's ``Date``
        * ``days_so_far`` - whole calendar days elapsed since 2017-01-03
    """
    expanded = dframe.copy()
    dates = expanded['Date']
    when = dates.dt

    expanded['day'] = when.day
    expanded['month'] = when.month
    expanded['year'] = when.year
    expanded['dayofweek'] = when.dayofweek
    expanded['dayofyear'] = when.dayofyear

    # isocalendar() yields a nullable UInt32 column; convert to a plain numpy
    # dtype (float only when missing dates force NaN) so downstream code sees
    # ordinary numbers.
    iso_week = when.isocalendar().week
    expanded['weekofyear'] = iso_week.astype('float64' if dates.isna().any() else 'int64')

    expanded['year_diff'] = when.year - 2017

    # The min-rank of a date is 1 + (number of strictly earlier dates), so
    # rank - 1 equals the count the original per-row filter produced.
    # Missing dates compare False against everything, i.e. a count of 0.
    expanded['days_so_far_skipped'] = (
        dates.rank(method='min').fillna(1).sub(1).astype('int64')
    )

    expanded['days_so_far'] = (dates - pd.Timestamp('2017-01-03')).dt.days
    return expanded

In [None]:
# Columns that are replaced by integer codes (LabelEncoder) before modelling.
cat_cols = [
    'holiday',
    'stock',
    'day',
    'month',
    'year',
    'dayofweek',
    'dayofyear',
    'weekofyear',
    'year_diff',
    'unpredictability_score',
]

# Columns for which rolled_mean computes rolling-window statistics.
excluded_cols = [
    'Close_hat',
    'Open_hat',
    'High_hat',
    'Low_hat',
]
                 

In [None]:
def get_rolling_mean(dframe, col, idx, days=30, met='mean'):
    """Aggregate ``col`` over a forward-looking window anchored at row ``idx``.

    The window contains every row whose ``days_so_far_skipped`` value lies in
    ``[start, start + days)``, where ``start`` is the ``days_so_far_skipped``
    value of the row labelled ``idx``.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame containing ``days_so_far_skipped`` and ``col``.
    col : str
        Column to aggregate.
    idx : index label
        Label of the anchor row in ``dframe``.
    days : int, default 30
        Width of the window, in units of ``days_so_far_skipped``.
    met : str, default 'mean'
        ``'mean'`` or ``'max'``; any other value yields the minimum.

    Returns
    -------
    scalar
        The requested statistic of ``col`` over the window.
    """
    position = dframe['days_so_far_skipped']
    start = dframe.loc[idx]['days_so_far_skipped']
    in_window = (position < start + days) & (position >= start)
    values = dframe[in_window][col]

    if met == 'mean':
        return values.mean()
    if met == 'max':
        return values.max()
    # Anything other than 'mean'/'max' falls through to the minimum.
    return values.min()

def rolled_mean(dframe, timeshift=30, short_timeshift=2):
    """Add rolling-window statistics for every column in ``excluded_cols``.

    For each such column and for each of the two window widths, the mean,
    max, min and range (max - min) over the window returned by
    ``get_rolling_mean`` are stored in new columns named
    ``<col>_roll_<stat>_per_mon`` (width ``timeshift``) and
    ``<col>_roll_<stat>_per_d`` (width ``short_timeshift``).

    The computation runs on a copy, so the frame passed in is left untouched
    (no ``ID`` helper column or feature columns leak back to the caller).

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame with a ``days_so_far_skipped`` column, the columns listed in
        ``excluded_cols``, and an index whose labels identify rows.
    timeshift : int, default 30
        Window width used for the ``*_per_mon`` features.
    short_timeshift : int, default 2
        Window width used for the ``*_per_d`` features.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed by ``ID``, containing the original columns plus
        the rolling features.
    """
    frame = dframe.copy()
    # get_rolling_mean looks rows up by label, so expose the labels as a column.
    frame['ID'] = frame.index

    def _window_stat(col, days, met):
        # One statistic of `col` for every row's own window.
        return frame['ID'].apply(
            lambda row_id: get_rolling_mean(frame, col, row_id, days=days, met=met)
        )

    windows = (('per_mon', timeshift), ('per_d', short_timeshift))
    for col in excluded_cols:
        stem = col + '_roll_'
        for suffix, days in windows:
            for met in ('mean', 'max', 'min'):
                frame[stem + met + '_' + suffix] = _window_stat(col, days, met)
            frame[stem + 'range_' + suffix] = (
                frame[stem + 'max_' + suffix] - frame[stem + 'min_' + suffix]
            )

    return frame.set_index('ID', drop=True)

In [None]:
def split_time_series(ts, ys, train_size):
    """Split features and targets positionally into a leading and a trailing part.

    Parameters
    ----------
    ts : sliceable with a ``shape`` attribute
        Feature rows; ``ts.shape[0]`` determines the split position.
    ys : sliceable
        Targets aligned row-for-row with ``ts``.
    train_size : float
        Fraction of rows assigned to the leading (training) part; the split
        index is ``int(train_size * ts.shape[0])``.

    Returns
    -------
    tuple
        ``(train_ts, val_ts, train_ys, val_ys)``.
    """
    cut = int(train_size * ts.shape[0])
    leading, trailing = slice(None, cut), slice(cut, None)
    return (ts[leading], ts[trailing], ys[leading], ys[trailing])

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``; that keyword is deprecated in recent
    scikit-learn releases and absent from the newest ones.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        The RMSE. With several outputs, the RMSE is computed per output and
        the results are averaged uniformly.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return np.sqrt(per_output_mse).mean()

In [None]:
# Attach the *_hat columns from tdf (joined on the index), then add the calendar features.
# NOTE(review): this cell rebinds `df`, so running it twice on the same kernel
# operates on the already-joined frame — re-run the loading cell first.
df = df.join(tdf[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat']])
df = expand_df(df)

# Add rolling-window statistics for every column listed in excluded_cols.
df = rolled_mean(df)

# Replace each column in cat_cols with integer codes; the encoder is re-fit for every column.
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Replace every non-alphanumeric character in the column names with "_".
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]

# Rows with a known Close form (X, y); rows whose Close is missing form (X_test, y_test).
# The raw price columns and Date are dropped from the features.
X, y = df[df['Close'].notna()].drop(columns=['Close', 'Open', 'High', 'Low', 'Date'], axis=1), df[['Close']][df['Close'].notna()]
X_test, y_test = df[df['Close'].isna()].drop(columns=['Close', 'Open', 'High', 'Low', 'Date'], axis=1), df[['Close']][df['Close'].isna()]

# Positional 80/20 split (first 80% of rows vs. the rest).
# NOTE(review): this is chronological only if the rows are ordered by date — confirm.
X_tr, X_val, y_tr, y_val = split_time_series(X, y, train_size=.8)


In [None]:
# Display the validation targets (trailing 20% of the labelled rows).
y_val

In [None]:
# Display the validation features (trailing 20% of the labelled rows).
X_val

In [None]:
# List the column names after feature engineering and name sanitisation.
df.columns.tolist()

In [None]:
# Per-stock placeholders (one slot for each of the 103 stocks).
model_store = [0] * 103
metrics = [0] * 103

preds = []

# Hyper-parameter grid handed to GridSearchCV; every key maps to the list of
# candidate values to try for that XGBRegressor parameter.
parameters = {
    # 'nthread': [1],  # when use hyperthread, xgboost may become slower
    'objective': ['reg:squarederror'],
    'learning_rate': [.4],  # so called `eta` value
    'max_depth': [6],
    # 'min_child_weight': [4],
    # 'silent': [1],
    # 'subsample': [0.7, .9],
    # 'colsample_bytree': [0.7],
    'n_estimators': [500],
    # 'early_stopping_rounds' is deliberately not part of the grid: the search
    # calls fit(X, y) without an eval_set, and early stopping cannot work
    # without a validation set (XGBoost versions that accept it as an
    # estimator parameter raise at fit time; older ones ignore it).
}


def get_predictions1(stock):
    """Fit the XGBoost grid search for one stock and predict its unlabelled rows.

    The per-stock CSV is loaded, joined with the ``*_hat`` columns of ``tdf``,
    expanded with calendar and rolling-window features, label-encoded on
    ``cat_cols`` and given sanitised column names. Rows with a known ``Close``
    are used for the search (leading 80% train, trailing 20% as the single
    predefined validation fold); rows with a missing ``Close`` are predicted.

    Parameters
    ----------
    stock : int
        Stock number substituted into the ``train_file`` path template.

    Returns
    -------
    pandas.DataFrame
        One row per unlabelled sample with columns ``Stock``, ``Params``
        (string form of the best parameters), ``Score`` (best CV score),
        ``ID`` and ``Close`` (the prediction).
    """
    non_feature_cols = ['Close', 'Open', 'High', 'Low', 'Date']

    frame = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
    frame = frame.join(tdf[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat']])
    frame = rolled_mean(expand_df(frame))

    label_encoder = LabelEncoder()
    for name in cat_cols:
        frame[name] = label_encoder.fit_transform(frame[name])

    # Keep only alphanumeric characters in column names; everything else becomes "_".
    frame.columns = [
        "".join(ch if ch.isalnum() else "_" for ch in str(label))
        for label in frame.columns
    ]

    has_close = frame['Close'].notna()
    features = frame.drop(columns=non_feature_cols)
    X, y = features[has_close], frame.loc[has_close, ['Close']]
    X_test = features[~has_close]

    X_tr, X_val, _, _ = split_time_series(X, y, train_size=.8)

    # PredefinedSplit convention: -1 = never used for validation, 0 = validation fold 0.
    fold_ids = np.concatenate([
        np.full((X_tr.shape[0],), -1, dtype=int),
        np.full((X_val.shape[0],), 0, dtype=int),
    ])

    search = GridSearchCV(
        XGBRegressor(tree_method='gpu_hist', gpu_id=0),
        parameters,
        cv=PredefinedSplit(fold_ids),
        n_jobs=2,
        scoring=make_scorer(rmse, greater_is_better=False),
        verbose=False,
    )
    search.fit(X, y)

    return pd.DataFrame({
        'Stock': stock,
        'Params': str(search.best_params_),
        'Score': search.best_score_,
        'ID': X_test.index,
        'Close': search.predict(X_test),
    })
# Run get_predictions1 for stocks 0..102 on two joblib workers; `preds` becomes
# a list with one result frame per stock, in stock order.
num_cores = 2
preds = Parallel(n_jobs=num_cores)(delayed(get_predictions1)(stock) for stock in tqdm(range(103)))

# Sequential alternative:
# preds = [get_predictions1(stock) for stock in tqdm(range(103))]

In [None]:
# Write the ID/Close predictions of all stocks to a single CSV.
pd.concat(preds)[['ID', 'Close']].to_csv('result_xgboost.csv', index=False)


In [None]:
# Inspect the result frame of the first stock.
preds[0]

In [None]:
# Render a link to the predictions file written above.
FileLink('result_xgboost.csv')

In [None]:
# Print the whole predictions file via the shell.
!cat result_xgboost.csv

In [None]:
# Write every result column (Stock, Params, Score, ID, Close) to a second CSV and link to it.
pd.concat(preds).to_csv("result_xgboost_analysis.csv", index=False)
FileLink('result_xgboost_analysis.csv')

In [None]:
# Print the whole analysis file via the shell.
!cat result_xgboost_analysis.csv

In [None]:
# Show the distinct best-parameter strings selected across all stocks.
pd.concat(preds)['Params'].unique()