In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from IPython.display import FileLink
from tqdm import tqdm_notebook 
from tqdm.notebook import tqdm
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
import warnings
from joblib import Parallel, delayed
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
# Silence every Python warning for the rest of the session; this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Path template for the per-stock CSV files; the placeholder is filled with the
# stock number via train_file.format(stock).
train_file = 'training_files/result_{}.csv'
# CSV (indexed by 'ID') that supplies the Open_hat / High_hat / Low_hat /
# Close_hat columns joined onto each per-stock frame.
super_train = 'super_train.csv'


In [None]:
# Load one stock (number 1) for interactive exploration; 'ID' becomes the index
# and 'Date' is parsed to datetimes.
stock = 1
df = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
    
df

In [None]:
# Load the frame holding the *_hat columns, indexed by 'ID' so it can be joined
# onto the per-stock frames.
tdf = pd.read_csv(super_train, index_col='ID')

tdf

In [None]:
# print([i for i in df.columns if 'Open' == i])
# print([i for i in df.columns if 'High' == i])
# print([i for i in df.columns if 'Low' == i])
# print([i for i in df.columns if 'holiday' == i])





In [None]:
# Attach the four *_hat columns from tdf; DataFrame.join aligns on the index
# ('ID' in both frames).
df = df.join(tdf[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat']])

In [None]:
# Columns that are label-encoded (LabelEncoder) before modelling.
cat_cols = [ 
    'holiday',
    'stock',
    'day',
     'month',
     'year',
     'dayofweek',
     'dayofyear',
     'weekofyear',
    'unpredictability_score']
# Columns the rolling-window features are computed from (see rolled_mean).
# Despite the name, these columns are not dropped from the feature matrix.
excluded_cols = ['Close_hat', 'Open_hat', 'High_hat', 'Low_hat']
                 

In [None]:
def expand_df(dframe):
    """Return a copy of ``dframe`` with calendar and elapsed-time features added.

    ``dframe`` must have a datetime ``Date`` column. The input frame is not
    modified.

    Added columns
    -------------
    day, month, year, dayofweek, dayofyear, weekofyear
        Calendar fields of ``Date``.
    year_diff
        ``year - 2017``.
    days_so_far_skipped
        Number of rows in the frame whose ``Date`` is strictly earlier than
        this row's ``Date`` (rows sharing a date get the same value).
    days_so_far
        Whole calendar days elapsed since 2017-01-03.
    """
    dFrame = dframe.copy()
    dates = dFrame['Date']

    # Vectorised datetime accessors instead of one Python lambda call per row.
    dFrame['day'] = dates.dt.day
    dFrame['month'] = dates.dt.month
    dFrame['year'] = dates.dt.year
    dFrame['dayofweek'] = dates.dt.dayofweek
    dFrame['dayofyear'] = dates.dt.dayofyear
    # Kept per-timestamp on purpose: Series.dt.weekofyear was removed in
    # pandas 2.0, whereas Timestamp.weekofyear works on every pandas version.
    dFrame['weekofyear'] = dates.apply(lambda ts: ts.weekofyear)
    dFrame['year_diff'] = dates.dt.year - 2017
    # rank(method='min') - 1 is exactly "how many dates are strictly earlier",
    # which replaces the former full-frame scan per row (O(n^2)).
    # Missing dates get 0, matching what the scan produced for them.
    dFrame['days_so_far_skipped'] = (
        dates.rank(method='min').sub(1).fillna(0).astype('int64')
    )
    dFrame['days_so_far'] = (dates - pd.Timestamp('2017-01-03')).dt.days
    return dFrame
#     return dFrame[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat'] + ['Date'] + cat_cols + ['year_diff', 'days_so_far_skipped', 'days_so_far'] + ['Close', 'Open', 'High', 'Low'] ]

In [None]:
# Add the calendar and elapsed-time feature columns to the exploration frame.
df = expand_df(df)

In [None]:
# for col in df.columns:
#     if df[col].isna().sum() != 0: print(col)

In [None]:
def get_rolling_mean(dframe, col, idx, days=30, met='mean'):
    """Aggregate ``col`` over the forward window that starts at row ``idx``.

    The window contains every row whose ``days_so_far_skipped`` lies in
    ``[start, start + days)``, where ``start`` is the value of
    ``days_so_far_skipped`` on the row labelled ``idx``. The row itself is
    therefore always part of its own window.

    ``met`` selects the statistic: ``'mean'``, ``'max'``, and any other value
    yields the minimum.
    """
    start = dframe.loc[idx]['days_so_far_skipped']
    position = dframe['days_so_far_skipped']
    window = dframe[(position < start + days) & (position >= start)][col]

    if met == 'mean':
        return window.mean()
    if met == 'max':
        return window.max()
    return window.min()

def rolled_mean(dframe, timeshift=30):
    """Add forward-window mean/max/min/range features for each ``excluded_cols`` column.

    Two windows are used per source column: one of ``timeshift`` steps
    (columns suffixed ``_per_mon``) and one of 2 steps (suffix ``_per_d``).
    Windows are measured in ``days_so_far_skipped`` units, as implemented by
    ``get_rolling_mean``.

    Side effect: an 'ID' column (copy of the index) and all new feature
    columns are written into the caller's ``dframe``. The returned frame is
    that data re-indexed by 'ID'.
    """
    dframe['ID'] = dframe.index

    # (column suffix, window length), in the order the columns are created.
    windows = (('_per_mon', timeshift), ('_per_d', 2))

    for col in excluded_cols:
        for suffix, span in windows:
            for stat in ('mean', 'max', 'min'):
                dframe[col + '_roll_' + stat + suffix] = dframe['ID'].apply(
                    lambda row_id: get_rolling_mean(dframe, col, row_id, days=span, met=stat)
                )
            dframe[col + '_roll_range' + suffix] = (
                dframe[col + '_roll_max' + suffix] - dframe[col + '_roll_min' + suffix]
            )

    return dframe.set_index('ID', drop=True)

In [None]:
def rolled_mean1(dframe):
    """Add ``<col>_roll_{mean,max,min,range}`` columns for each ``excluded_cols`` column.

    Alternative to ``rolled_mean`` built on tsfresh's ``roll_time_series``
    (grouped by 'stock', ordered by 'Date', ``max_timeshift=30``,
    ``min_timeshift=0``). It is not called anywhere in the visible notebook.

    Side effect: an 'ID' column and the new feature columns are written into
    the caller's ``dframe``; the returned frame is re-indexed by 'ID'.

    NOTE(review): each statistic is taken over the rolled rows whose original
    'ID' equals the current row. Those are presumably copies of that one row
    appearing in different windows -- if so, mean/max/min all equal the raw
    value and the range is always 0. Confirm against the layout that
    roll_time_series returns before using this function.
    """
    dframe['ID'] = dframe.index

    # Roll on a copy so the caller's frame keeps its original rows.
    dFrame = dframe.copy()
    dFrame = roll_time_series(dFrame, show_warnings=False, disable_progressbar=True, column_id='stock', column_sort='Date', max_timeshift=30, min_timeshift=0)
    for col in excluded_cols:
        dframe[col + '_roll_mean'] = dframe['ID'].apply(lambda x: dFrame[dFrame['ID'] == x][col].mean())
        dframe[col + '_roll_max'] = dframe['ID'].apply(lambda x: dFrame[dFrame['ID'] == x][col].max())
        dframe[col + '_roll_min'] = dframe['ID'].apply(lambda x: dFrame[dFrame['ID'] == x][col].min())
        dframe[col + '_roll_range'] = dframe[col + '_roll_max'] - dframe[col + '_roll_min']
        
#     dframe['Close_hat'] = dframe['Close'].fillna(dframe[dframe['Close'].isna()]['Close_hat'])
#     dframe['Open_hat'] = dframe['Open'].fillna(dframe[dframe['Close'].isna()]['Open_hat'])
#     dframe['High_hat'] = dframe['High'].fillna(dframe[dframe['Close'].isna()]['High_hat'])
#     dframe['Low_hat'] = dframe['Low'].fillna(dframe[dframe['Close'].isna()]['Low_hat'])
    
            
    
    return dframe.set_index('ID', drop=True)


In [None]:
# Shape of the frame before the rolling features are added.
df.shape

In [None]:

# Add the rolling mean/max/min/range features (default 30-step window plus the
# fixed 2-step window) and display the result.
df = rolled_mean(df)
df

In [None]:
# df = rolled_mean(df)
# Shape of the frame after the rolling features were added.
df.shape

In [None]:
# Sanity check: True only if the 30-step rolling mean is element-wise identical
# to the raw Close_hat column.
df['Close_hat'].equals(df['Close_hat_roll_mean_per_mon'])

In [None]:
# Total signed difference between Close_hat and its 30-step rolling mean.
# Uses the built-in sum, so a single NaN makes the whole result NaN.
sum(df['Close_hat'] - df['Close_hat_roll_mean_per_mon'])

In [None]:
# Total signed error of Close_hat against Close, over rows with a non-null Close.
sum(df[df['Close'].notna()]['Close'] - df[df['Close'].notna()]['Close_hat'])

In [None]:
# Columns of tdf that are also listed in excluded_cols; the lookup set is built
# once instead of once per column.
rolled_sources = set(excluded_cols)
[name for name in tdf.columns.tolist() if name in rolled_sources]

In [None]:

# ['mean', 'maximum', 'minimum']
# List every column currently present in df.
df.columns.tolist()

In [None]:
# Re-display the frame that supplies the *_hat columns.
tdf

In [None]:
# Replace each categorical column with integer codes. One encoder is re-fitted
# for every column, so only the last column's fitted state is retained and the
# earlier encodings cannot be inverted afterwards.
encoder = LabelEncoder()
for col in tqdm(cat_cols):
    df[col] = encoder.fit_transform(df[col])

In [None]:
# df['train'] = df['Close'].apply(lambda x: not pd.isna(x))
# df.columns = [i.replace('{', '_').replace('}', '_') for i in df.columns]
# Sanitise column names: every non-alphanumeric character is replaced by '_'.
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]

In [None]:
# Rows with a known Close form the training set; rows whose Close is missing
# are the ones to predict. The four raw price columns are removed from the
# feature matrices in both cases.
price_cols = ['Close', 'Open', 'High', 'Low']
has_close = df['Close'].notna()

X = df[has_close].drop(columns=price_cols)
y = df.loc[has_close, ['Close', 'stock']]

X_test = df[~has_close].drop(columns=price_cols)
y_test = df.loc[~has_close, ['Close', 'stock']]

In [None]:
# Inspect the elapsed-days feature (calendar days since 2017-01-03) of the training rows.
X['days_so_far']

In [None]:
# Display the training feature matrix.
X

In [None]:
# Inspect the 30-step rolling mean of Close_hat on the training rows.
X['Close_hat_roll_mean_per_mon'] 

In [None]:
# Inspect the 30-step rolling minimum of Close_hat on the training rows.
X['Close_hat_roll_min_per_mon']

In [None]:
# Display the training targets (Close) together with the stock column.
y

In [None]:
# hyper_params = {
#     'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': ['rmse'],
#     'learning_rate': 0.1,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.7,
#     'bagging_freq': 10,
#     'verbose': 0,
#     "max_depth": 4,
#     "num_leaves": 128,  
#     "max_bin": 512,
#     "num_iterations": 100000,
#     "n_estimators": 1000,
#     "random_state": 32
# }
# model_store = [0] * 103
# metrics = [0] * 103
# preds = []
# # for stock in tqdm(X.stock.unique(), total=103):
# for stock in tqdm(range(103), total=103):
# # for stock in tqdm([1]):
#     df = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
#     df = expand_df(df)
#     encoder = LabelEncoder()
#     for col in cat_cols:
#         df[col] = encoder.fit_transform(df[col])
        
#     df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]
#     X, y = df[df['Close'].notna()].drop(columns=['Close', 'Open', 'High', 'Low'], axis=1), df[['Close', 'stock']][df['Close'].notna()]
#     X_test, y_test = df[df['Close'].isna()].drop(columns=['Close', 'Open', 'High', 'Low'], axis=1), df[['Close', 'stock']][df['Close'].isna()]
    
    
#     X_tr, X_val, y_tr, y_val = train_test_split(X.drop(columns=['Date']), y['Close'], train_size=0.80, random_state=11568)
    
    
#     model_store[stock] = lgb.LGBMRegressor(**hyper_params)
#     model_store[stock].fit(X_tr, 
#                            y_tr,
#                            eval_set=[(X_val, y_val)],
#                            eval_metric='rmse',
#                            early_stopping_rounds=100,
#                            verbose=False,
#                            )
#     metrics[stock] = list(list(model_store[stock].best_score_.values())[0].values())[0]
#     print(stock, metrics[stock])
#     preds.append(pd.DataFrame({'ID': X_test.index, 'Close': model_store[stock].predict(X_test.drop(columns=['Date']), num_iteration=model_store[stock].best_iteration_)}))



# # pd.concat(preds).to_csv('result.csv', index=False)
# # FileLink('result.csv')

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Intended for single-output (1-D) targets, which is how the notebook uses
    it as a scorer. The result is always non-negative.
    """
    # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
    # `squared` keyword was deprecated in scikit-learn 1.4 and later removed,
    # while this form works on every scikit-learn version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
# One placeholder slot per stock index (the run below iterates range(103)).
# Both lists are only referenced from commented-out code in this notebook.
model_store1 = [0] * 103
metrics1 = [0] * 103
# df = None

# Hyper-parameter grid searched by GridSearchCV inside get_predictions.
params = {'learning_rate': [.3, .35, .45], 'depth': [2, 3], 'od_wait': [15, 20, 25]}


# Replaced by the list of per-stock result frames once the parallel run finishes.
preds1 = []

def get_predictions(stock):
    """Tune a CatBoost regressor for one stock and predict its missing ``Close`` values.

    Pipeline: load the stock's CSV, join the ``*_hat`` columns from the
    module-level ``tdf``, add calendar features (``expand_df``) and
    rolling-window features (``rolled_mean``), label-encode ``cat_cols``,
    sanitise column names, then grid-search ``params`` with 2-fold CV on the
    rows that have a ``Close`` and predict the rows that do not.

    Parameters
    ----------
    stock : int
        Stock number substituted into ``train_file``.

    Returns
    -------
    pandas.DataFrame
        One row per row with a missing ``Close``. Columns: ``Params`` (string
        form of the best grid parameters), ``Score`` (``best_score_``; the
        scorer negates RMSE, so this value is <= 0), ``ID`` and ``Close``
        (the prediction).
    """
    df = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
    df = df.join(tdf[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat']])
    df = expand_df(df)
    df = rolled_mean(df)

    encoder = LabelEncoder()
    for col in cat_cols:
        df[col] = encoder.fit_transform(df[col])

    df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]

    # Rows with a known Close are used for fitting; the rest are predicted.
    price_cols = ['Close', 'Open', 'High', 'Low']
    has_close = df['Close'].notna()
    # NOTE(review): X still contains the 'Date' column here, unlike
    # get_predictions1 which drops it before fitting -- confirm this is intended.
    X = df[has_close].drop(columns=price_cols)
    y = df.loc[has_close, 'Close']
    X_test = df[~has_close].drop(columns=price_cols)

    # depth, learning_rate and od_wait are supplied by the grid in `params`.
    model = CatBoostRegressor(
        loss_function='RMSE',
        iterations=800,
        random_seed=18,
        od_type='Iter',
        thread_count=1,
    )

    grid = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=2,
                        scoring=make_scorer(rmse, greater_is_better=False), verbose=0)
    grid.fit(X, y)

    return pd.DataFrame({'Params': str(grid.best_params_), 'Score': grid.best_score_, 'ID': X_test.index, 'Close': grid.predict(X_test)})


# Run get_predictions for stocks 0..102 across 4 joblib workers. Each worker's
# GridSearchCV is itself created with n_jobs=2.
num_cores = 4
preds1 = Parallel(n_jobs=num_cores)(delayed(get_predictions)(stock) for stock in tqdm(range(103)))
# preds1 = [get_predictions(stock) for stock in tqdm(range(1))]

In [None]:
# Best grid parameters (as a string) found for the first stock.
preds1[0]['Params'].iloc[0]

In [None]:
# One placeholder slot per stock index; only referenced from commented-out code.
model_store = [0] * 103
metrics = [0] * 103

preds = []

# Hyper-parameter grid searched by GridSearchCV inside get_predictions1.
# Single-element lists pin that parameter to one value.
parameters = {
    # 'nthread': [1],  # when use hyperthread, xgboost may become slower
    'objective': ['reg:squarederror'],
    'learning_rate': [.1, .2, .3],  # so called `eta` value
    'max_depth': [7, 8, 6],
    'min_child_weight': [4],
    # `silent` was replaced by `verbosity` in XGBoost and now only triggers
    # "parameter not used" warnings; verbosity=0 keeps training quiet.
    'verbosity': [0],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
}


def get_predictions1(stock):
    """Tune an XGBoost regressor for one stock and predict its missing ``Close`` values.

    Same data preparation as ``get_predictions``: load the stock's CSV, join
    the ``*_hat`` columns from ``tdf``, add calendar and rolling-window
    features, label-encode ``cat_cols`` and sanitise column names. The 'Date'
    column is dropped before fitting and predicting, and ``parameters`` is
    grid-searched with 2-fold CV.

    Parameters
    ----------
    stock : int
        Stock number substituted into ``train_file``.

    Returns
    -------
    pandas.DataFrame
        One row per row with a missing ``Close``. Columns: ``Params`` (string
        form of the best grid parameters), ``Score`` (absolute value of
        ``best_score_``, i.e. the RMSE), ``ID`` and ``Close`` (the prediction).
    """
    df = pd.read_csv(train_file.format(stock), index_col='ID', parse_dates=['Date'])
    df = df.join(tdf[['Open_hat', 'High_hat', 'Low_hat', 'Close_hat']])
    df = expand_df(df)
    df = rolled_mean(df)

    encoder = LabelEncoder()
    for col in cat_cols:
        df[col] = encoder.fit_transform(df[col])

    df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]

    # Rows with a known Close are used for fitting; the rest are predicted.
    price_cols = ['Close', 'Open', 'High', 'Low']
    has_close = df['Close'].notna()
    X = df[has_close].drop(columns=price_cols)
    y = df.loc[has_close, 'Close']
    X_test = df[~has_close].drop(columns=price_cols)

    # NOTE(review): 'gpu_hist' / gpu_id need a GPU build of XGBoost and are
    # deprecated in XGBoost 2.x -- confirm against the installed version.
    model = XGBRegressor(tree_method='gpu_hist', gpu_id=0)

    xgb_grid = GridSearchCV(model,
                           parameters,
                           cv=2,
                           n_jobs=1,
                           scoring=make_scorer(rmse, greater_is_better=False),
                           verbose=False
                           )

    xgb_grid.fit(X.drop('Date', axis=1), y)

    return pd.DataFrame({'Params': str(xgb_grid.best_params_), 'Score': abs(xgb_grid.best_score_), 'ID': X_test.index, 'Close': xgb_grid.predict(X_test.drop('Date', axis=1))})


# Rebinds num_cores (4 for the CatBoost run above) to 16; it is only consumed by
# the XGBoost run, which is currently commented out.
num_cores = 16
# preds = Parallel(n_jobs=num_cores)(delayed(get_predictions1)(stock) for stock in tqdm(range(103)))

# preds = [get_predictions1(stock) for stock in tqdm(range(1))]

In [None]:
# preds[0]

In [None]:
# preds = []
# for stock in X.stock.unique(): 

# # for stock in [1]:
#     pred1 = model_store[stock].predict(X_test[X_test['stock'] == stock].drop(columns=['Date']), num_iteration=model_store[stock].best_iteration_)
#     pred2 = model_store1[stock].predict(X_test[X_test['stock'] == stock])
    
#     if metrics[stock] > metrics1[stock]:
#         print("picked first")
# #         preds.append(pd.DataFrame({'ID': X_test[X_test['stock'] == stock].index, 'Close': model_store1[stock].predict(X_test[X_test['stock'] == stock])}))
#         preds.append(pd.DataFrame({'ID': X_test[X_test['stock'] == stock].index, 'Close': (pred2 * 7 + pred1) / 8}))
    
#     else:
#         print("picked second")
# #         preds.append(pd.DataFrame({'ID': X_test[X_test['stock'] == stock].index, 'Close': model_store[stock].predict(X_test[X_test['stock'] == stock].drop(columns=['Date']), num_iteration=model_store[stock].best_iteration_)}))
#         preds.append(pd.DataFrame({'ID': X_test[X_test['stock'] == stock].index, 'Close': (pred2 * 4 + pred1) / 5}))
    

# Stack the per-stock CatBoost results once and write both outputs from it.
all_preds = pd.concat(preds1)

# Predictions only: ID and predicted Close.
all_preds[['ID', 'Close']].to_csv('result_latest.csv', index=False)

# Full table, including the best parameters and CV score per stock.
all_preds.to_csv('result_catboost_analysis_latest.csv', index=False)

# pd.concat(preds)[['ID', 'Close']].to_csv('result1.csv', index=False)
# pd.concat(preds1).shape       

In [None]:
# Display the stacked per-stock CatBoost results.
pd.concat(preds1)

In [None]:
# pd.concat(preds)['Params'].iloc[0]

In [None]:
# Download link for the predictions file written above. The former target,
# 'result.csv', is only produced by commented-out code, so on a fresh run the
# link pointed at a missing (or stale) file.
FileLink('result_latest.csv')

In [None]:
# FileLink('result1.csv')

In [None]:

# Print the whole predictions file to the cell output (shell command).
!cat result_latest.csv

In [None]:
# for stock in X_test.stock.unique():
# !cat result1.csv

In [None]:
# Print the whole analysis file (parameters, score, predictions) to the cell output.
!cat result_catboost_analysis_latest.csv