In [37]:
import numpy as np
import pandas as pd
import joblib

read_data

In [121]:
def read_data(path, path_ihsg, 
              save_file = True,
              return_file = True,
              set_index = None):


    emiten = pd.read_csv(path, index_col = set_index)
    ihsg = pd.read_csv(path_ihsg, index_col = set_index)
    merged = pd.merge(emiten, ihsg, how='left', on='Date')
    merged['Close+1'] = merged['Close_x'].shift(-1)

    if save_file:
        joblib.dump(merged, "output/merged.pkl")
    
    if return_file:
        return merged



def split_input_output(dataset,
                       target_column,
                       save_file = True,
                       return_file = True):
    
    output_df = dataset[target_column]
    input_df = dataset.drop([target_column],
                            axis = 1)
    
    if save_file:
        joblib.dump(output_df, "output/output_df.pkl")
        joblib.dump(input_df, "output/input_df.pkl")
    
    if return_file:
        return output_df, input_df

def x_split(input_df, return_file=True, save_file=True):
    X_train = input_df[:int(input_df.shape[0]*0.6)]
    test_val = input_df[int(input_df.shape[0]*0.6):]
    X_valid = test_val[:int(test_val.shape[0]*0.5)]
    X_test = test_val[int(test_val.shape[0]*0.5):]

    if save_file:
        joblib.dump(X_train, "output/X_train.pkl")
        joblib.dump(X_valid, "output/X_valid.pkl")
        joblib.dump(X_test, "output/X_test.pkl")

    if return_file:
        return X_train, X_valid, X_test
        
def y_split(output_df, return_file=True, save_file=True):
    y_train = output_df[:int(output_df.shape[0]*0.6)]
    y_test_val = output_df[int(output_df.shape[0]*0.6):]
    y_valid = y_test_val[:int(y_test_val.shape[0]*0.5)]
    y_test = y_test_val[int(y_test_val.shape[0]*0.5):]
    
    if save_file:
        joblib.dump(y_train, "output/y_train.pkl")
        joblib.dump(y_valid, "output/y_valid.pkl")
        joblib.dump(y_test, "output/y_test.pkl")

    if return_file:
        return y_train, y_valid, y_test

In [122]:
DATA_PATH = "data/AMRT.csv"
DATA_PATH_IHSG = "data/ihsg.csv"
TARGET_COLUMN = "Close+1"
INDEX_COLUMN = "Date"

data_house = read_data(DATA_PATH, DATA_PATH_IHSG,
                       set_index = INDEX_COLUMN)
output_df, input_df = split_input_output(
                            data_house,
                            TARGET_COLUMN)

X_train, X_valid, X_test = x_split(input_df)
y_train, y_valid, y_test = y_split(output_df)

PREPROCESSING

In [123]:
def process_emiten(proceed):
    proceed = proceed.drop(['Open_x', 'Open_y', 'High_x', 'High_y', 'Low_x', 'Low_y', 'Adj Close_x', 'Adj Close_y'], axis = 1)
    proceed.rename(columns = {'Close_x':'Close', 'Volume_x':'Volume', 'Close_y':'Close_ihsg', 'Volume_y':'Volume_ihsg'}, inplace = True)
    proceed["Volume"].replace(to_replace=0, method='bfill')
    proceed["Volume_ihsg"].replace(to_replace=0, method='bfill')
    proceed.replace(to_replace=0, method='bfill')
    return proceed

def process_y(y_data):
    y_data.replace(to_replace=0, method='bfill')
    return y_data

In [124]:
def processing_data(save_file=True, return_file=True):
    X_train = process_emiten(joblib.load("output/X_train.pkl"))
    X_valid = process_emiten(joblib.load("output/X_valid.pkl"))
    X_test = process_emiten(joblib.load("output/X_test.pkl"))
    y_train = process_y(joblib.load("output/y_train.pkl"))
    y_valid = process_y(joblib.load("output/y_valid.pkl"))
    y_test = process_y(joblib.load("output/y_test.pkl"))
    
    if save_file:
        joblib.dump(X_train, "output/X_train_proceed.pkl")
        joblib.dump(X_valid, "output/X_valid_proceed.pkl")
        joblib.dump(X_test, "output/X_test_proceed.pkl")
        joblib.dump(y_train, "output/y_train_final.pkl")
        joblib.dump(y_valid, "output/y_valid_final.pkl")
        joblib.dump(y_test, "output/y_test_final.pkl")
    if return_file:
        X_train, X_valid, X_test, y_train, y_valid, y_test
    

feature engineering

In [127]:
def make_sma(xdata, save_file=True, return_file=True):
    periode = [5,20,60,120]
    alpha = [0.1, 0.3]
    for i in periode:
        for k in alpha:
            xdata["SMA_", i] = xdata.Close.rolling(i, min_periods=1).mean()
            xdata["dis_sma", i] = xdata["Close"] - xdata["SMA_", i]
            xdata["em_", k] = xdata.Close.ewm(alpha=k, adjust=False).mean()
    xdata.rename(columns = {('SMA_', 5):'SMA_5',
                        ('SMA_', 20):'SMA_20',
                        ('SMA_', 60): 'SMA_60', 
                        ('SMA_', 120): 'SMA_120',
                        ('em_', 0.1): 'em_0.1',
                        ('em_', 0.3): 'em_0.3',
                        ('dis_sma', 5): 'dis_sam_5',
                        ('dis_sma', 20): 'dis_sam_20',
                        ('dis_sma', 60): 'dis_sam_60',
                        ('dis_sma', 120): 'dis_sam_120'}, inplace = True)
    return xdata

In [128]:
def making_sma(save_file=True, return_file=True):
    X_train = make_sma(joblib.load("output/X_train_proceed.pkl"))
    X_valid = make_sma(joblib.load("output/X_valid_proceed.pkl"))
    X_test = make_sma(joblib.load("output/X_test_proceed.pkl"))
    
    if save_file:
        joblib.dump(X_train, "output/X_train_final.pkl")
        joblib.dump(X_valid, "output/X_valid_final.pkl")
        joblib.dump(X_test, "output/X_test_final.pkl")
    if return_file:
        X_train, X_valid, X_test

Model Search

In [130]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
import random
import yaml
import time

ts_cv = TimeSeriesSplit(
    n_splits=5,
    max_train_size=None,
)

In [131]:
f = open("params.yaml", "r")
params = yaml.load(f, Loader=yaml.SafeLoader)
f.close()

In [132]:
def read_data(params):
    x_train = joblib.load(params['DUMP_TRAIN'])
    y_train = joblib.load(params['Y_PATH_TRAIN'])
    x_valid = joblib.load(params['DUMP_VALID'])
    y_valid = joblib.load(params['Y_PATH_VALID'])

    return x_train, y_train, x_valid, y_valid

def model_ridge():
    param_dist = {'alpha': [0.1, 0.25, 0.5, 0.75]}
    base_model = Ridge(random_state=42)
    return param_dist, base_model

def model_lasso():
    param_dist = {'alpha': np.random.uniform(0.01,3,1000)}
    base_model = Lasso(random_state=42, selection='random')
    return param_dist, base_model


def model_rf():
    param_dist = {"n_estimators": [100, 250, 500, 1000]}
    base_model = RandomForestRegressor(random_state=0, n_jobs=-1)
    return param_dist, base_model


def model_svr():
    param_dist = {'C': [0.25, 0.5, 1, 1.25]}
    base_model = LinearSVR(dual=False, max_iter=10000)
    return param_dist, base_model

def random_search_cv(model, param, scoring, n_iter, x, y, verbosity=0):
    random_fit = RandomizedSearchCV(estimator=model,
                                    param_distributions=param,
                                    scoring=scoring,
                                    n_iter=n_iter,
                                    cv=ts_cv,
                                    random_state=0,
                                    verbose=verbosity)
    random_fit.fit(x, y)
    return random_fit

In [133]:
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


def fit(x_train, y_train, model, model_param, general_params):
    """
    Fit model

    Args:
        - model(callable): Sklearn / imblearn model
        - model_param(dict): sklearn's RandomizedSearchCV param_distribution
        - general_params(dict): general parameters for the function
            - target(str) : y column to be used   
            - scoring(str) : sklearn cross-val scoring scheme
            - n_iter_search : RandomizedSearchCV number of iteration
    """

    model_fitted = random_search_cv(model, model_param,
                                    params['scoring'],
                                    params['n_iter_search'],
                                    x_train, y_train,
                                    params['verbosity'])

    print(
        f'Model: {model_fitted.best_estimator_}, {params["scoring"]}: {model_fitted.best_score_}')

    return model_fitted


def validation_score(x_valid, y_valid, model_fitted):
    
    # Report default
    y_predicted = model_fitted.predict(x_valid)
    mae, mse, rmse, r2_square = evaluate(y_valid, y_predicted)
    score = {'mae':mae, 'mse':mse, 'rmse':rmse, 'r2': r2_square}

    return score

def select_model(train_log_dict):
    temp = []
    for score in train_log_dict['model_score']:
        temp.append(score['rmse'])
    best_model = train_log_dict['model_fit'][temp.index(min(temp))]
    best_parameter = train_log_dict['model_report'][temp.index(min(temp))]
    best_report = train_log_dict['model_score'][temp.index(min(temp))]
    
    return best_model, best_parameter, best_report

training

In [134]:
def main(params):
    '''
    Main function of modelling
    
    Parameters
    ----------
    params: .yaml file contain (dict) of general parameters for the read_data and model_lib function
        - DUMP_TRAIN (str)  : location of preprocessed training data pickle
        - Y_PATH_TRAIN (str): location of target column pickle for training data
        - DUMP_VALID (str)  : location of preprocessed validation data pickle
        - Y_PATH_VALID (str): location of target column  pickle validation data
        - target(str) : y column to be used   
        - scoring(str) : sklearn cross-val scoring scheme
        - n_iter_search : RandomizedSearchCV number of iteration
    '''

    ridge = model_ridge
    lasso = model_lasso
    rf = model_rf
    lsvr = model_svr
    
    # Make a dictionary "train_log_dict" to be saved later as pickle containing model information in training stage
    train_log_dict = {'model': [ridge, lasso, rf, lsvr],
                      'model_name': [],
                      'model_fit': [],
                      'model_report': [],
                      'model_score': [],
                      'fit_time': []}
    
    # Read data after preprocessing
    x_train, y_train, x_valid, y_valid  = read_data(params)

    # Iterate list model
    for model in train_log_dict['model']:
        # initiate the model
        param_model, base_model = model()
        # logging model name
        train_log_dict['model_name'].append(base_model.__class__.__name__)
        print(
           f'Fitting {base_model.__class__.__name__}')

        # Training
        t0 = time.time()
        
        # Searching best parameter using Random Search CV
        fitted_model, best_estimator = fit(
            x_train, y_train, base_model, param_model, params)
        elapsed_time = time.time() - t0
        print(f'elapsed time: {elapsed_time} s \n')
        train_log_dict['fit_time'].append(elapsed_time)
        train_log_dict['model_fit'].append(best_estimator.__class__.__name__)
        
        # Fitting model with best params to data training
        best_estimator.fit(x_train, y_train)
        train_log_dict['model_report'].append(best_estimator)

        
        # Validate model to validation data
        score = validation_score( x_valid, y_valid, best_estimator)
        train_log_dict['model_score'].append(score)

    # Select which model in model list has best score evaluation (minimum rmse) in validation data
    best_model, best_estimator, best_report = select_model(
        train_log_dict)
    print(
        f"Model: {best_model}, Score: {best_report}, Parameter: {best_estimator}")
    
    # Dump model name
    joblib.dump(best_model, f'output/model/model_name.pkl')
    # Dump best model estimator with best param
    joblib.dump(best_estimator, 'output/model/best_estimator.pkl')
    # Dump training log
    joblib.dump(train_log_dict, 'output/model/train_log.pkl')

In [135]:
main(params)

Fitting Ridge


ValueError: For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.