In [51]:
import numpy as np
import pandas as pd
import joblib

Read Data

In [52]:
def read_data(path, path_ihsg, 
              save_file = True,
              return_file = True,
              set_index = None):
    """Load the stock and index CSV files, join them on 'Date' and add the target.

    Parameters
    ----------
    path : str
        CSV file holding the stock (emiten) prices.
    path_ihsg : str
        CSV file holding the index prices.
    save_file : bool, default True
        When true, the merged frame is pickled to "output/merged.pkl".
    return_file : bool, default True
        When true, the merged frame is returned; otherwise None is returned.
    set_index : str or None, default None
        Forwarded to ``pd.read_csv(index_col=...)`` for both files.

    Returns
    -------
    pandas.DataFrame or None
        Left join of the stock rows with the index rows, plus a 'Close+1'
        column containing 'Close_x' shifted one row up (the last row is NaN).

    Notes
    -----
    The code reads 'Close_x' after the merge, so it assumes both files
    contain a 'Close' column that the merge suffixes with '_x' / '_y'.
    """
    # Both files are parsed with the same index setting, stock file first.
    stock_frame, index_frame = (
        pd.read_csv(csv_path, index_col = set_index)
        for csv_path in (path, path_ihsg)
    )

    combined = stock_frame.merge(index_frame, how='left', on='Date')
    # Target: the stock close of the following row.
    combined['Close+1'] = combined['Close_x'].shift(-1)

    if save_file:
        joblib.dump(combined, "output/merged.pkl")

    return combined if return_file else None



def split_input_output(dataset,
                       target_column,
                       save_file = True,
                       return_file = True):
    """Separate the target column from the remaining feature columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains ``target_column``.
    target_column : str
        Name of the column to use as the model output.
    save_file : bool, default True
        When true, pickles the target to "output/output_df.pkl" and the
        features to "output/input_df.pkl".
    return_file : bool, default True
        When true, returns the two objects; otherwise returns None.

    Returns
    -------
    tuple or None
        ``(target, features)`` - note the target comes first.
    """
    target = dataset[target_column]
    features = dataset.drop(columns = [target_column])

    if save_file:
        # Target is written before the features.
        for payload, destination in ((target, "output/output_df.pkl"),
                                     (features, "output/input_df.pkl")):
            joblib.dump(payload, destination)

    return (target, features) if return_file else None

def x_split(input_df, return_file=True, save_file=True,
            train_frac=0.6, valid_frac=0.5):
    """Split the feature frame into consecutive train / validation / test parts.

    The split is positional (no shuffling): the first ``train_frac`` of the
    rows form the training set, and the remaining rows are divided between
    validation and test.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Feature frame to split.
    return_file : bool, default True
        When true, returns the three parts; otherwise returns None.
    save_file : bool, default True
        When true, pickles the parts to "output/X_train.pkl",
        "output/X_valid.pkl" and "output/X_test.pkl".
    train_frac : float, default 0.6
        Fraction of all rows assigned to the training set.
    valid_frac : float, default 0.5
        Fraction of the rows *left after the training set* that go to the
        validation set; the rest become the test set.

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test)``.

    Raises
    ------
    ValueError
        If either fraction lies outside the interval [0, 1].
    """
    if not (0 <= train_frac <= 1 and 0 <= valid_frac <= 1):
        raise ValueError("train_frac and valid_frac must be between 0 and 1")

    n_rows = input_df.shape[0]
    train_end = int(n_rows * train_frac)
    # The validation size is computed on the remainder, not on the full length.
    valid_end = train_end + int((n_rows - train_end) * valid_frac)

    # .iloc makes the positional intent explicit regardless of the index type.
    X_train = input_df.iloc[:train_end]
    X_valid = input_df.iloc[train_end:valid_end]
    X_test = input_df.iloc[valid_end:]

    if save_file:
        joblib.dump(X_train, "output/X_train.pkl")
        joblib.dump(X_valid, "output/X_valid.pkl")
        joblib.dump(X_test, "output/X_test.pkl")

    if return_file:
        return X_train, X_valid, X_test
        
def y_split(output_df, return_file=True, save_file=True,
            train_frac=0.6, valid_frac=0.5):
    """Split the target into consecutive train / validation / test parts.

    The split is positional (no shuffling): the first ``train_frac`` of the
    entries form the training set, and the remaining entries are divided
    between validation and test.

    Parameters
    ----------
    output_df : pandas.Series or pandas.DataFrame
        Target values to split.
    return_file : bool, default True
        When true, returns the three parts; otherwise returns None.
    save_file : bool, default True
        When true, pickles the parts to "output/y_train.pkl",
        "output/y_valid.pkl" and "output/y_test.pkl".
    train_frac : float, default 0.6
        Fraction of all entries assigned to the training set.
    valid_frac : float, default 0.5
        Fraction of the entries *left after the training set* that go to the
        validation set; the rest become the test set.

    Returns
    -------
    tuple or None
        ``(y_train, y_valid, y_test)``.

    Raises
    ------
    ValueError
        If either fraction lies outside the interval [0, 1].
    """
    if not (0 <= train_frac <= 1 and 0 <= valid_frac <= 1):
        raise ValueError("train_frac and valid_frac must be between 0 and 1")

    n_rows = output_df.shape[0]
    train_end = int(n_rows * train_frac)
    # The validation size is computed on the remainder, not on the full length.
    valid_end = train_end + int((n_rows - train_end) * valid_frac)

    # .iloc makes the positional intent explicit regardless of the index type.
    y_train = output_df.iloc[:train_end]
    y_valid = output_df.iloc[train_end:valid_end]
    y_test = output_df.iloc[valid_end:]

    if save_file:
        joblib.dump(y_train, "output/y_train.pkl")
        joblib.dump(y_valid, "output/y_valid.pkl")
        joblib.dump(y_test, "output/y_test.pkl")

    if return_file:
        return y_train, y_valid, y_test

In [53]:
# --- Pipeline configuration -------------------------------------------------
DATA_PATH = "data/AMRT.csv"        # stock (emiten) price history
DATA_PATH_IHSG = "data/ihsg.csv"   # index price history
TARGET_COLUMN = "Close+1"          # column created by read_data()
INDEX_COLUMN = "Date"

# --- Load, merge and split --------------------------------------------------
data_house = read_data(
    path = DATA_PATH,
    path_ihsg = DATA_PATH_IHSG,
    set_index = INDEX_COLUMN,
)

# split_input_output returns the target first, then the features.
output_df, input_df = split_input_output(
    dataset = data_house,
    target_column = TARGET_COLUMN,
)

X_train, X_val, X_test = x_split(input_df)
y_train, y_val, y_test = y_split(output_df)

Preprocessing

In [54]:
def process_emiten(proceed):
    """Clean a merged stock/index feature frame.

    Steps, in order:
      1. drop the open / high / low / adjusted-close columns of both sources,
      2. rename the remaining '_x' / '_y' columns to 'Close', 'Volume',
         'Close_ihsg' and 'Volume_ihsg',
      3. drop rows containing any missing value,
      4. drop rows whose 'Volume' or 'Volume_ihsg' equals 0.

    Rows are filtered with a boolean mask rather than by index label, so a
    zero-volume row never removes other rows that happen to share its index
    label (e.g. a duplicated date).

    Parameters
    ----------
    proceed : pandas.DataFrame
        Frame with the '_x' / '_y' suffixed columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``proceed`` itself is not modified.

    Notes
    -----
    NOTE(review): this function only sees the feature frame, so rows removed
    here are not removed from any separately stored target - confirm that
    features and target are re-aligned before model fitting.
    """
    redundant_columns = ['Open_x', 'Open_y', 'High_x', 'High_y',
                         'Low_x', 'Low_y', 'Adj Close_x', 'Adj Close_y']
    readable_names = {'Close_x': 'Close', 'Volume_x': 'Volume',
                      'Close_y': 'Close_ihsg', 'Volume_y': 'Volume_ihsg'}

    cleaned = (
        proceed
        .drop(columns=redundant_columns)
        .rename(columns=readable_names)
        .dropna()
    )

    # Keep only rows where both the stock and the index actually traded.
    has_volume = (cleaned["Volume"] != 0) & (cleaned["Volume_ihsg"] != 0)
    return cleaned.loc[has_volume]

In [55]:
def processing_data(save_file=True, return_file=True):
    """Clean the pickled train / validation / test feature frames.

    Loads "output/X_train.pkl", "output/X_valid.pkl" and "output/X_test.pkl",
    passes each through ``process_emiten`` and optionally stores the results.

    Parameters
    ----------
    save_file : bool, default True
        When true, pickles the cleaned frames to
        "output/X_train_proceed.pkl", "output/X_valid_proceed.pkl" and
        "output/X_test_proceed.pkl".
    return_file : bool, default True
        When true, returns the cleaned frames; otherwise returns None.

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test)``.
    """
    X_train = process_emiten(joblib.load("output/X_train.pkl"))
    X_valid = process_emiten(joblib.load("output/X_valid.pkl"))
    X_test = process_emiten(joblib.load("output/X_test.pkl"))

    if save_file:
        joblib.dump(X_train, "output/X_train_proceed.pkl")
        joblib.dump(X_valid, "output/X_valid_proceed.pkl")
        joblib.dump(X_test, "output/X_test_proceed.pkl")

    if return_file:
        # The original built this tuple but never returned it.
        return X_train, X_valid, X_test
    

Feature Engineering

In [44]:
def make_sma(xdata, save_file=True, return_file=True,
             periods=(5, 20, 60, 120), alphas=(0.1, 0.3)):
    """Add moving-average features derived from the 'Close' column.

    For every window in ``periods`` two columns are added:
      * ``SMA_<window>``     - rolling mean of 'Close' (``min_periods=1``),
      * ``dis_sam_<window>`` - 'Close' minus that rolling mean.
    For every value in ``alphas`` one column is added:
      * ``em_<alpha>``       - exponentially weighted mean of 'Close'
                               (``adjust=False``).

    Each feature is computed exactly once. The columns are appended in the
    order the previous implementation produced: the first window's pair,
    then all EMA columns, then the remaining windows' pairs.

    Parameters
    ----------
    xdata : pandas.DataFrame
        Frame with a 'Close' column. It is modified in place.
    save_file, return_file : bool
        Accepted for signature compatibility; not used by this function.
    periods : iterable of int, default (5, 20, 60, 120)
        Rolling window lengths.
    alphas : iterable of float, default (0.1, 0.3)
        Smoothing factors for the exponential means.

    Returns
    -------
    pandas.DataFrame
        The same ``xdata`` object, with the new columns attached.

    Notes
    -----
    The 'dis_sam_' spelling is kept on purpose so that column names stay
    identical to those already produced by earlier runs.
    """
    close = xdata["Close"]

    sma_groups = []
    for window in periods:
        sma = close.rolling(window, min_periods=1).mean()
        sma_groups.append({
            f"SMA_{window}": sma,
            f"dis_sam_{window}": close - sma,
        })

    ema_group = {
        f"em_{alpha}": close.ewm(alpha=alpha, adjust=False).mean()
        for alpha in alphas
    }

    # Preserve the historical column layout (see docstring).
    ordered_groups = sma_groups[:1] + [ema_group] + sma_groups[1:]
    for group in ordered_groups:
        for column_name, values in group.items():
            xdata[column_name] = values

    return xdata

In [45]:
def making_sma(save_file=True, return_file=True):
    """Add moving-average features to the cleaned train / validation / test frames.

    Loads "output/X_train_proceed.pkl", "output/X_valid_proceed.pkl" and
    "output/X_test_proceed.pkl", passes each through ``make_sma`` and
    optionally stores the results.

    Parameters
    ----------
    save_file : bool, default True
        When true, pickles the resulting frames to "output/X_train_final.pkl",
        "output/X_valid_final.pkl" and "output/X_test_final.pkl".
    return_file : bool, default True
        When true, returns the resulting frames; otherwise returns None.

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test)``.
    """
    X_train = make_sma(joblib.load("output/X_train_proceed.pkl"))
    X_valid = make_sma(joblib.load("output/X_valid_proceed.pkl"))
    X_test = make_sma(joblib.load("output/X_test_proceed.pkl"))

    if save_file:
        joblib.dump(X_train, "output/X_train_final.pkl")
        joblib.dump(X_valid, "output/X_valid_final.pkl")
        joblib.dump(X_test, "output/X_test_final.pkl")

    if return_file:
        # The original built this tuple but never returned it.
        return X_train, X_valid, X_test

Model Search

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn import metrics
import random
import yaml

In [20]:
# Load the pipeline parameters. The context manager closes the file even when
# YAML parsing raises, which the manual open()/close() pair did not guarantee.
with open("params.yaml", "r") as f:
    params = yaml.load(f, Loader=yaml.SafeLoader)

In [28]:
def read_data(params):
    """Load the pickled training and validation sets named in ``params``.

    NOTE: this definition rebinds the name ``read_data`` and thereby replaces
    the CSV-loading function of the same name defined earlier in the notebook;
    cells executed after this one get this version.

    Parameters
    ----------
    params : mapping
        Must provide the keys 'DUMP_TRAIN', 'Y_PATH_TRAIN', 'DUMP_VALID' and
        'Y_PATH_VALID', each holding a path readable by ``joblib.load``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid)``.
    """
    # Keys are listed in load order, which is also the order of the result.
    path_keys = ('DUMP_TRAIN', 'Y_PATH_TRAIN', 'DUMP_VALID', 'Y_PATH_VALID')
    x_train, y_train, x_valid, y_valid = (
        joblib.load(params[key]) for key in path_keys
    )

    return x_train, y_train, x_valid, y_valid

In [30]:
# Load the persisted train / validation sets from the paths configured in `params`.
# NOTE(review): this rebinds `y_train`, which an earlier cell already assigned
# from y_split(); confirm the overwrite is intended.
x_train, y_train, x_valid, y_valid = read_data(params)