In [3]:
import numpy as np
import pandas as pd
import joblib

In [4]:
def read_data(path, path_ihsg,
              save_file = True,
              return_file = True,
              set_index = None):
    '''
    Read two CSV files, left-join them on ``Date`` and add a ``Close+1``
    column holding the next row's closing value.

    Parameters
    ----------
    path : str
        Path to the first CSV (the left side of the join, ``emiten``).
    path_ihsg : str
        Path to the second CSV (the right side of the join, ``ihsg``).
    save_file : bool, default True
        If True, dump the merged frame to ``merged.pkl`` with joblib.
    return_file : bool, default True
        If True, return the merged frame; otherwise return None.
    set_index : str or None, default None
        Forwarded as ``index_col`` to ``pd.read_csv`` for both files.

    Returns
    -------
    pandas.DataFrame or None
        The merged frame when ``return_file`` is True, else None.

    Raises
    ------
    KeyError
        If neither a ``Close`` nor a ``Close_x`` column exists after the merge.
    '''

    emiten = pd.read_csv(path, index_col = set_index)
    ihsg = pd.read_csv(path_ihsg, index_col = set_index)

    # Nothing was merged in this case before either; keep it that way.
    if not (save_file or return_file):
        return None

    # Merge once and reuse the result for both saving and returning.
    merged = pd.merge(emiten, ihsg, how='left', on='Date')

    # NOTE(review): this conversion assumes ``Date`` is the index (i.e. the
    # caller passed set_index="Date"). With a default integer index the
    # integers would be read as timestamps - confirm callers always set it.
    merged.index = pd.to_datetime(merged.index)

    # When both files carry a ``Close`` column pandas suffixes them to
    # ``Close_x`` / ``Close_y``; the left-hand (emiten) one is the target source.
    close_col = 'Close' if 'Close' in merged.columns else 'Close_x'
    merged['Close+1'] = merged[close_col].shift(-1)

    if save_file:
        joblib.dump(merged, "merged.pkl")

    if return_file:
        return merged
    return None



def split_input_output(dataset, target_column, save_file=True, return_file=True):
    '''
    Separate one column of ``dataset`` from all the others.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains ``target_column``.
    target_column : str
        Label of the column to pull out.
    save_file : bool, default True
        If True, dump the two pieces to ``output_df.pkl`` and ``input_df.pkl``.
    return_file : bool, default True
        If True, return the two pieces; otherwise return None.

    Returns
    -------
    tuple or None
        ``(output_df, input_df)`` - the target column first, then the frame
        without that column.
    '''
    target = dataset[target_column]
    features = dataset.drop(columns=[target_column])

    if save_file:
        for obj, filename in ((target, "output_df.pkl"), (features, "input_df.pkl")):
            joblib.dump(obj, filename)

    if not return_file:
        return None
    return target, features

def x_split(input_df, return_file=True, save_file=True):
    '''
    Cut ``input_df`` into three consecutive row blocks, in order.

    The first block is the leading 60% of the rows (rounded down); the
    remainder is halved (rounded down) into the second and third blocks.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to split by row position.
    return_file : bool, default True
        If True, return the three blocks; otherwise return None.
    save_file : bool, default True
        If True, dump the blocks to ``X_train.pkl``, ``X_val.pkl`` and
        ``X_test.pkl``.

    Returns
    -------
    tuple or None
        ``(X_train, X_val, X_test)``.
    '''
    train_end = int(input_df.shape[0] * 0.6)
    X_train, holdout = input_df[:train_end], input_df[train_end:]

    val_end = int(holdout.shape[0] * 0.5)
    X_val, X_test = holdout[:val_end], holdout[val_end:]

    if save_file:
        for part, filename in ((X_train, "X_train.pkl"),
                               (X_val, "X_val.pkl"),
                               (X_test, "X_test.pkl")):
            joblib.dump(part, filename)

    return (X_train, X_val, X_test) if return_file else None
        
def y_split(output_df, return_file=True, save_file=True):
    '''
    Cut ``output_df`` into three consecutive blocks, in order.

    The first block is the leading 60% of the entries (rounded down); the
    remainder is halved (rounded down) into the second and third blocks.

    Parameters
    ----------
    output_df : pandas.Series or pandas.DataFrame
        Object to split by position.
    return_file : bool, default True
        If True, return the three blocks; otherwise return None.
    save_file : bool, default True
        If True, dump the blocks to ``y_train.pkl``, ``y_valid.pkl`` and
        ``y_test.pkl``.

    Returns
    -------
    tuple or None
        ``(y_train, y_val, y_test)``.
    '''
    train_end = int(output_df.shape[0] * 0.6)
    y_train, holdout = output_df[:train_end], output_df[train_end:]

    val_end = int(holdout.shape[0] * 0.5)
    y_val, y_test = holdout[:val_end], holdout[val_end:]

    if save_file:
        for part, filename in ((y_train, "y_train.pkl"),
                               (y_val, "y_valid.pkl"),
                               (y_test, "y_test.pkl")):
            joblib.dump(part, filename)

    return (y_train, y_val, y_test) if return_file else None

In [5]:
# Input files and the column roles used by the pipeline below.
DATA_PATH = "data/AMRT.csv"
DATA_PATH_IHSG = "data/ihsg.csv"
TARGET_COLUMN = "Close+1"
INDEX_COLUMN = "Date"

# Load both CSVs indexed by INDEX_COLUMN and merge them.
# Every helper below is called with its default save_file=True, so each one
# also writes its pickle file(s) to the working directory.
data_house = read_data(DATA_PATH, DATA_PATH_IHSG, set_index=INDEX_COLUMN)

# split_input_output returns the target first, then the remaining columns.
output_df, input_df = split_input_output(data_house, TARGET_COLUMN)

# Positional train / validation / test blocks for inputs and target.
X_train, X_val, X_test = x_split(input_df)
y_train, y_val, y_test = y_split(output_df)

In [27]:
def process_emiten(proceed, save_file=True, return_file=True):
    '''
    Trim a merged frame down to close/volume columns and drop unusable rows.

    Steps, in order: drop the suffixed Open/High/Low/Adj Close columns,
    rename the remaining ``_x`` / ``_y`` columns, drop rows containing NaN,
    then drop rows whose ``Volume`` or ``Volume_ihsg`` is 0. The frame
    passed in is not modified.

    Parameters
    ----------
    proceed : pandas.DataFrame
        Frame with ``*_x`` / ``*_y`` suffixed columns.
    save_file : bool, default True
        If True, dump the result to ``proceed.pkl``.
    return_file : bool, default True
        If True, return the result; otherwise return None.

    Returns
    -------
    pandas.DataFrame or None
    '''
    unused_columns = ['Open_x', 'Open_y', 'High_x', 'High_y',
                      'Low_x', 'Low_y', 'Adj Close_x', 'Adj Close_y']
    new_names = {'Close_x': 'Close', 'Volume_x': 'Volume',
                 'Close_y': 'Close_ihsg', 'Volume_y': 'Volume_ihsg'}

    cleaned = proceed.drop(unused_columns, axis=1)
    cleaned = cleaned.rename(columns=new_names)
    cleaned = cleaned.dropna()

    # Rows are removed by index label, one volume column after the other.
    for volume_col in ("Volume", "Volume_ihsg"):
        zero_labels = cleaned.loc[cleaned[volume_col] == 0].index
        cleaned = cleaned.drop(zero_labels)

    if save_file:
        joblib.dump(cleaned, "proceed.pkl")

    return cleaned if return_file else None

In [30]:
def SMA(feature_sma):
    '''
    Add simple moving averages of ``Close`` for windows 5, 20, 60 and 120.

    Each average is stored under the tuple column label ``("SMA_", window)``
    and uses ``min_periods=1``, so the leading rows average over however many
    values are available instead of being NaN.

    Parameters
    ----------
    feature_sma : pandas.DataFrame
        Frame with a ``Close`` column. The new columns are written into this
        frame in place.

    Returns
    -------
    pandas.DataFrame
        A copy of ``feature_sma`` including the new columns.
    '''
    periode = [5, 20, 60, 120]
    for i in periode:
        feature_sma["SMA_", i] = feature_sma.Close.rolling(i, min_periods=1).mean()
    # The original ended with DataFrame.append([]), which only ever produced a
    # copy and no longer exists in pandas 2.x; copy() gives the same result.
    return feature_sma.copy()

In [8]:
def EMA(feature_ema):
    '''
    Add exponentially weighted means of ``Close`` for alpha 0.1 and 0.3.

    Each result is stored under the tuple column label ``("em_", alpha)`` and
    is computed with ``adjust=False``.

    Parameters
    ----------
    feature_ema : pandas.DataFrame
        Frame with a ``Close`` column. The new columns are written into this
        frame in place.

    Returns
    -------
    pandas.DataFrame
        A copy of ``feature_ema`` including the new columns.
    '''
    periodes = [0.1, 0.3]
    for i in periodes:
        feature_ema["em_", i] = feature_ema.Close.ewm(alpha=i, adjust=False).mean()
    # The original ended with DataFrame.append([]), which only ever produced a
    # copy and no longer exists in pandas 2.x; copy() gives the same result.
    return feature_ema.copy()

In [32]:
def dis_sma(feature_dis):
    '''
    Add the difference between ``Close`` and each of its simple moving
    averages (windows 5, 20, 60 and 120).

    The averages come from ``SMA``, which also writes its ``("SMA_", window)``
    columns into ``feature_dis``. Each difference is stored under the tuple
    column label ``("dis_sma", window)``.

    Parameters
    ----------
    feature_dis : pandas.DataFrame
        Frame with a ``Close`` column. The new columns are written into this
        frame in place.

    Returns
    -------
    pandas.DataFrame
        A copy of ``feature_dis`` including the new columns.
    '''
    periode = [5, 20, 60, 120]
    # The moving averages depend only on ``Close``, so compute them once
    # rather than once per window.
    with_sma = SMA(feature_dis)
    for i in periode:
        feature_dis["dis_sma", i] = feature_dis["Close"] - with_sma["SMA_", i]
    # The original ended with DataFrame.append([]), which only ever produced a
    # copy and no longer exists in pandas 2.x; copy() gives the same result.
    return feature_dis.copy()