In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree         import DecisionTreeRegressor
from sklearn.ensemble     import RandomForestRegressor

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error

In [36]:
def import_data(csv): 
    
    df = pd.read_csv(csv)
    df.index = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df = df.drop('date', axis=1)
    
    close_columns = []
    high_columns = []
    low_columns = []
    open_columns = []
    volume_columns = []
    open_int_columns = []

    for i in df.columns:
        if "close" in i:
            close_columns.append(i)
        elif "high" in i:
            high_columns.append(i)
        elif "low" in i:
            low_columns.append(i)
        elif "open_int" in i:
            open_int_columns.append(i)
        elif "open" in i:
            open_columns.append(i)
        elif "volume" in i:
            volume_columns.append(i)

    close_df = df[close_columns]
    high_df = df[high_columns]
    low_df = df[low_columns]
    open_df = df[open_columns]
    volume_df = df[volume_columns]
    open_int_df = df[open_int_columns]
    
    return [close_df, high_df, low_df, open_df, volume_df, open_int_df]

def garman_klass_vol(high_df, low_df, close_df, open_df, period=60):
    """
    Estima a volatilidade a partir dos seguintes preços: alta, baixa, abertura e fechamento
    """
    # Calculando parcelas internas da somatoria
    x_hl = (1/2)*(np.log(np.divide(high_df, low_df))) ** 2
    x_co = - (2 * np.log(2) - 1)* (np.log(np.divide(close_df, open_df))**2)
    
    # Somando parcelas calculadas
    x = x_hl + x_co.values
    
    x.columns = [x[0:3] + "gk" for x in x.columns]
    
    # Criando dataframe para atribuir as volatilidades
    gk = x.copy()
    
    # Termo constante fora da somatoria (Considerando vol diaria)
    const = 1/period
    
    # Atribuindo not a number, para os valores iniciais
    gk.iloc[:period,:] = np.nan
    
    # iteração do centro de massa da vol
    for row in range(period, len(high_df)):
        gk.iloc[row] = np.sqrt(const * np.sum(x.iloc[row-period:row,:]))
        
    return gk

def parkinson_vol(high_df, low_df, period=60):
    """
    Estimando a volatilidade a partir dos preço de Alta e de Baixa
    """
    
    # Calculando parcela interna da somatoria
    x = np.log(np.divide(high_df, low_df)) ** 2
    x.columns = [x[0:3] + "pv" for x in x.columns]
    
    # Criando dataframe para atribuir as volatilidades
    pv = x.copy()
    
    # Termo constante fora da somatoria (Considerando vol diaria)
    const = 1 / (4 * period * np.log(2))
    
    # Atribuindo not a number, para os valores iniciais
    pv.iloc[:period,:] = np.nan
        
    # iteração do centro de massa da vol
    for row in range(period, len(high_df)):
        pv.iloc[row] = np.sqrt(const * np.sum(x.iloc[row-period:row,:]))
        
    return pv

def monthly_volume(volume, period=20):
    
    daily_volume = volume.copy()
    
    for row in range(period, len(volume)):
        daily_volume.iloc[row] = volume.iloc[row-period:row,:].cumsum().iloc[-1]
        
    monthly_volume = daily_volume.resample("BM").last().ffill()
    
    return monthly_volume
    

In [49]:
close_df, high_df, low_df, open_df, volume_df, open_int_df = import_data("data.csv")

# Retornos diários
returns_daily = close_df.pct_change().fillna(0)

# Retornos mensais
returns_monthly = close_df.pct_change(20).fillna(0).resample('BM').last().ffill()

# Vol diária
vol_daily = returns_daily.ewm(adjust=True, com=60, min_periods=0).std().dropna()

# Vol de 261 dias, apenas o último dia de cada mês
vol_monthly = (np.sqrt(261)*vol_daily).resample('BM').last().ffill()

pv_df = parkinson_vol(high_df, low_df)
pv_monthly = (np.sqrt(261)*pv_df).resample('BM').last().ffill()

gk_df = garman_klass_vol(high_df, low_df, close_df, open_df)
gk_monthly = (np.sqrt(261)*gk_df).resample('BM').last().ffill()

monthly_volume = monthly_volume(volume_df)

## Feature Engineering

In [61]:
train = pd.DataFrame()

train["Returns Monthly"] = returns_monthly.iloc[:,0]
train["EWMA Monthly"] = vol_monthly.iloc[:,0]
train["Parkinson"] = pv_monthly.iloc[:,0]
train["Garman-Klass"] = gk_monthly.iloc[:,0]
train["Monthly Volume"] = monthly_volume.iloc[:,0]

prices = close_df.iloc[:,0]

for lag in range(3,12):
    train["Lagged Momentum " + str(lag)] = prices.pct_change(lag * 20)
    
test = train["Returns Monthly"].shift(-1)

In [62]:
train.tail()

Unnamed: 0_level_0,Returns Monthly,EWMA Monthly,Parkinson,Garman-Klass,Monthly Volume,Lagged Momentum 3,Lagged Momentum 4,Lagged Momentum 5,Lagged Momentum 6,Lagged Momentum 7,Lagged Momentum 8,Lagged Momentum 9,Lagged Momentum 10,Lagged Momentum 11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-02-28,-0.01262,0.104332,0.095968,0.094675,2113513.0,-0.028649,-0.005309,-0.011797,-0.024791,-0.04468,-0.065249,-0.061058,-0.088932,-0.091605
2019-03-29,0.003521,0.095175,0.092544,0.091479,2173698.0,0.006498,-0.018325,0.000281,-0.013158,-0.020753,-0.044394,-0.062993,-0.061265,-0.087241
2019-04-30,-0.004223,0.091805,0.081642,0.079977,1921625.0,-0.018999,-0.011183,-0.034925,-0.011735,-0.018182,-0.031092,-0.050852,-0.071288,-0.067124
2019-05-31,-0.014864,0.083834,0.070287,0.071457,1903275.0,-0.036417,-0.030105,-0.034277,-0.050743,-0.030645,-0.035883,-0.063013,-0.06175,-0.08997
2019-06-28,0.005648,0.079359,0.068232,0.068433,2539780.0,,,,,,,,,


In [63]:
test.tail()

date
2019-02-28    0.003521
2019-03-29   -0.004223
2019-04-30   -0.014864
2019-05-31    0.005648
2019-06-28         NaN
Freq: BM, Name: Returns Monthly, dtype: float64

In [95]:
X = train.dropna().iloc[:100]
y = np.sign(test.dropna().iloc[:100])

X_test = train.dropna().iloc[100:101]
y_test = np.sign(test.dropna().iloc[100:101])

## Trainning

In [96]:
rf = RandomForestClassifier()
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [97]:
result = rf.predict(X_test)

mean_squared_error(result, y_test)

4.0

In [98]:
print(result, y_test)

[1.] date
2007-10-31   -1.0
Freq: BM, Name: Returns Monthly, dtype: float64
