### Load Libraries

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import plotly.graph_objs as go
import os
import warnings
import pandas as pd
import numpy as np
import sys

# Extend the module search path with the project's utility folder so the
# local imports below can resolve (presumably data_wrangler, model_utils and
# visualization live there — the path is relative to the notebook's working directory).
sys.path.append('../../src/utils/')
from data_wrangler import timeseries_to_supervise, create_custom_target, fetch_topn_features, create_all_features, convert_custom_target_to_actual
from model_utils import evaluate_model, train_model, extract_shap_values, select_model
from visualization import plot_predictions

# Re-execute the already imported `visualization` module so that edits made to
# it on disk are picked up without restarting the kernel, then re-bind
# `plot_predictions` to the definition from the reloaded module.
import importlib
importlib.reload(sys.modules['visualization'])
from visualization import plot_predictions

### Define Data Paths/Variables

In [16]:
# Locations of every dataset used by the notebook, plus the list of tickers
# and the topic id associated with each ticker (TICKERS and TOPIC_IDS are
# parallel lists: position i of one belongs to position i of the other).
data_paths = {
    'RAW_DATA': '../../datasets/rawdata/market_data/',
    'FINANCIAL_RESULTS': '../../datasets/processed_data/financial_results/',
    'INDEX_FEATURES': '../../datasets/processed_data/index_features/',
    'FEATURE_IMP_PATH': '../../datasets/processed_data/feature_importance/LightGBM/',
    'AGG_SENTIMENT': '../../datasets/processed_data/agg_sentiment_scores/agg_sentiment.csv',
    'TOPIC_SENTIMENT': '../../datasets/processed_data/agg_sentiment_scores/agg_sent_topic.csv',
    'TICKER_SENTIMENT': '../../datasets/processed_data/agg_sentiment_scores/ticker_news_sent.csv',
    'TICKERS': ['EIHOTEL.BO', 'ELGIEQUIP.BO', 'IPCALAB.BO', 'PGHL.BO',  'TV18BRDCST.BO'],
    'TOPIC_IDS': [33, 921, 495, 495, 921],
}

# Guard the parallel-list invariant so a ticker can never be paired with a
# missing or shifted topic id.
assert len(data_paths['TICKERS']) == len(data_paths['TOPIC_IDS']), \
    "TICKERS and TOPIC_IDS must have the same length"

train_size = 0.8  # 80% for training, 20% for testing
window_size = 10  # Number of past records to consider
target_price = 'ln_target'  # name of the target column handed to timeseries_to_supervise
seed = 42  # random seed handed to select_model

# Single position used for BOTH lookups, so the ticker and its topic id cannot
# drift apart when a different ticker is selected.
ticker_index = 2
ticker = data_paths['TICKERS'][ticker_index]
topic_id = data_paths['TOPIC_IDS'][ticker_index]

### Load Data

In [12]:
path = '../../datasets/processed_data/combined_features/'

# Per-ticker cache of the combined feature table. The existence check, the
# read and the write all use this one file name: testing the directory with
# os.path.isfile() is always False, which meant the cache was never used and
# the features were rebuilt on every run.
combined_file = path + ticker + '.csv'

if os.path.isfile(combined_file):
    # NOTE(review): a frame read back from CSV carries `date` as plain
    # strings — confirm the downstream helpers accept that.
    combined_df = pd.read_csv(combined_file)
else:
    # create all the features, then persist them for the next run
    combined_df = create_all_features(data_paths, ticker, topic_id)
    os.makedirs(path, exist_ok=True)  # to_csv does not create missing folders
    combined_df.to_csv(combined_file, index=False)

# Keep the dates aside (they are split separately later) and leave only the
# remaining columns in the feature table.
combined_date_df = combined_df['date']
combined_df = combined_df.drop(columns='date')

shape after combining index featuress results: (4021, 633)


### Select Features / Split Data

In [13]:
# How many of the highest-ranked features to keep
topn_feature_count = 50

# Read the stored feature-importance ranking and pull out the feature names
topn_features_df = fetch_topn_features(data_paths['FEATURE_IMP_PATH'], topn_feature_count)
ranked_features = topn_features_df['feature'].tolist()

# The previous close and the target column are always carried along with the
# ranked features.
topn_features = [*ranked_features, 'yesterday_close', 'ln_target']

../../datasets/processed_data/feature_importance/LightGBM/


In [14]:
# Ordered split (shuffle=False): the first `train_size` share of the rows is
# used for training, the remainder is held out. Features and dates are split
# with identical settings so they stay aligned.
selected_df = combined_df.loc[:, topn_features]
train_data, holdout_data = train_test_split(selected_df, train_size=train_size, shuffle=False)
train_date, holdout_date = train_test_split(combined_date_df, train_size=train_size, shuffle=False)

# Windowed, supervised-learning view of the training rows
X_train, y_train, indx_train = timeseries_to_supervise(train_data, window_size, target_price)

# Halve the held-out rows: the first half is the evaluation set, the second
# half is kept back as the test set for backtesting.
eval_data, test_data = train_test_split(holdout_data, train_size=0.5, shuffle=False)
eval_date, test_date = train_test_split(holdout_date, train_size=0.5, shuffle=False)

# Windowed, supervised-learning view of the evaluation rows
X_eval, y_eval, indx_eval = timeseries_to_supervise(eval_data, window_size, target_price)

### Train the Model

In [15]:
%time
# Step 4: Build the Random Forest Model
model_name = 'LightGBM'
model = select_model(model_name, seed)

# train the Random Forest model
trained_model = train_model(model, X_train, y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Total trainging time:  0:00:02.236023


#### Evaluate Model

In [17]:
import joblib

# Load the persisted LightGBM model that belongs to the currently selected
# ticker (the file name follows `ticker` instead of being hard-coded, so
# changing the ticker in the configuration cell cannot load a mismatched model).
path = '../../trained_models/LightGBM/' + ticker + '.pkl'

# NOTE: this rebinds `trained_model`, replacing the model fitted in the
# training cell with the one stored on disk.
# joblib.load unpickles the file, which can execute arbitrary code — only
# load model files from a trusted source.
trained_model = joblib.load(path)
trained_model

In [18]:
# evaluate the fitted model using mape and rmse metrics
predictions_df, mape, rmse = evaluate_model(trained_model, window_size, eval_data, eval_date, X_eval, y_eval)                        

print("for ticker {0} mean absolute percentage error: {1}, root_mean_square_error: {2}".format(ticker, round(mape, 3), round(rmse, 3)))

for ticker IPCALAB.BO mean absolute percentage error: 0.019, root_mean_square_error: 1.432


### Compute SHAP Values for Feature Importance

In [7]:
# compute the feature importance using TreeSHAP 
feature_importance_df = extract_shap_values(trained_model, train_data, X_train, window_size)
feature_importance_df.columns = ['shap_value' + '_' + ticker,  'feature']

feature_importance_df.head()

Unnamed: 0,shap_value_IPCALAB.BO,feature
0,1e-05,volatility_kch_20
1,9e-06,volume_obv_200
2,9e-06,trend_ema_slow_50
3,8e-06,trend_visual_ichimoku_a_100
4,6e-06,volatility_kcc


In [20]:
# Chart the predictions produced by the evaluation cell for the selected ticker;
# the returned figure is kept in `fig`.
fig = plot_predictions(predictions_df, ticker)

#### Plot Predictions from file 

In [21]:
# NOTE: this cell rebinds `ticker` (now the first entry of TICKERS) and
# `predictions_df`; re-run the configuration cell before re-running any
# earlier cell, otherwise they will see this ticker.
ticker = data_paths['TICKERS'][0]

# Stored model predictions live in one CSV file per ticker
path = '../../datasets/processed_data/model_predictions/LightGBM/'
predictions_file = os.path.join(path, ticker + '.csv')
predictions_df = pd.read_csv(predictions_file)

In [23]:
# Plot the actual and predicted values, including the predicted value range
# (higher/lower), for the predictions loaded from file.
fig = plot_predictions(predictions_df, ticker)