# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt

# Importing data

In [2]:
# LightGBM predictions for PGHL, read from the woSentiment and wSentiment folders respectively.
LGBM_no_sent, LGBM_w_sent = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/processed_data/model_predictions/LightGBM/woSentiment/PGHL.BO.csv',
        '../datasets/processed_data/model_predictions/LightGBM/wSentiment/PGHL.BO.csv',
    )
)

In [3]:
# Prophet predictions for PGHL (a single results file).
prophet_csv = '../datasets/processed_data/model_predictions/Prophet/PGHL.BO.csv'
Prophet = pd.read_csv(prophet_csv)

In [4]:
# Random-forest predictions for PGHL: the "without_sentiment" file first, then the default file.
RF_no_sent, RF_w_sent = (
    pd.read_csv(csv_path)
    for csv_path in (
        'random_forest/rf_PGHL_without_sentiment.csv',
        'random_forest/rf_PGHL.csv',
    )
)

In [5]:
# LSTM predictions for PGHL (one file holding both the with- and without-sentiment columns).
lstm_csv = '../datasets/processed_data/model_predictions/LSTM/PGHL_LSTM_predictions.csv'
LSTM = pd.read_csv(lstm_csv)

In [6]:
# Linear-regression predictions for PGHL: the "without_sentiment" file first, then the default file.
LR_no_sent, LR_w_sent = (
    pd.read_csv(csv_path)
    for csv_path in (
        'linear_regression/lr_PGHL_without_sentiment.csv',
        'linear_regression/lr_PGHL.csv',
    )
)

# Combining datasets

## LSTM predictions outcome

In [7]:
# Size of the LSTM predictions frame as (rows, columns).
LSTM.shape

(727, 4)

In [8]:
# Preview the first rows of the LSTM predictions to see the available columns.
LSTM.head()

Unnamed: 0,date,Actual High,Predicted high LSTM (no sentiments),Predicted high LSTM (with sentiments)
0,2020-06-23,4228.899902,4055.678879,4027.823663
1,2020-06-24,4123.0,3811.401374,4058.600804
2,2020-06-25,4230.100098,4208.568634,3971.555611
3,2020-06-26,4230.850098,4265.89479,4080.846886
4,2020-06-29,4150.0,4250.807348,4025.863762


## LGBM predictions outcome

In [9]:
# Peek at the first two rows of the LightGBM (no sentiments) frame.
LGBM_no_sent.head(2)

Unnamed: 0,date,high,pred_high
0,2020-06-23,4228.9,4233.207
1,2020-06-24,4123.0,4210.195


In [10]:
# Label the prediction column with the model name so it stays identifiable once frames are combined.
LGBM_no_sent.columns = [
    'date',
    'high',
    'Predicted high LGBM (no sentiments)',
]

In [11]:
# Size of the LightGBM (no sentiments) frame as (rows, columns).
LGBM_no_sent.shape

(358, 3)

In [12]:
# Peek at the first two rows of the LightGBM (with sentiments) frame.
LGBM_w_sent.head(2)

Unnamed: 0,date,high,pred_high
0,2020-06-23,4228.9,4263.27
1,2020-06-24,4123.0,4248.697


In [13]:
# Label the prediction column with the model name so it stays identifiable once frames are combined.
LGBM_w_sent.columns = [
    'date',
    'high',
    'Predicted high LGBM (with sentiments)',
]

In [14]:
# Size of the LightGBM (with sentiments) frame as (rows, columns).
LGBM_w_sent.shape

(358, 3)

## Prophet predictions outcome

In [15]:
# Peek at the first two rows of the Prophet frame.
Prophet.head(2)

Unnamed: 0,date,high,pred_high_lower,pred_high_upper,pred_high
0,2021-11-30,5342.799805,5227.875,5370.885,5298.366
1,2021-12-01,5360.0,5313.452,5459.738,5386.257


In [16]:
# Tag the Prophet-specific columns (lower/upper bound and prediction) with the model name.
Prophet.columns = [
    'date',
    'high',
    'pred_high_lower_Prophet',
    'pred_high_upper_Prophet',
    'Predicted high Prophet',
]

In [17]:
# Size of the Prophet frame as (rows, columns).
Prophet.shape

(369, 5)

## RF predictions outcome

In [18]:
# Peek at the first two rows of the random-forest (no sentiments) frame.
RF_no_sent.head(2)

Unnamed: 0,date,y_test,y_pred
0,2020-06-23,4228.899902,4309.011199
1,2020-06-24,4123.0,4203.002106


In [19]:
# Label the prediction column with the model name; 'y_test' keeps its original name.
RF_no_sent.columns = [
    'date',
    'y_test',
    'Predicted high RF (no sentiments)',
]

In [20]:
# Size of the random-forest (no sentiments) frame as (rows, columns).
RF_no_sent.shape

(358, 3)

In [21]:
# Label the prediction column with the model name; 'y_test' keeps its original name.
RF_w_sent.columns = [
    'date',
    'y_test',
    'Predicted high RF (with sentiments)',
]

In [22]:
# Peek at the first two rows of the random-forest (with sentiments) frame.
RF_w_sent.head(2)

Unnamed: 0,date,y_test,Predicted high RF (with sentiments)
0,2020-06-23,4228.899902,4290.706549
1,2020-06-24,4123.0,4204.649146


In [23]:
# Size of the random-forest (with sentiments) frame as (rows, columns);
# compare with RF_no_sent.shape before the two frames are combined.
RF_w_sent.shape

(727, 3)

## Linear regression predictions outcome

In [24]:
# Peek at the first two rows of the linear-regression (no sentiments) frame.
LR_no_sent.head(2)

Unnamed: 0,date,y_test,y_pred,label,predicted_label
0,2020-06-23,4228.899902,4241.896582,Actual,Predicted
1,2020-06-24,4123.0,4193.118008,Actual,Predicted


In [25]:
# Label the prediction column with the model name; the two label columns keep their names.
LR_no_sent.columns = [
    'date',
    'high',
    'Predicted high LR (no sentiments)',
    'label',
    'predicted_label',
]

In [26]:
# Peek at the first two rows of the linear-regression (with sentiments) frame.
LR_w_sent.head(2)

Unnamed: 0,date,y_test,y_pred
0,2020-06-23,4228.899902,4242.008489
1,2020-06-24,4123.0,4191.24086


In [27]:
# Label the prediction column with the model name so it stays identifiable once frames are combined.
LR_w_sent.columns = [
    'date',
    'high',
    'Predicted high LR (with sentiments)',
]

## Combining results into dataframe

In [28]:
# Combine the per-model predictions into one wide frame keyed on 'date'.
#
# The input frames do not all have the same number of rows (RF_no_sent has 358,
# RF_w_sent has 727), so gluing them side by side by row position would leave
# rows without a date and offers no guarantee that row i is the same day in
# every frame. Merging on 'date' aligns each model explicitly.
#
# how='outer' keeps every date for which at least one model has a prediction;
# a model with no prediction for a given day gets NaN in its column.
# The resulting column order is: date, y_test, RF (no/with), LGBM (no/with), LR (no/with).
df = (
    RF_no_sent
    .merge(RF_w_sent[['date', 'Predicted high RF (with sentiments)']],
           on='date', how='outer')
    .merge(LGBM_no_sent[['date', 'Predicted high LGBM (no sentiments)']],
           on='date', how='outer')
    .merge(LGBM_w_sent[['date', 'Predicted high LGBM (with sentiments)']],
           on='date', how='outer')
    .merge(LR_no_sent[['date', 'Predicted high LR (no sentiments)']],
           on='date', how='outer')
    .merge(LR_w_sent[['date', 'Predicted high LR (with sentiments)']],
           on='date', how='outer')
)

In [29]:
# Preview the combined RF / LGBM / LR frame.
df.head()

Unnamed: 0,date,y_test,Predicted high RF (no sentiments),Predicted high RF (with sentiments),Predicted high LGBM (no sentiments),Predicted high LGBM (with sentiments),Predicted high LR (no sentiments),Predicted high LR (with sentiments)
0,2020-06-23,4228.899902,4309.011199,4290.706549,4233.207,4263.27,4241.896582,4242.008489
1,2020-06-24,4123.0,4203.002106,4204.649146,4210.195,4248.697,4193.118008,4191.24086
2,2020-06-25,4230.100098,4165.671817,4166.28419,4192.94,4205.292,4187.463257,4182.84139
3,2020-06-26,4230.850098,4287.13301,4304.28281,4309.198,4302.127,4325.627181,4321.487069
4,2020-06-29,4150.0,4209.492186,4214.394953,4253.206,4232.973,4211.940476,4206.007914


In [30]:
# Attach the LSTM columns (including 'Actual High') to the combined frame by date.
# A left join keeps every row of df; dates missing from LSTM get NaN in the LSTM columns.
# The join key is given once: listing 'date' twice adds nothing to the match.
result = pd.merge(df, LSTM, how="left", on="date")

In [31]:
# 'y_test' is dropped here; the melt step further down uses the 'Actual High' column instead.
result = result.drop(columns=['y_test'])

In [32]:
# Preview the merged frame (all models plus 'Actual High').
result.head()

Unnamed: 0,date,Predicted high RF (no sentiments),Predicted high RF (with sentiments),Predicted high LGBM (no sentiments),Predicted high LGBM (with sentiments),Predicted high LR (no sentiments),Predicted high LR (with sentiments),Actual High,Predicted high LSTM (no sentiments),Predicted high LSTM (with sentiments)
0,2020-06-23,4309.011199,4290.706549,4233.207,4263.27,4241.896582,4242.008489,4228.899902,4055.678879,4027.823663
1,2020-06-24,4203.002106,4204.649146,4210.195,4248.697,4193.118008,4191.24086,4123.0,3811.401374,4058.600804
2,2020-06-25,4165.671817,4166.28419,4192.94,4205.292,4187.463257,4182.84139,4230.100098,4208.568634,3971.555611
3,2020-06-26,4287.13301,4304.28281,4309.198,4302.127,4325.627181,4321.487069,4230.850098,4265.89479,4080.846886
4,2020-06-29,4209.492186,4214.394953,4253.206,4232.973,4211.940476,4206.007914,4150.0,4250.807348,4025.863762


In [33]:
# List the column names; these are the series names used when reshaping to long format.
result.columns

Index(['date', 'Predicted high RF (no sentiments)',
       'Predicted high RF (with sentiments)',
       'Predicted high LGBM (no sentiments)',
       'Predicted high LGBM (with sentiments)',
       'Predicted high LR (no sentiments)',
       'Predicted high LR (with sentiments)', 'Actual High',
       'Predicted high LSTM (no sentiments)',
       'Predicted high LSTM (with sentiments)'],
      dtype='object')

In [34]:
# Reshape to long format: one row per (date, series) pair, with the series name in
# 'Method' and its value in 'High price'. The charts below encode these two columns.
plotted_series = [
    'Actual High',
    'Predicted high LSTM (no sentiments)',
    'Predicted high LSTM (with sentiments)',
    'Predicted high RF (no sentiments)',
    'Predicted high RF (with sentiments)',
    'Predicted high LGBM (no sentiments)',
    'Predicted high LGBM (with sentiments)',
    'Predicted high LR (no sentiments)',
    'Predicted high LR (with sentiments)',
]
melted_df = result.melt(
    id_vars=['date'],
    value_vars=plotted_series,
    var_name='Method',
    value_name='High price',
)

In [35]:
# Preview the long-format frame (columns: date, Method, High price).
melted_df.head(5)

Unnamed: 0,date,Method,High price
0,2020-06-23,Actual High,4228.899902
1,2020-06-24,Actual High,4123.0
2,2020-06-25,Actual High,4230.100098
3,2020-06-26,Actual High,4230.850098
4,2020-06-29,Actual High,4150.0


In [36]:
# Distinct series names present in the long-format frame.
melted_df.Method.unique()

array(['Actual High', 'Predicted high LSTM (no sentiments)',
       'Predicted high LSTM (with sentiments)',
       'Predicted high RF (no sentiments)',
       'Predicted high RF (with sentiments)',
       'Predicted high LGBM (no sentiments)',
       'Predicted high LGBM (with sentiments)',
       'Predicted high LR (no sentiments)',
       'Predicted high LR (with sentiments)'], dtype=object)

In [37]:
# Use the FiveThirtyEight ("538") theme for the Altair charts.
alt.themes.enable('fivethirtyeight')
# max_rows=None removes the row limit on the default data transformer,
# presumably so melted_df can be passed to the charts in full.
alt.data_transformers.enable('default', max_rows=None)

DataTransformerRegistry.enable('default')

In [38]:
# Two-panel view: a small "minimap" of all series on the right and a large detail
# chart on the left. Brushing a rectangle on the minimap sets the x/y domain of the
# detail chart.
zoom = alt.selection_interval(encodings=["x", "y"])

# Fixed colour per series so each method keeps the same colour; the i-th colour in
# `range` belongs to the i-th name in `domain`.
palette = alt.Scale(
    domain=[
        'Actual High',
        'Predicted high LSTM (no sentiments)',
        'Predicted high LSTM (with sentiments)',
        'Predicted high RF (no sentiments)',
        'Predicted high RF (with sentiments)',
        'Predicted high LGBM (no sentiments)',
        'Predicted high LGBM (with sentiments)',
        'Predicted high LR (no sentiments)',
        'Predicted high LR (with sentiments)',
    ],
    range=['#330000', '#FF0000', '#0000FF', '#00FF00', '#FF8000',
           '#663300', '#808080', '#00FFFF', '#FF00FF'],
)

# Hover selection keyed on the nearest date. It is defined here but is not attached
# to either of the two charts rendered by this cell.
nearest = alt.selection_point(nearest=True, on='mouseover', clear='mouseout',
                        fields=['date'], empty=False)

# Overview chart that carries the brush; series outside the brush are greyed out.
minimap = (
    alt.Chart(melted_df)
    .mark_line()
    .add_params(zoom)
    .encode(
        x="date:T",
        y="High price:Q",
        color=alt.condition(zoom, "Method", alt.value("darkgrey")),
    )
    .properties(
        width=200,
        height=200,
        title="PGHL 'high' price"))

# Detail chart: its axis domains follow the brush drawn on the minimap.
detail = (
    alt.Chart(melted_df)
    .mark_line()
    .encode(
        alt.X("date:T").scale(domain={"param": zoom.name, "encoding": "x"}),
        alt.Y("High price:Q").scale(domain={"param": zoom.name, "encoding": "y"}),
        color = alt.Color('Method:N',scale=palette, legend=alt.Legend(labelLimit = 400)),
    )
    .properties(width=600, height=400, title="PGHL 'high' stock price prediction results -- detail view")
)

# Only `detail` and `minimap` are rendered, so the hover helpers that were previously
# built in this cell without being added to the chart have been removed.
(detail | minimap).configure_legend(
    orient='none',direction = 'vertical',legendX = 650,  legendY = 250, offset=-200,
    symbolDirection='vertical',
        titleFontSize=16,
        labelFontSize=14
    ).configure_title(fontSize=18)





In [39]:
# Final comparison chart: detail view (lines + hover points + hover rule with a
# tooltip listing every series) next to a brushable minimap.
# `palette` (the colour scale) comes from an earlier cell.
source = melted_df

# Brush on the minimap; its x/y extent sets the axis domains of the detail view.
zoom = alt.selection_interval(encodings=["x", "y"])

minimap = (
    alt.Chart(source)
    .mark_line()
    .add_params(zoom)
    .encode(
        x="date:T",
        y="High price:Q",
        color=alt.condition(zoom, "Method", alt.value("darkgrey")),
    )
    .properties(width=200, height=200)
)

base = alt.Chart(source).encode(x='date:T')

# Series names, sorted; each becomes one tooltip entry on the hover rule.
columns = sorted(source.Method.unique())

# Hover selection on the nearest date. selection_point / add_params are the
# Altair 5 replacements for the deprecated selection_single / add_selection,
# and empty=False is the replacement for empty='none'.
selection = alt.selection_point(
    fields=['date'], nearest=True, on='mouseover', empty=False, clear='mouseout'
)

lines = base.mark_line().encode(
        alt.X("date:T").scale(domain={"param": zoom.name, "encoding": "x"}),
        alt.Y("High price:Q").scale(domain={"param": zoom.name, "encoding": "y"}),
        color = alt.Color('Method:N',scale=palette, legend=alt.Legend(labelLimit = 400))
).properties(width=600, height=400, title="PGHL 'high' stock price prediction results -- detail view")

# Point markers shown only for the hovered date.
points = lines.mark_point().transform_filter(selection)

# Vertical rule at the hovered date. The pivot turns each Method into its own column
# so a single tooltip can list the value of every series for that date.
rule = base.transform_pivot(
    'Method', value='High price', groupby=['date']
).mark_rule().encode(
    opacity=alt.condition(selection, alt.value(0.3), alt.value(0)),
    tooltip=[alt.Tooltip(c, type='quantitative') for c in columns]
).add_params(selection)

# Layer the detail view first, then place the minimap beside it.
((lines + points + rule) | minimap).configure_legend(
    orient='none',direction = 'vertical',legendX = 650,  legendY = 250, offset=-200,
    symbolDirection='vertical',
        titleFontSize=16,
        labelFontSize=14
    ).configure_title(fontSize=18)