In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
import numpy as np
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
from technical_indicators import calculate_daily_relative_difference

In [8]:
# reasons for 2018 dips 
# https://www.pbs.org/newshour/economy/making-sense/6-factors-that-fueled-the-stock-market-dive-in-2018

# covid
# https://www.epw.in/journal/2021/47/special-articles/impact-covid-19-stock-market-and-corporate-firms.html
# shows that number of cases and deaths have little to no relation to stock prices, so how stock prices react 
# to situation remains a matter of curiosity

In [13]:
def plot_closes_crises(df, title, column_close, varname):
    fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    
    df_before = df[df['Date'] <= datetime(2007, 10, 1)]
    df_between = df[(df['Date'] >= datetime(2009, 6, 1)) & (df['Date'] <= datetime(2020, 2, 1))]
    df_after = df[df['Date'] >= datetime(2020, 12, 1)]

    fig.add_trace(go.Scatter(x=df_before['Date'], y=df_before[column_close], mode="lines", name="Close Prices", line_color="#646ef9"), row=1, col=1)
    fig.add_trace(go.Scatter(x=df_between['Date'], y=df_between[column_close], mode="lines", line_color="#646ef9", showlegend=False), row=1, col=1)
    fig.add_trace(go.Scatter(x=df_after['Date'], y=df_after[column_close], mode="lines", line_color="#646ef9", showlegend=False), row=1, col=1)
    
    # economic crisis
    df_temp = df[(df['Date'] >= datetime(2007, 10, 1)) & (df['Date'] <= datetime(2009, 6, 1))]
    fig.add_trace(go.Scatter(x=df_temp['Date'], y=df_temp[column_close], mode="lines", name="2008 Economic Crisis", line={'color':'red'}), row=1, col=1)
    
    diff = (df[varname + '_Close'].max() - df[varname + '_Close'].min()) * 0.2
    notation_x = datetime(2007, 10, 1) + (datetime(2009, 6, 1) - datetime(2007, 10, 1))/2
    fig.add_annotation(x=notation_x, y=df_before.iloc[-1][varname + "_Close"] + diff,
            text="2008 Economic Crisis", showarrow=False)
    
    # covid-19
    df_temp = df[(df['Date'] >= datetime(2020, 2, 1)) & (df['Date'] <= datetime(2020, 12, 1))]
    fig.add_trace(go.Scatter(x=df_temp['Date'], y=df_temp[column_close], mode="lines", name="Covid-19 Crisis", line={'color':'red'}), row=1, col=1)
    
    diff = (df[varname + '_Close'].max() - df[varname + '_Close'].min()) * 0.2
    notation_x = datetime(2020, 2, 1) + (datetime(2020, 12, 1) - datetime(2020, 2, 1))/2
    fig.add_annotation(x=notation_x, y=df_temp[varname + "_Close"].min() - diff,
            text="Covid-19 Crisis", showarrow=False)
    
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Price", row=1, col=1)
    layout = dict(title=title, height=600, width=800, showlegend=False)
    fig.update_layout(layout)
    fig.write_image("Plots/" + varname + "_close_crises.png")
    fig.show()
    
def crisis_analysis(df, period_start, period_end, varname):
    column_open = varname + "_Open"
    column_close = varname + "_Close"
    df_temp = df[(df['Date'] >= period_start) & (df['Date'] < period_end)]
    # PAS OP DAT JE WEL DE EERSTE PIEK NEEMT EN NIET DE TWEEDE
    high = df_temp[column_close].max()
    low = df_temp[column_close].min()
    return (high-low)/high

def plot_closes(df, title, column_close, varname):
    fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    fig.add_trace(go.Scatter(x=df['Date'], y=df[column_close], mode="lines", name="Close prices"), row=1, col=1)
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Price", row=1, col=1)
    layout = dict(title=title, height=600, width=800)
    fig.update_layout(layout)
    fig.write_image("Plots/" + varname + "_close_reduced.png")
    fig.show()

def calculate_statistics(df, filename, varname):
    column_open = varname + "_Open"
    column_close = varname + "_Close"
    column_change = varname + "_relative_change_perc_1"

    df_up = df[df[column_change] > 0]
    df_down = df[df[column_change] <= 0] # smaller or equal since index generally goes up. 
        # Attempts to balance classes
        # Another possibility is to remove trend from index

    return [varname, 
            np.where(df[column_open] < df[column_close], 1, 0).sum(), 
            np.where(df[column_open] >= df[column_close], 1, 0).sum(),
            df[column_change].mean(),
            df[column_change].abs().mean(),
            df[column_change].abs().median(),
            df_up[column_change].mean(),
            df_up[column_change].abs().median(),
            abs(df_down[column_change].mean()),
            abs(df_down[column_change].median()),
            df_up[column_change].std(),
            abs(df_down[column_change].std())
           ]

In [17]:
def retrieve_data(filename, varname):
    df_new = pd.read_csv(filename)
    df_new["Date"] = pd.to_datetime(df_new["Date"])
    try:
        df_new.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]
    except ValueError:
        try:
            df_new.columns = ["Date", "Close", "Open", "High", "Low", "Change%"]
        except ValueError:
            df_new.columns = ["Date", "Close", "Open", "High", "Low"]

    rows = []
    for index, row in df_new.iterrows():
        date = row['Date']
        p_close = float(str(row['Close']).replace(',', ''))
        p_open = float(str(row['Open']).replace(',', ''))
        p_high = float(str(row['High']).replace(',', ''))
        p_low = float(str(row['Low']).replace(',', ''))
        if df_new.columns.tolist() == ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]:
            if str(row['Vol.']) == "-":
                volume = 0
            else:
                if row['Vol.'][-1] == "B": # volume in billions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000000
                elif row['Vol.'][-1] == "M": # volume in millions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000
                elif row['Vol.'][-1] == "K": # volume in thousands
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000
                else:
                    print(varname, row['Date'], row['Vol.'])
            rows.append([date, p_close, p_open, p_high, p_low, volume])
        else:
            rows.append([date, p_close, p_open, p_high, p_low])
    df_new = pd.DataFrame(rows)
    if len(rows[0]) == 6:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low", varname + "_Volume"]
    else:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low"]
    
    df_new = df_new.sort_values(by='Date').reset_index(drop=True)
#     plot_closes_crises(df_new, "Plot of " + varname + " Daily Close Prices", varname + "_Close", varname)
    
#     perc_crash_2008 = crisis_analysis(df_new, datetime(2007, 10, 1), datetime(2009, 6, 1), varname)
#     perc_crash_covid = crisis_analysis(df_new, datetime(2020, 2, 1), datetime(2020, 12, 1), varname)
#     print(f"{varname} 2008 crash {perc_crash_2008}")
#     print(f"{varname} Covid crash {perc_crash_covid}")
   
    # remove crises
    df_new = df_new[(df_new['Date'] >= datetime(2009, 7, 1)) & (df_new['Date'] <= datetime(2019, 12, 31))]
    
    # add relative difference between open and close to data per day
    df_new = calculate_daily_relative_difference(df_new,  varname + "_Open",  varname + "_Close", varname)
    
#     plot_closes(df_new, "Plot of " + varname + " Closing Prices (Reduced Interval)", varname + "_Close", varname)
    return df_new

files = {
    # varname: filename
    "S&P500": "Dataset v2/Indices/S&P 500 Historical Data.csv",
    "US30": "Dataset v2/Indices/Dow Jones Industrial Average Historical Data.csv", 
    "US2000": "Dataset v2/Indices/US Small Cap 2000 Historical Data.csv",
    "NASDAQ": "Dataset v2/Indices/NASDAQ Composite Historical Data.csv", 
    "GER30": "Dataset v2/Indices/DAX Historical Data.csv",
    "CAC40": "Dataset v2/Indices/CAC 40 Historical Data.csv",
    "UK100": "Dataset v2/Indices/invUK100 Historical Data.csv",
    "SSE50": "Dataset v2/Indices/Shanghai Composite Historical Data.csv",
    "HS50": "Dataset v2/Indices/Hang Seng Historical Data.csv",
    "NIKKEI225": "Dataset v2/Indices/NIKKEI 225 Historical Data.csv",
}

print(f"Retrieving {len(files)} files")
i = 1
df = pd.DataFrame()
stats_rows = []
for file in files:
    print(f"Retrieving {file} - {i}/{len(files)}")
    df = retrieve_data(files[file], file)
    stats_rows.append(calculate_statistics(df, files[file], file))
    i += 1
df_stats = pd.DataFrame(stats_rows)
df_stats.columns = ['market', 'candles up', 'candles down', 
                    'mean % change', 'mean abs % change', 'med abs % change',
                    'mean % change up', 'med % change up', 
                    'mean abs % change down', 'med abs % change down',
                    'standard deviation change up', 'standard deviation change down',
                    'variance change up', 'variance change down']
df_stats

Retrieving 10 files
Retrieving S&P500 - 1/10
Retrieving US30 - 2/10
Retrieving US2000 - 3/10
Retrieving NASDAQ - 4/10
Retrieving GER30 - 5/10
Retrieving CAC40 - 6/10
Retrieving UK100 - 7/10
Retrieving SSE50 - 8/10
Retrieving HS50 - 9/10
Retrieving NIKKEI225 - 10/10


ValueError: Length mismatch: Expected axis has 12 elements, new values have 14 elements

In [16]:
df_stats['candles down'] = -1 * df_stats['candles down']

KeyError: 'candles down'

In [15]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'xy'}]], shared_yaxes=True, horizontal_spacing=0)

fig.append_trace(go.Bar(x=df_stats['candles down'], y=df_stats['market'], 
                        orientation='h', showlegend=True,
                        text=df_stats['candles down'].abs(),
                        name='Downwards Candles', marker_color='#00377B'), 1, 1)
fig.append_trace(go.Bar(x=df_stats['candles up'], y=df_stats['market'],
                        orientation='h', showlegend=True, 
                        text=df_stats['candles up'], 
                        name='Upwards Candles',
                        marker_color='#0D96A2'), 1, 2)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix=' ', showline=False)
fig.update_traces(hovertemplate=None)

fig.update_layout(title='Overview of Number of Upwards and Downwards Candles per Index',
                  margin=dict(t=80, b=0, l=70, r=40),
                  hovermode="y unified",
                  plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF',
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5)
                 )
fig.write_image("distribution of candles.png")
fig.show()

KeyError: 'candles down'

In [None]:

fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
for i, row in df_stats.iterrows():
    fig.add_trace(go.Scatter(x=[0], y=[row['mean abs % change']], mode='markers', name=row['market']), row=1, col=1)
# fig.update_xaxes(title_text="Date", row=1, col=1)
# fig.update_yaxes(title_text="Price", row=1, col=1)
layout = dict(title="test")
fig.update_layout(layout)
fig.show()

In [None]:
# barplot of mean absolute percentual change

df_temp = df_stats.sort_values(by='mean abs % change', ascending=False).reset_index(drop=True)
df_temp['mean abs % change'] = df_temp['mean abs % change' ]* 100

fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
for i, row in df_temp.iterrows():
    fig.add_trace(go.Bar(x=[i], y=[row['mean abs % change']], name=row['market'], text=str(round(row['mean abs % change'], 2)) + "%"), row=1, col=1)

fig.update_xaxes(title_text="Index", showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text="Percentual Change per Day", row=1, col=1)
layout = dict(title="Mean Absolute Percentual Change per Day")
fig.update_layout(layout)
fig.show()

In [None]:
df_temp = df_stats.sort_values(by='mean % change up', ascending=False).reset_index(drop=True)
df_temp['mean % change up'] = df_temp['mean % change up' ]* 100

fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
for i, row in df_temp.iterrows():
    fig.add_trace(go.Bar(x=[i], y=[row['mean % change up']], name=row['market'], text=str(round(row['mean % change up'], 2)) + "%"), row=1, col=1)

fig.update_xaxes(title_text="Index", showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text="Percentual Upwards Change per Day", row=1, col=1)
layout = dict(title="Mean Absolute Percentual Upwards Change per Day")
fig.update_layout(layout)
fig.show()

In [None]:
df_temp = df_stats.sort_values(by='mean abs % change down', ascending=False).reset_index(drop=True)
df_temp['mean abs % change down'] = df_temp['mean abs % change down' ]* 100

fig = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
for i, row in df_temp.iterrows():
    fig.add_trace(go.Bar(x=[i], y=[row['mean abs % change down']], name=row['market'], text=str(round(row['mean abs % change down'], 2)) + "%"), row=1, col=1)

fig.update_xaxes(title_text="Index", showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text="Percentual Downwards Change per Day", row=1, col=1)
layout = dict(title="Mean Absolute Percentual Downwards Change per Day")
fig.update_layout(layout)
fig.show()

In [12]:
def retrieve_data_simplified(filename, varname):
    df_new = pd.read_csv(filename)
    df_new["Date"] = pd.to_datetime(df_new["Date"])
    try:
        df_new.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]
    except ValueError:
        try:
            df_new.columns = ["Date", "Close", "Open", "High", "Low", "Change%"]
        except ValueError:
            df_new.columns = ["Date", "Close", "Open", "High", "Low"]

    rows = []
    for index, row in df_new.iterrows():
        date = row['Date']
        p_close = float(str(row['Close']).replace(',', ''))
        p_open = float(str(row['Open']).replace(',', ''))
        p_high = float(str(row['High']).replace(',', ''))
        p_low = float(str(row['Low']).replace(',', ''))
        if df_new.columns.tolist() == ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]:
            if str(row['Vol.']) == "-":
                volume = 0
            else:
                if row['Vol.'][-1] == "B": # volume in billions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000000
                elif row['Vol.'][-1] == "M": # volume in millions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000
                elif row['Vol.'][-1] == "K": # volume in thousands
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000
                else:
                    print(varname, row['Date'], row['Vol.'])
            rows.append([date, p_close, p_open, p_high, p_low, volume])
        else:
            rows.append([date, p_close, p_open, p_high, p_low])
    df_new = pd.DataFrame(rows)
    if len(rows[0]) == 6:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low", varname + "_Volume"]
    else:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low"]
    
    df_new = df_new.sort_values(by='Date').reset_index(drop=True)
    df_new = df_new[(df_new['Date'] >= datetime(2009, 7, 1)) & (df_new['Date'] <= datetime(2019, 12, 31))]
    
    return df_new

def plot_distribution(df, column_open, column_close, varname):
    df['dif'] = (df[column_close] - df[column_open])
    df['%dif'] = (df[column_close] - df[column_open]) / df[column_open]
    
#     fig = px.histogram(df, x='dif')
#     fig.update_xaxes(title_text="Daily Difference Between Open and Close Prices")
#     fig.update_yaxes(title_text="Count")
#     layout = dict(title="Distribution of Difference Between Open and Close Prices")
#     fig.update_layout(layout)
#     fig.show()
    
    fig = px.histogram(df, x='%dif')
    
    fig.add_vline(x=0, line_dash='dash', line_color='firebrick')
    sub = df[df['%dif'] > 0]
    fig.add_vline(x=np.median(sub['%dif']), line_dash='dash', line_color='firebrick')
    if varname == "S&P500":
        print(np.median(sub['%dif']))
    sub = df[df['%dif'] < 0]
    fig.add_vline(x=np.median(sub['%dif']), line_dash='dash', line_color='firebrick')
    if varname == "S&P500":
        print(np.median(sub['%dif']))
    
    fig.update_xaxes(title_text="Daily Percentual Difference Between Open and Close Prices")
    fig.update_yaxes(title_text="Frequency")
    layout = dict(title=f"Distribution of {varname} Percentual Difference Between Open and Close Prices")
    fig.update_layout(layout)
    fig.write_image("Plots/" + varname + "_dist_perc_diff.png")
    fig.show()
    
files = {
    # varname: filename
    "S&P500": "Dataset v2/Indices/S&P 500 Historical Data.csv",
    "US30": "Dataset v2/Indices/Dow Jones Industrial Average Historical Data.csv", 
#     "US2000": "Dataset v2/Indices/US Small Cap 2000 Historical Data.csv",
    "NASDAQ": "Dataset v2/Indices/NASDAQ Composite Historical Data.csv", 
#     "GER30": "Dataset v2/Indices/DAX Historical Data.csv",
#     "CAC40": "Dataset v2/Indices/CAC 40 Historical Data.csv",
#     "UK100": "Dataset v2/Indices/invUK100 Historical Data.csv",
#     "SHCOMP": "Dataset v2/Indices/Shanghai Composite Historical Data.csv",
#     "HS50": "Dataset v2/Indices/Hang Seng Historical Data.csv",
#     "NIKKEI225": "Dataset v2/Indices/NIKKEI 225 Historical Data.csv",
}
    
print(f"Retrieving {len(files)} files")
i = 1
df = pd.DataFrame()
stats_rows = []
for file in files:
    print(f"Retrieving {file} - {i}/{len(files)}")
    df = retrieve_data_simplified(files[file], file)
    plot_distribution(df, file + "_Open", file + "_Close", file)

Retrieving 3 files
Retrieving S&P500 - 1/3
0.004077632475755081
-0.0036183622171721604


Retrieving US30 - 1/3


Retrieving NASDAQ - 1/3
