In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
import numpy as np
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
from technical_indicators import calculate_daily_relative_difference

In [27]:
# Multipliers for the magnitude suffixes found in the "Vol." column.
_VOLUME_MULTIPLIERS = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

# Positional column layouts accepted by retrieve_data, keyed by column count.
_COLUMN_LAYOUTS = {
    7: ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"],
    6: ["Date", "Close", "Open", "High", "Low", "Change%"],
    5: ["Date", "Close", "Open", "High", "Low"],
}


def _parse_price(value):
    """Convert a price cell such as "2,058.20" (or an already numeric value) to float."""
    return float(str(value).replace(",", ""))


def _parse_volume(value, varname, date):
    """Convert one "Vol." cell ("1.5B", "320M", "12K", "12345", "-") to a number.

    Missing values (NaN, empty string, "-") become 0. A cell that cannot be
    parsed is reported on stdout and becomes NaN, so it can never silently
    inherit the volume of a neighbouring row.
    """
    if pd.isna(value):
        return 0
    text = str(value).strip().replace(",", "")
    if text in ("", "-"):
        return 0
    multiplier = _VOLUME_MULTIPLIERS.get(text[-1].upper())
    number = text if multiplier is None else text[:-1]
    try:
        return float(number) * (1 if multiplier is None else multiplier)
    except ValueError:
        print(varname, date, value)
        return float("nan")


def retrieve_data(filename, varname, start_date=datetime(2009, 7, 1), end_date=datetime(2019, 12, 31)):
    """Load one historical-price CSV into a cleaned, date-sorted DataFrame.

    The CSV columns are renamed by position, so the file must contain 7
    (Date, Close, Open, High, Low, Vol., Change%), 6 (no volume) or 5
    (no volume, no change) columns in that order.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    varname : str
        Prefix for the output columns (``<varname>_Close``, ``<varname>_Open``,
        ``<varname>_High``, ``<varname>_Low`` and, for 7-column files,
        ``<varname>_Volume``).
    start_date, end_date : datetime, optional
        Inclusive bounds of the period that is kept. The defaults reproduce
        the window the notebook originally hard-coded to leave out crisis
        periods.

    Returns
    -------
    pandas.DataFrame
        Whatever ``calculate_daily_relative_difference`` returns for the
        cleaned frame; that helper is defined outside this notebook and is
        expected to add the per-day relative open/close difference.

    Raises
    ------
    ValueError
        If the CSV does not have 5, 6 or 7 columns.
    """
    raw = pd.read_csv(filename)

    try:
        raw.columns = _COLUMN_LAYOUTS[len(raw.columns)]
    except KeyError:
        raise ValueError(
            f"{filename}: expected 5, 6 or 7 columns, found {len(raw.columns)}"
        ) from None
    raw["Date"] = pd.to_datetime(raw["Date"])

    parsed = pd.DataFrame({"Date": raw["Date"]})
    for field in ("Close", "Open", "High", "Low"):
        parsed[f"{varname}_{field}"] = raw[field].map(_parse_price)
    # Only the 7-column layout carries a volume column.
    if "Vol." in raw.columns:
        parsed[f"{varname}_Volume"] = [
            _parse_volume(volume, varname, date)
            for date, volume in zip(raw["Date"], raw["Vol."])
        ]

    parsed = parsed.sort_values(by="Date").reset_index(drop=True)

    # Keep only the requested period. The index is deliberately not reset
    # again, matching the previous behaviour; .copy() hands the helper an
    # independent frame rather than a slice of `parsed`.
    in_window = (parsed["Date"] >= start_date) & (parsed["Date"] <= end_date)
    parsed = parsed[in_window].copy()

    # add relative difference between open and close to data per day
    return calculate_daily_relative_difference(parsed, f"{varname}_Open", f"{varname}_Close", varname)

# Historical-price CSV for every index used in the notebook.
files = {
    # varname: filename
    "S&P500": "Dataset v2/Indices/S&P 500 Historical Data.csv",
    "US30": "Dataset v2/Indices/Dow Jones Industrial Average Historical Data.csv",
    "US2000": "Dataset v2/Indices/US Small Cap 2000 Historical Data.csv",
    "NASDAQ": "Dataset v2/Indices/NASDAQ Composite Historical Data.csv",
    "GER30": "Dataset v2/Indices/DAX Historical Data.csv",
    "CAC40": "Dataset v2/Indices/CAC 40 Historical Data.csv",
    "UK100": "Dataset v2/Indices/invUK100 Historical Data.csv",
    "SHCOMP": "Dataset v2/Indices/Shanghai Composite Historical Data.csv",
    "HS50": "Dataset v2/Indices/Hang Seng Historical Data.csv",
    "NIKKEI225": "Dataset v2/Indices/NIKKEI 225 Historical Data.csv",
}

# Index names in insertion order. list() replaces the manual append loop and
# no longer leaves a stray `file` loop variable in the notebook namespace.
files_list = list(files)

# The two US indices compared in the cross-correlation analysis.
df_sp500 = retrieve_data(files["S&P500"], "S&P500")
df_us30 = retrieve_data(files["US30"], "US30")

In [39]:
def plot_corr(lags, rs, title="Cross Correlation between S&P500 and US30 Daily Differences"):
    """Display a line plot of cross-correlation coefficients against lag.

    Parameters
    ----------
    lags : sequence of int
        Lag values, used as x coordinates.
    rs : sequence of float
        Correlation coefficient for each entry of ``lags`` (y coordinates).
    title : str, optional
        Figure title. The default keeps the title that was previously
        hard-coded, so existing ``plot_corr(lags, rs)`` calls are unchanged.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``. It is intentionally not
        returned, so a notebook cell ending in this call does not display the
        plot a second time.
    """
    fig = make_subplots(rows=1, cols=1, specs=[[{"type": "xy"}]])

    fig.add_trace(
        go.Scatter(x=lags, y=rs, mode="lines", line_color="#646ef9"),
        row=1,
        col=1,
    )

    fig.update_xaxes(title_text="Lag", row=1, col=1)
    fig.update_yaxes(title_text="Cross Correlation", row=1, col=1)
    fig.update_layout(title=title, height=600, width=800, showlegend=False)
    fig.show()

def crosscorr(datax, datay, lag=0):
    """Correlate ``datax`` with a lag-shifted copy of ``datay``.

    Parameters
    ----------
    datax, datay : pandas.Series
        The two series to compare.
    lag : int, default 0
        Number of periods passed to ``datay.shift``.

    Returns
    -------
    float
        Result of ``datax.corr`` applied to the shifted ``datay``.
    """
    shifted_y = datay.shift(lag)
    return datax.corr(shifted_y)


# Time lagged cross correlation of the daily relative changes of two indices
var1 = "S&P500"
var2 = "US30"
series1 = df_sp500[f"{var1}_relative_change_perc_1"]
series2 = df_us30[f"{var2}_relative_change_perc_1"]

# NOTE(review): Series.corr pairs values by index label, not by the Date
# column. This assumes both frames carry the same label for the same trading
# day - TODO confirm, or index both series by Date before correlating.

# only consider historic lags: -MAX_LAG, ..., -1.
# np.arange excludes its stop value, so the stop must be 0 (not -1) for
# lag -1 to be part of the scan.
MAX_LAG = 100
lags = np.arange(-MAX_LAG, 0)
# nan_to_num maps an undefined correlation (NaN) to 0 so max/min stay usable.
rs = np.nan_to_num([crosscorr(series1, series2, lag) for lag in lags])

print(f"Cross correlation between {var1} and {var2}")
print(f"Highest correlation of {np.max(rs)} at lag {lags[np.argmax(rs)]}")
print(f"Lowest correlation of {np.min(rs)} at lag {lags[np.argmin(rs)]}")

plot_corr(lags, rs)

Cross correlation between S&P500 and US30
Highest correlation of 0.05827945049449841 at lag -77
Lowest correlation of -0.08629235333518674 at lag -48
