In [2]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
def retrieve_data(varname, filename):
    """Read a CSV file and parse its "Date" column as datetimes.

    Args:
        varname: Label of the dataset. Accepted for the caller's benefit;
            this function does not use it.
        filename: Source handed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with "Date" converted by ``pd.to_datetime``.
    """
    loaded = pd.read_csv(filename)
    parsed_dates = pd.to_datetime(loaded["Date"])
    loaded["Date"] = parsed_dates
    return loaded

def crosscorr(datax, datay, lag=0):
    """Correlate ``datax`` with a shifted copy of ``datay``.

    Args:
        datax: Series kept as is.
        datay: Series that is shifted by ``lag`` periods before correlating.
        lag: Number of periods passed to ``datay.shift``; defaults to 0.

    Returns:
        The result of ``datax.corr`` against the shifted series.
    """
    shifted = datay.shift(lag)
    return datax.corr(shifted)

def execute_time_lagged_cross_correlation(df, full_col_name, col_name, colored, closes, title, filename, leading=False):
    """Plot time-lagged cross-correlations of one column against several others.

    For every column in ``closes`` the correlation with ``df[full_col_name]`` is
    computed (via ``crosscorr``) at each integer lag from -10 to 10; the column
    from ``closes`` is the one being shifted. Curves whose absolute correlation
    stays below 0.1 at every lag are added as unlabelled light-gray lines; all
    other curves are added afterwards with a legend entry. The figure is
    written to ``filename`` and then shown.

    Args:
        df: DataFrame containing ``full_col_name`` and every column in ``closes``.
        full_col_name: Name of the reference column.
        col_name: Affix whose length is stripped from each column name to build
            its legend label.
        colored: Not used by this function; kept so existing calls keep working.
        closes: Iterable of column names to correlate against the reference.
        title: Figure title.
        filename: Destination handed to ``fig.write_image``.
        leading: If True, ``col_name`` is stripped from the start of each column
            name; otherwise from the end.

    Returns:
        None. The figure is written to disk and displayed as a side effect.
    """
    rmin = -10
    rmax = 10
    weak_limit = 0.1  # curves that never reach this absolute correlation are grayed out
    lags = list(range(rmin, rmax + 1))

    reference = df[full_col_name]
    weak_curves = []    # correlation lists drawn in gray, no legend entry
    strong_curves = []  # (legend label, correlation list) pairs
    for close_col in closes:
        # Compute each curve once; it is classified here and drawn further down.
        rs = [crosscorr(reference, df[close_col], lag) for lag in lags]
        if np.isnan(rs).all():
            # No lag produced a defined correlation, so there is nothing to draw.
            continue
        # NaN-aware peak, so the verdict does not depend on where a NaN sits in rs.
        if np.nanmax(np.abs(rs)) < weak_limit:
            weak_curves.append(rs)
        else:
            if leading:
                name = close_col[len(col_name):]
            else:
                name = close_col[:-len(col_name)]
            strong_curves.append((name, rs))

    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    # Gray curves are added first and labelled curves afterwards
    # (presumably so the labelled ones render on top).
    for rs in weak_curves:
        fig1.add_trace(go.Scatter(y=rs, mode="lines", showlegend=False, line=dict(color="lightgray")), row=1, col=1)
    for name, rs in strong_curves:
        fig1.add_trace(go.Scatter(y=rs, mode="lines", name=name), row=1, col=1)

    # Traces are plotted against list position, so lag 0 sits at position -rmin.
    fig1.add_vline(x=-rmin, line=dict(color="red", dash="dash"))
    fig1.update_layout(
        title = title,
        xaxis1 = dict(title_text = 'Offset',
                    tickmode = 'array',
                    # One tick per plotted position, labelled with the lag it represents.
                    tickvals = list(range(len(lags))),
                    ticktext = lags),
        yaxis1 = dict(title_text = "Pearson Correlation"),
        height = 1000,
        width = 920
    )
    fig1.write_image(filename)
    fig1.show()

In [20]:
# SP500
files = {
    # varname: filename
    "S&P500": "Dataset v3/SP500_combined_data_20220422.csv",
}
# df is overwritten on each pass, so it ends up holding the last listed file.
for varname, path in files.items():
    df = retrieve_data(varname, path)

# First plot: every column whose name ends with the relative-change suffix.
full_col_name = "SP500_relative_change_perc_1"
col_name = "_relative_change_perc_1"
closes = [column for column in df.columns if column.endswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between SP500 Relative Daily Change and Lagged Variables'
filename = 'Plots/Correlation Plot SP500 Relative2.png'
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename, leading=False
)

# Second plot: every column whose name starts with the index prefix.
full_col_name = "SP500_relative_change_perc_1"
col_name = "SP500_"
closes = [column for column in df.columns if column.startswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between SP500 Relative Daily Change and Technical Indicators'
filename = 'Plots/Correlation Plot SP500 Relative Technical Indicators2.png'
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename, leading=True
)

"""

DESCRIPTION FOR CORRELATION IN CLOSES
Positive
    In general, it can be seen that the correlation is highest for lag 0, suggesting co-movements 
        between the different stock prices, but no clear leader-follower relationships. This suggests that there is
        no clear use for adding multiple lags other than t-1 for each additional variable
    The indices are generally highly correlated with the SP500, with the lagged SP500 showing the highest correlation. 
        This is in line with expectations.
    Index futures also show relatively high correlations, especially that of the SP500 futures. The SSE50 and HS50
        futures show the lowest correlations by some margin.
    From the index constituents, APPL and FB show the highest correlation and GOOG the lowest.
    
    Forex pairs with base currency that is not USD are positively correlated. USDCHF shows little correlation with the SP500
Negative
    Forex pairs show the largest negative correlation, with NZDUSD showing the lowest correlation of the forex pairs.
    Commodities show an average negative correlation, with gold barely showing any correlation. 
    This is not in line with findings shown in other studies.
    
    For negative correlation in general, it can be seen that lag zero does not necessarily lead to the largest 
        (absolute) correlation. In fact, lag -50 often shows a larger negative correlation. 
        This shows somewhat of a leader-follower relation between the commodities and SP500, where the SP500 leads the 
        commodities. This is in line with findings presented by...
"""

'\n\nDESCRIPTION FOR CORRELATION IN CLOSES\nPositive\n    In general, it can be seen that the correlation is highest for lag 0, suggesting co-movements \n        between the different stock prices, but no clear leader-follower relationships. This suggests that there is\n        no clear use for adding multiple lags other than t-1 for each additional variable\n    The indices are generally highly correlated with the SP500, with the lagged SP500 showing the highest correlation. \n        This is in line with expectations.\n    Index futures also show relatively high correlations, especially that of the SP500 futures. The SSE50 and HS50\n        futures show the lowest correlations some margin.\n    From the index constituents, APPL and FB show the highest correlation and GOOG the lowest.\n    \n    Forex pairs with base currency that is not USD are positively correlated. USDCHF shows little correlation with the SP500\nNegative\n    Forex pairs show the largest negative correlation, with 

In [21]:
# US30
files = {
    # varname: filename
    "US30": "Dataset v3/US30_combined_data_20220422.csv",
}
# df is overwritten on each pass, so it ends up holding the last listed file.
for varname, path in files.items():
    df = retrieve_data(varname, path)

# First plot: every column whose name ends with the relative-change suffix.
full_col_name = "US30_relative_change_perc_1"
col_name = "_relative_change_perc_1"
closes = [column for column in df.columns if column.endswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between US30 Relative Daily Change and Lagged Variables'
filename = 'Plots/Correlation Plot US30 Relative2.png'
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename, leading=False
)

# Second plot: every column whose name starts with the index prefix.
full_col_name = "US30_relative_change_perc_1"
col_name = "US30_"
closes = [column for column in df.columns if column.startswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between US30 Relative Daily Change and Technical Indicators'
filename = 'Plots/Correlation Plot US30 Relative Technical Indicators2.png'
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename, leading=True
)

In [22]:
# NASDAQ
files = {
    # varname: filename
    "NASDAQ": "Dataset v3/NASDAQ_combined_data_20220422.csv",
}
# df is overwritten on each pass, so it ends up holding the last listed file.
for varname, path in files.items():
    df = retrieve_data(varname, path)

# First plot: every column whose name ends with the relative-change suffix.
full_col_name = "NASDAQ_relative_change_perc_1"
col_name = "_relative_change_perc_1"
closes = [column for column in df.columns if column.endswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between NASDAQ Relative Daily Change and Lagged Variables'
filename = 'Plots/Correlation Plot NASDAQ Relative2.png'
# `leading` is left at its default (False), i.e. col_name is treated as a suffix.
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename
)

# Second plot: every column whose name starts with the index prefix.
full_col_name = "NASDAQ_relative_change_perc_1"
col_name = "NASDAQ_"
closes = [column for column in df.columns if column.startswith(col_name)]

colored = []
title = 'Time Lagged Cross Correlation Between NASDAQ Relative Daily Change and Technical Indicators'
filename = 'Plots/Correlation Plot NASDAQ Relative Technical Indicators2.png'
execute_time_lagged_cross_correlation(
    df, full_col_name, col_name, colored, closes, title, filename, leading=True
)