In [1]:
from multiprocessing import Pool
import os
import time
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from correlation_analysis import get_data, get_close_data, get_close_df, offset_and_rolling_lag_corr, fit_sinusoid_to_correlation, fitted_sine_directional_acc, high_corr_perc
from statistical_tests import statistical_routine

# Reading the database file

In [2]:
# Precomputed lagged rolling correlation results; the path is relative to the working directory.
DATABASE_CSV = "Lagged Rolling Correlation Dataset.csv"
database = pd.read_csv(DATABASE_CSV)

## General Functions for Analysis

In [16]:
def plot_corr_and_sine(idx, database):
    """
    Plot the lagged rolling correlation of one database row with its fitted sine.

    idx is looked up with database.loc, so it is an index label of database.
    The selected row supplies the two variable names, the offset lag, the
    rolling lag and the periodicity shown in the legend.
    """
    row = database.loc[idx]
    indep_name = row["Independent Variable"]
    dep_name = row["Dependent Variable"]
    offset = row["Offset Lag"]
    rolling = row["Rolling Lag"]

    # Recompute the correlation series from the close prices of both variables.
    correlation = offset_and_rolling_lag_corr(
        get_close_data(indep_name),
        get_close_data(dep_name),
        offset,
        rolling,
    )
    sine_fit = fit_sinusoid_to_correlation(correlation)
    sample_points = np.arange(0, len(correlation), 1)

    heading = "{}'s correlation with current day {}: offset lag of {} and rolling lag {}".format(
        dep_name, indep_name, offset, rolling
    )
    sine_label = "Fitted Sinewave (Period={})".format(row["Periodicity"])

    plt.figure(figsize=(16,8))
    plt.title(heading, fontsize=18)
    plt.plot(correlation.values, label="Lagged Rolling Point Correlation")
    plt.plot(sine_fit["fitfunc"](sample_points), label=sine_label)
    plt.legend()
    plt.grid()
    plt.show()
      
def plot_normalised(list_of_var):
    """
    Draw every series in list_of_var on one chart after min-max scaling.

    Each element (numpy array or pandas series) is passed through
    minmax_scale before plotting, so all lines share the same vertical range.
    """
    plt.figure(figsize=(16,8))

    for series in list_of_var:
        scaled = minmax_scale(series)
        plt.plot(scaled)

    plt.grid()
    plt.show()
    
def convert_list_of_dfs_to_df(list_of_dfs):
    """
    Stack a collection of dataframes row-wise into a single dataframe.

    Parameters
    ----------
    list_of_dfs : iterable of pandas.DataFrame
        Frames to stack in order. Each frame keeps its own index labels.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation, or an empty dataframe when no frames
        are supplied.
    """
    frames = list(list_of_dfs)

    # pd.concat raises on an empty sequence, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat call avoids re-copying the accumulated rows on every iteration.
    return pd.concat(frames, axis=0)

def run_pool(func, func_inputs):
    """
    Run func over func_inputs on a multiprocess pool with one worker per CPU.

    Parameters
    ----------
    func : callable
        Function executed in the worker processes. It must be picklable,
        i.e. importable from a module rather than defined inline.
    func_inputs : iterable of iterables
        One argument list per call; each is unpacked as func(*args).

    Returns
    -------
    list
        One result per input argument list, as returned by Pool.starmap.
    """
    # The context manager tears the workers down even when starmap raises,
    # so a failed run does not leave stray processes behind in the kernel.
    with Pool(os.cpu_count()) as pool:
        # starmap blocks until every result is available, so leaving the
        # with-block afterwards cannot cut off unfinished work.
        func_output = pool.starmap(func, func_inputs)

    return func_output

# Phase 1: Experiments; Creating Pandas Queries - Subsets of the overall database 
## Query 1: 
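- Independent Variable is one of the coins listed in `indep_variables` (currently BTC, ETH, ADA, XRP, BCH and LTC)
- Independent Variable cannot equal Dependent Variable (no autocorrelation cases)
- Offset Lag must be at least the Rolling Lag (no overlap between correlated points)
- High correlation % must be greater than `high_corr_thresh`, currently 0.2 (tendency to have strong correlations so future trends can be discerned)
- Direction accuracy must be greater than `dir_acc_thresh`, currently 0.7 (periodic predictability)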

In [22]:
# Tunable settings for Query 1.
sort_order = "Directional Accuracy"
high_corr_thresh = 0.2
dir_acc_thresh = 0.7
# To include every coin, use database["Independent Variable"].unique() instead.
indep_variables = ["BTC-USD", "ETH-USD", "ADA-USD", "XRP-USD", "BCH-USD", "LTC-USD"]

# One named boolean mask per condition.
q1_selected_indep = database["Independent Variable"].isin(indep_variables)
q1_not_autocorr = database["Independent Variable"] != database["Dependent Variable"]  # removing autocorrelation cases
q1_no_overlap = database["Offset Lag"] >= database["Rolling Lag"]  # ensuring no overlap between correlated points
q1_high_corr = database["High Correlation Percentage"] > high_corr_thresh
q1_dir_acc = database["Directional Accuracy"] > dir_acc_thresh

q1_mask = q1_selected_indep & q1_not_autocorr & q1_no_overlap & q1_high_corr & q1_dir_acc

# Highest value of the sort column first.
q1_df = database[q1_mask].sort_values(by=sort_order, ascending=False)

q1_df.head()

Unnamed: 0.1,Unnamed: 0,Independent Variable,Dependent Variable,Offset Lag,Rolling Lag,Threshold,Periodicity,Directional Accuracy,High Correlation Percentage
634962,634962,BTC-USD,BNB-USD,29,29,0.7,34,0.772727,0.262136
610573,610573,BTC-USD,BNB-USD,28,28,0.7,34,0.764516,0.244373
563477,563477,BTC-USD,BNB-USD,28,26,0.7,34,0.762821,0.239617
491992,491992,BTC-USD,BNB-USD,27,23,0.7,34,0.759494,0.205047
611414,611414,BTC-USD,BNB-USD,29,28,0.7,34,0.757282,0.264516


## Query 2:
- Adding conditions for:
    - rolling lag lower threshold
    - offset lag lower threshold
    - periodicity upper threshold
    - high correlation percentage lower threshold
    - directional accuracy lower threshold
    - sort order
    
- Creating for loop so it can be run for every variable combination

In [6]:
# Tunable settings for Query 2.
rolling_lag = 3             # lower bound on "Rolling Lag"
offset_lag = 2              # lower bound on "Offset Lag"
period_upper_thresh = 100   # upper bound on "Periodicity"
high_corr_thresh = 0.3      # lower bound (exclusive) on "High Correlation Percentage"
dir_acc_thresh = 0.3        # lower bound (exclusive) on "Directional Accuracy"
k = 100                     # rows kept per (independent, dependent) pair
sort_order = "Directional Accuracy"

names = database["Independent Variable"].unique()

# Conditions that do not depend on the variable pair are evaluated once here
# rather than being recomputed over the whole database for every pair.
q2_base_mask = (
    (database["Independent Variable"] != database["Dependent Variable"]) &
    (database["Offset Lag"] >= database["Rolling Lag"]) &
    (database["Rolling Lag"] >= rolling_lag) &
    (database["Offset Lag"] >= offset_lag) &
    (database["Periodicity"] <= period_upper_thresh) &
    (database["High Correlation Percentage"] > high_corr_thresh) &
    (database["Directional Accuracy"] > dir_acc_thresh)
)

# NOTE: only the first two names are paired; widen both slices to cover every
# variable combination. Dependent names are drawn from the independent
# variable list as well.
q2_pair_frames = []

for indep in names[:2]:
    for dep in names[:2]:

        # Self-pairs can never match because the base mask excludes autocorrelation.
        if indep == dep:
            continue

        pair_mask = (
            q2_base_mask &
            (database["Independent Variable"] == indep) &
            (database["Dependent Variable"] == dep)
        )

        # Keep the k best rows of this pair according to sort_order.
        pair_top_k = database[pair_mask].sort_values(by=sort_order, ascending=False).iloc[:k]
        q2_pair_frames.append(pair_top_k)

# Concatenate once instead of growing the frame inside the loop; fall back to
# an empty frame with the database's columns when there is nothing to stack.
q2_df = pd.concat(q2_pair_frames, axis=0) if q2_pair_frames else database.iloc[:0]

q2_df.head()

Unnamed: 0.1,Unnamed: 0,Independent Variable,Dependent Variable,Offset Lag,Rolling Lag,Threshold,Periodicity,Directional Accuracy,High Correlation Percentage
42051,42051,BTC-USD,ETH-USD,24,4,0.7,5,0.609467,0.380531
18503,18503,BTC-USD,ETH-USD,24,3,0.7,5,0.60767,0.555882
21026,21026,BTC-USD,ETH-USD,27,3,0.7,4,0.60119,0.522255
38687,38687,BTC-USD,ETH-USD,20,4,0.7,7,0.599415,0.346939
63917,63917,BTC-USD,ETH-USD,22,5,0.7,11,0.59882,0.308824


# Phase 2: Hypotheses; Defining Hypotheses - To run statistical tests on
- These hypotheses are located in the "hypotheses.py" file: they need to be in a file so they can be inserted as an input into the statistical_routine function on the multiprocess pool

In [7]:
from hypotheses import high_corr_hypothesis, high_corr_while_incr, high_corr_while_decr, incr_corr, decr_corr

# Phase 3: Running Multiprocess Pool; running statistical tests on the processor pool and obtaining results

### Defining Inputs for Multiprocess Pool

In [12]:
# One argument list per Query 1 row: the four row fields followed by the
# hypothesis function, ready to be unpacked by the pool.
pool_arg_columns = ["Independent Variable", "Dependent Variable", "Offset Lag", "Rolling Lag"]

inputs = [row_args + [incr_corr] for row_args in q1_df[pool_arg_columns].values.tolist()]

### Running Multiprocess Pool

In [18]:
# One statistical_routine call per entry of inputs, spread over the process pool.
output = run_pool(func=statistical_routine, func_inputs=inputs)

### Converting Outputs of Multiprocess Pool into single dataframe

In [20]:
# Stack the per-input result frames returned by the pool into one table.
stats_df = convert_list_of_dfs_to_df(list_of_dfs=output)

### Querying Dataframe for results

In [21]:
# Rows for the independent variable's returns where the Mann-Whitney test rejects the null hypothesis.
is_indep_returns = stats_df["Market Variable"] == "Independent Returns"
mann_whitney_rejects = stats_df["Mann Whitney Res (Means)"] == "Reject NH"

stats_df.loc[is_indep_returns & mann_whitney_rejects]

Unnamed: 0,Independent Name,Dependent Name,Offset Lag,Rolling Lag,Market Variable,Mann Whitney P value,Mann Whitney Res (Means),Levene P value,Levene Res (Variances),Kruskal P value,Kruskal Res (Medians)
0,BTC-USD,BNB-USD,29,29,Independent Returns,0.049533,Reject NH,0.884796,Can't Reject NH,0.049486,Reject NH
0,BTC-USD,BNB-USD,28,28,Independent Returns,0.040439,Reject NH,0.715092,Can't Reject NH,0.040402,Reject NH
0,BTC-USD,BNB-USD,29,24,Independent Returns,0.019576,Reject NH,0.733599,Can't Reject NH,0.019557,Reject NH
0,BTC-USD,EGLD-USD,26,26,Independent Returns,0.048208,Reject NH,0.84681,Can't Reject NH,0.048164,Reject NH
0,BTC-USD,BNB-USD,28,21,Independent Returns,0.024209,Reject NH,0.694265,Can't Reject NH,0.024187,Reject NH
0,XRP-USD,LTC-USD,25,21,Independent Returns,0.039325,Reject NH,0.30196,Can't Reject NH,0.039289,Reject NH
