In [315]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
import numpy as np
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import math
import collections
from technical_indicators import calculate_daily_relative_difference

In [316]:
def save_uk100(df, filename):
#     display(df)
    df.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "H", "aL", "L"]
    df = df[["Date", "Close", "Open", "High", "Low", "Vol."]]
    df.to_csv("Dataset v3" + filename[10:], index=False)
    print(f"\tSaved file as: Dataset v3" + filename[10:])

def save_nikkei225(df, filename):
#     display(df)
    df.columns = ["Date", "Close", "Open", "High", "Low", "Vol."]
    df.to_csv("Dataset v3" + filename[10:], index=False)
    print(f"\tSaved file as: Dataset v3" + filename[10:])

In [320]:
def plot_dist_line(df, mu, col, varname, title="", xaxis=""):
    
    np.random.seed(111)

    mu = df[col].mean()
    var = df[col].var()
    
    a = (((1 - mu) / var) - (1 / mu)) * mu ** 2
    b = a * ((1 / mu) - 1)
    distribution = np.random.beta(a=a, b=b, size=15000)
    for i in range(len(distribution)):
        distribution[i] = round(distribution[i],4)
        
    frequency = {}
    for item in distribution:
        if item in frequency:
            frequency[item] += 1
        else:
            frequency[item] = 1
    frequency = collections.OrderedDict(sorted(frequency.items()))
        
    val = [*frequency.keys()]
    freq = [frequency[x] for x in val]
    
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=df[col], showlegend=False))
    fig.add_trace(go.Scatter(x=val, y=freq, mode="lines", line_color='firebrick', showlegend=False))
    fig.update_xaxes(title_text=xaxis)
    fig.update_yaxes(title_text="Count")
    layout = dict(title=title, width=700, height=600)
    fig.update_layout(layout)
    fig.write_image("Plots/" + varname + "_dist_" + col + "_line.png")
    fig.show()

def plot_dist(df, col, varname, title="", xaxis=""):        
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=df[col]))
    fig.update_xaxes(title_text=xaxis)
    fig.update_yaxes(title_text="Count")
    layout = dict(title=title, width=700, height=600)
    fig.update_layout(layout)
    fig.write_image("Plots/" + varname + "_dist_" + col + ".png")
    fig.show()


def impute_candles(df, varname):
    
    df['HighDev%'] = (df[varname + '_High'] - (df[varname + '_Close'] + df[varname + '_Open']) / 2) / ((df[varname + '_Close'] + df[varname + '_Open']) / 2)
    df['LowDev%'] = (df[varname + '_Low'] - (df[varname + '_Open'] + df[varname + '_Close']) / 2) / ((df[varname + '_Open'] + df[varname + '_Close']) / 2)    
    df['AbsLowDev%'] = abs(df['LowDev%'])

    # plots resemble exponential distribution
    mu_high = df['HighDev%'].mean()
    mu_low = df['AbsLowDev%'].mean()
    
    plot_dist(df, 'HighDev%', varname, "Distribution of Highs Relative to Open and Close Prices per Day", "High Price Relative to Open and Close Prices per Day")
    plot_dist(df, 'AbsLowDev%', varname, "Distribution of Absolute Lows Relative to Open and Close Prices per Day", "Absolute Low Price Relative to Open and Close Prices per Day")
    
    plot_dist_line(df, mu_high, 'HighDev%', varname, "Distribution of Highs Relative to Open and Close Prices per Day", "High Price Relative to Open and Close Prices per Day")
    plot_dist_line(df, mu_low, 'AbsLowDev%', varname, "Distribution of Absolute Lows Relative to Open and Close Prices per Day", "Absolute Low Price Relative to Open and Close Prices per Day")
    
    sample_high = np.random.exponential(scale=mu_high, size=1)
    sample_low = np.random.exponential(scale=mu_low, size=1)
    
    np.random.seed(222)
    
    mu = df['HighDev%'].mean()
    var = df['HighDev%'].var()
    a = (((1 - mu) / var) - (1 / mu)) * mu ** 2
    b = a * ((1 / mu) - 1)
    high_samples = np.random.beta(a=a, b=b, size=2)
    print(f"\tFormulated beta distribution with alpha={a} and beta={b}")
    print(f"\tHigh samples: {high_samples}")
    
    mu = df['AbsLowDev%'].mean()
    var = df['AbsLowDev%'].var()
    a = (((1 - mu) / var) - (1 / mu)) * mu ** 2
    b = a * ((1 / mu) - 1)
    low_samples = -1 * np.random.beta(a=a, b=b, size=2)
    print(f"\tFormulated beta distribution with alpha={a} and beta={b}")
    print(f"\tLow samples: {low_samples}")
    
    dates = [datetime(2010,7,28), datetime(2019,3,5)]
    for i, date in enumerate(dates):
        date_next = (date + timedelta(days=1))
        date_prev = (date - timedelta(days=1))
        for x in df.iterrows():

            if x[1]['Date'] == date_prev:
                p_open = x[1][varname + '_Close']
            elif x[1]['Date'] == date_next:
                p_close = x[1][varname + '_Open']
                
        p_high = (p_close + p_open) / 2 + high_samples[i] * ((p_close + p_open) / 2)
        p_low = (p_close + p_open) / 2 - low_samples[i] * ((p_close + p_open) / 2)
        print(f"\tAppending {str(date)} with open {p_open}\thigh {p_high}\tlow {p_low}\t close {p_close}")
        df = df.append({'Date': date,
                        varname + '_Close': p_close,
                        varname + '_Open': p_open,
                        varname + '_High': p_high,
                        varname + '_Low': p_low,
                        varname + '_Volume': 0.0,
#                         varname + '_Change%': perc_change # Change% not in NIKKEI225 data
                       }, ignore_index=True)
    df = df.sort_values(by='Date').reset_index(drop=True)
    return df
    

In [321]:
def add_candles(df, varname):
    dates = ["28/07/2010", "29/07/2010"]
    dates = [datetime(2010,7,28), datetime(2010,7,29)]
    prices = [[9614.74, 9760.31, 9614.74, 9753.27], 
              [9653.51, 9732.76, 9648.97, 9696.02]]
    volumes = [0, 0]
    
    for i, date in enumerate(dates):
        p_open = prices[i][0]
        p_high = prices[i][1]
        p_low = prices[i][2]
        p_close = prices[i][3]
        
        perc_change = (p_close - p_open) / p_open
        
        volume = volumes[i]
        
        print(f"\tAppending {str(date)} with open {p_open}\thigh {p_high}\tlow {p_low}\t close {p_close}")
        df = df.append({'Date': date,
                        varname + '_Close': p_close,
                        varname + '_Open': p_open,
                        varname + '_High': p_high,
                        varname + '_Low': p_low,
                        varname + '_Volume': volume,
#                         varname + '_Change%': perc_change # Change% not in NIKKEI225 data
                       }, ignore_index=True)
    df = df.sort_values(by='Date').reset_index(drop=True)
    return df

In [322]:
def retrieve_data(filename, varname):
    df_new = pd.read_csv(filename)
    df_new["Date"] = pd.to_datetime(df_new["Date"])
    try:
        df_new.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]
    except ValueError:
        try:
            df_new.columns = ["Date", "Close", "Open", "High", "Low", "Change%"]
        except ValueError:
            df_new.columns = ["Date", "Close", "Open", "High", "Low"]

    rows = []
    for index, row in df_new.iterrows():
        date = row['Date']
        p_close = float(str(row['Close']).replace(',', ''))
        p_open = float(str(row['Open']).replace(',', ''))
        p_high = float(str(row['High']).replace(',', ''))
        p_low = float(str(row['Low']).replace(',', ''))
        if df_new.columns.tolist() == ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]:
            if str(row['Vol.']) == "-":
                volume = 0
            else:
                if row['Vol.'][-1] == "B": # volume in billions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000000
                elif row['Vol.'][-1] == "M": # volume in millions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000
                elif row['Vol.'][-1] == "K": # volume in thousands
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000
                else:
                    print(varname, row['Date'], row['Vol.'])
            rows.append([date, p_close, p_open, p_high, p_low, volume])
        else:
            rows.append([date, p_close, p_open, p_high, p_low])
    df_new = pd.DataFrame(rows)
    if len(rows[0]) == 6:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low", varname + "_Volume"]
    else:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low"]
    
    df_new = df_new.sort_values(by='Date').reset_index(drop=True)
    df_new = df_new[(df_new['Date'] >= datetime(2009, 7, 1)) & (df_new['Date'] <= datetime(2019, 12, 31))]
    
    return df_new

files = {
    # varname: filename
    "UK100": "Dataset v2/Indices/invUK100 Historical Data.csv",
    "NIKKEI225": "Dataset v2/Indices/NIKKEI 225 Historical Data.csv",
}

print(f"Retrieving {len(files)} files")
i = 1
df = pd.DataFrame()
stats_rows = []
for file in files:
    print(file)
    df = retrieve_data(files[file], file)
    if file == "UK100":
        df = impute_candles(df, file)
        save_uk100(df, files[file])
    elif file == "NIKKEI225":
        df = add_candles(df, file)
        save_nikkei225(df, files[file])
    i += 1

Retrieving 2 files
UK100


	Formulated beta distribution with alpha=2.3079760027963077 and beta=405.28680035811306
	High samples: [0.01492081 0.00257647]
	Formulated beta distribution with alpha=2.0169124244123195 and beta=342.104754003317
	Low samples: [-0.00313893 -0.00916729]
	Appending 2010-07-28 00:00:00 with open 869.0	high 878.8706756638805	low 868.6681599298955	 close 862.9
	Appending 2019-03-05 00:00:00 with open 1153.2	high 1160.3318739175538	low 1167.9597616860513	 close 1161.5
	Saved file as: Dataset v3/Indices/invUK100 Historical Data.csv
NIKKEI225
	Appending 2010-07-28 00:00:00 with open 9614.74	high 9760.31	low 9614.74	 close 9753.27
	Appending 2010-07-29 00:00:00 with open 9653.51	high 9732.76	low 9648.97	 close 9696.02
	Saved file as: Dataset v3/Indices/NIKKEI 225 Historical Data.csv
