In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from technical_indicators import calculate_daily_relative_difference, \
    calculate_average_relative_difference, \
    calculate_exponential_moving_average, \
    calculate_moving_average, \
    calculate_average_true_range, \
    calculate_weeks_high, \
    calculate_weeks_low, \
    calculate_relative_strength_index, \
    calculate_stochastic_k, \
    calculate_stochastic_d, \
    calculate_momentum, \
    calculate_williams_r, \
    calculate_ad_oscillator, \
    calculate_disparity, \
    calculate_bollinger_bands, \
    calculate_on_balance_volume, \
    calculate_stdev_on_balance_volume, \
    calculate_moving_average_convergence_divergence

In [3]:
US_holidays = [
            datetime(2009,7,3), # Independence day
            datetime(2009,9,7), # Labor day
            datetime(2009,11,26), # Thanksgiving
            datetime(2009,12,25), # Christmas
               
            datetime(2010,1,1), # New Year's day
            datetime(2010,1,18), # Martin Luther King Jr. day
            datetime(2010,2,15), # President's day
            datetime(2010,4,2), # Good Friday
            datetime(2010,5,31), # Memorial day
            datetime(2010,7,5), # Independence day
            datetime(2010,9,6), # Labor day
            datetime(2010,11,25), # Thanksgiving
            datetime(2010,12,24), # Christmas
               
            datetime(2011,1,1), # New Year's day
            datetime(2011,1,17), # Martin Luther King Jr. day
            datetime(2011,2,21), # President's day
            datetime(2011,4,22), # Good Friday
            datetime(2011,5,30), # Memorial day
            datetime(2011,7,4), # Independence day
            datetime(2011,9,5), # Labor day
            datetime(2011,11,24), # Thanksgiving
            datetime(2011,12,26), # Christmas
               
            datetime(2012,1,1), # New Year's day
            datetime(2012,1,2), # New Year's day
            datetime(2012,1,16), # Martin Luther King Jr. day
            datetime(2012,2,20), # President's day
            datetime(2012,4,6), # Good Friday
            datetime(2012,5,28), # Memorial day
            datetime(2012,7,4), # Independence day
            datetime(2012,9,3), # Labor day
    
            datetime(2012,10,29), # Labor day
            datetime(2012,10,30), # Labor day
    
            datetime(2012,11,22), # Thanksgiving
            datetime(2012,12,25), # Christmas

            datetime(2013,1,1), # New Year's day
            datetime(2013,1,21), # Martin Luther King Jr. day
            datetime(2013,2,18), # President's day
            datetime(2013,3,29), # Good Friday
            datetime(2013,5,27), # Memorial day
            datetime(2013,7,4), # Independence day
            datetime(2013,9,2), # Labor day
            datetime(2013,11,28), # Thanksgiving
            datetime(2013,12,25), # Christmas
            
            datetime(2014,1,1), # New Year's day
            datetime(2014,1,20), # Martin Luther King Jr. day
            datetime(2014,2,17), # President's day
            datetime(2014,4,18), # Good Friday
            datetime(2014,5,26), # Memorial day
            datetime(2014,7,4), # Independence day
            datetime(2014,9,1), # Labor day
            datetime(2014,11,27), # Thanksgiving
            datetime(2014,12,25), # Christmas
            
            datetime(2015,1,1), # New Year's day
            datetime(2015,1,19), # Martin Luther King Jr. day
            datetime(2015,2,16), # President's day
            datetime(2015,4,3), # Good Friday
            datetime(2015,5,25), # Memorial day
            datetime(2015,7,3), # Independence day
            datetime(2015,9,7), # Labor day
            datetime(2015,11,26), # Thanksgiving
            datetime(2015,12,25), # Christmas
                     
            datetime(2016,1,1), # New Year's day
            datetime(2016,1,18), # Martin Luther King Jr. day
            datetime(2016,2,15), # President's day
            datetime(2016,3,25), # Good Friday
            datetime(2016,5,30), # Memorial day
            datetime(2016,7,4), # Independence day
            datetime(2016,9,5), # Labor day
            datetime(2016,11,24), # Thanksgiving
            datetime(2016,12,26), # Christmas
            
            datetime(2017,1,1), # New Year's day
            datetime(2017,1,2), # New Year's day
            datetime(2017,1,16), # Martin Luther King Jr. day
            datetime(2017,2,20), # President's day
            datetime(2017,4,14), # Good Friday
            datetime(2017,5,29), # Memorial day
            datetime(2017,7,4), # Independence day
            datetime(2017,9,4), # Labor day
            datetime(2017,11,23), # Thanksgiving
            datetime(2017,12,25), # Christmas
               
            datetime(2018,1,1), # New Year's day
            datetime(2018,1,15), # Martin Luther King Jr. day
            datetime(2018,2,19), # President's day
            datetime(2018,3,30), # Good Friday
            datetime(2018,5,28), # Memorial day
            datetime(2018,7,4), # Independence day
            datetime(2018,9,3), # Labor day
            datetime(2018,11,22), # Thanksgiving
            datetime(2018,12,5), # Christmas
            datetime(2018,12,25), # Christmas
            
            datetime(2019,1,1), # New Year's day
            datetime(2019,1,21), # Martin Luther King Jr. day
            datetime(2019,2,18), # President's day
            datetime(2019,4,19), # Good Friday
            datetime(2019,5,27), # Memorial day
            datetime(2019,7,4), # Independence day
            datetime(2019,9,2), # Labor day
            datetime(2019,11,28), # Thanksgiving
            datetime(2019,12,25) # Christmas
]

In [4]:
files = {
    # varname: filename
    "SP500": "Dataset v3/Indices/S&P 500 Historical Data.csv",
    "US30": "Dataset v3/Indices/Dow Jones Industrial Average Historical Data.csv", 
    "US2000": "Dataset v3/Indices/US Small Cap 2000 Historical Data.csv",
    "NASDAQ": "Dataset v3/Indices/NASDAQ Composite Historical Data.csv", 
    "GER30": "Dataset v3/Indices/DAX Historical Data.csv",
    "CAC40": "Dataset v3/Indices/CAC 40 Historical Data.csv",
    "UK100": "Dataset v3/Indices/invUK100 Historical Data.csv",
    "SSE50": "Dataset v3/Indices/Shanghai Composite Historical Data.csv",
    "HS50": "Dataset v3/Indices/Hang Seng Historical Data.csv",
    "NIKKEI225": "Dataset v3/Indices/Nikkei 225 Historical Data.csv",
    
    "SP500_F": "Dataset v3/Index Futures/US 500 Cash Historical Data.csv",
    "US30_F": "Dataset v3/Index Futures/US 30 Cash Historical Data.csv", 
    "US2000_F": "Dataset v3/Index Futures/Small Cap 2000 Cash Historical Data.csv", 
    "NASDAQ_F": "Dataset v3/Index Futures/US Tech 100 Cash Historical Data.csv", 
    "GER30_F": "Dataset v3/Index Futures/DAX Futures Historical Data.csv", 
    "CAC40_F": "Dataset v3/Index Futures/CAC 40 Futures Historical Data.csv", 
    "UK100_F": "Dataset v3/Index Futures/FTSE 100 Futures Historical Data.csv",
    "SSE50_F": "Dataset v3/Index Futures/CSI 300 Futures Historical Data.csv",
    # "SHCOMP_F": "Dataset v3/Futures/SHCOMP Futures Historical Data.csv", # have not been able to find SHCOMP Futures data
    "HS50_F": "Dataset v3/Index Futures/Hang Seng Futures Historical Data.csv",
    "NIKKEI225_F": "Dataset v3/Index Futures/Nikkei 225 Futures Historical Data.csv",

    "AAPL": "Dataset v3/Index Constituents/NASDAQ/AAPL Historical Data.csv", 
    "AMZN": "Dataset v3/Index Constituents/NASDAQ/AMZN Historical Data.csv",
    "MSFT": "Dataset v3/Index Constituents/NASDAQ/MSFT Historical Data.csv", 

    "BrentOil_F": "Dataset v3/Commodities/Brent Oil Futures Historical Data.csv", 
    "Copper_F": "Dataset v3/Commodities/Copper Futures Historical Data.csv", 
    "WTIOil_F": "Dataset v3/Commodities/Crude Oil WTI Futures Historical Data.csv", 
    "NaturalGas_F": "Dataset v3/Commodities/Natural Gas Futures Historical Data.csv",
    "Corn_F": "Dataset v3/Commodities/US Corn Futures Historical Data.csv", 
#     "Soybeans_F": "Dataset v3/Commodities/US Soybeans Futures Historical Data.csv", 
    "Gold_F": "Dataset v3/Commodities/Gold Futures Historical Data.csv", 
    "Silver_F": "Dataset v3/Commodities/Silver Futures Historical Data.csv", 

#     "WTIOil": "Dataset v3/Commodities/WTI_USD Historical Data.csv", 
#     "XAGUSD": "Dataset v3/Commodities/XAG_USD Historical Data.csv", 
#     "XAUUSD": "Dataset v3/Commodities/XAU_USD Historical Data.csv", 

    "AUDUSD": "Dataset v3/Forex/USD/AUD_USD Historical Data.csv", 
    "EURUSD": "Dataset v3/Forex/USD/EUR_USD Historical Data.csv", 
    "GBPUSD": "Dataset v3/Forex/USD/GBP_USD Historical Data.csv", 
    "NZDUSD": "Dataset v3/Forex/USD/NZD_USD Historical Data.csv", 
    "USDCAD": "Dataset v3/Forex/USD/USD_CAD Historical Data.csv", 
    "USDCHF": "Dataset v3/Forex/USD/USD_CHF Historical Data.csv", 
    "USDHKD": "Dataset v3/Forex/USD/USD_HKD Historical Data.csv", 
    "USDJPY": "Dataset v3/Forex/USD/USD_JPY Historical Data.csv", 
#     "USDKRW": "Dataset v3/Forex/USD/USD_KRW Historical Data.csv", 
    
#     "TBill1M": "Dataset v3/1-Month T-Bill Rate Historical Data.csv", 
#     "TBill3M": "Dataset v3/3-Month T-Bill Rate Historical Data.csv", 
#     "TBill6M": "Dataset v3/6-Month T-Bill Rate Historical Data.csv", 
#     "Treasury1Y": "Dataset v3/1-Year Treasury Constant Maturity Rate Historical Data.csv", 
#     "Treasury5Y": "Dataset v3/5-Year Treasury Constant Maturity Rate Historical Data.csv", 
#     "Treasury10Y": "Dataset v3/10-Year Treasury Constant Maturity Rate Historical Data.csv", 
}

In [5]:
def retrieve_full_data(df, filename, varname):
    df_new = pd.read_csv(filename)
    df_new["Date"] = pd.to_datetime(df_new["Date"])
    try:
        df_new.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]
    except ValueError:
        try:
            df_new.columns = ["Date", "Close", "Open", "High", "Low", "Change%"]
        except ValueError:
            df_new.columns = ["Date", "Close", "Open", "High", "Low"]

    rows = []
    for index, row in df_new.iterrows():
        date = row['Date']
        p_close = float(str(row['Close']).replace(',', ''))
        p_open = float(str(row['Open']).replace(',', ''))
        p_high = float(str(row['High']).replace(',', ''))
        p_low = float(str(row['Low']).replace(',', ''))
        if df_new.columns.tolist() == ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]:
            if str(row['Vol.']) == "-":
                volume = 0
            else:
                if row['Vol.'][-1] == "B": # volume in billions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000000
                elif row['Vol.'][-1] == "M": # volume in millions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000
                elif row['Vol.'][-1] == "K": # volume in thousands
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000
                else:
                    print(varname, row['Date'], row['Vol.'])
            rows.append([date, p_close, p_open, p_high, p_low, volume])
        else:
            rows.append([date, p_close, p_open, p_high, p_low])
    df_new = pd.DataFrame(rows)
    if len(rows[0]) == 6:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low", varname + "_Volume"]
    else:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low"]
    
    df_new = df_new.sort_values(by='Date').reset_index(drop=True)
    
    # add relative difference between open and close to data per day
    df_new = calculate_daily_relative_difference(df_new,  varname + "_Open",  varname + "_Close", varname)
    # add the average relative difference between open and close to data per day
    df_new = calculate_average_relative_difference(df_new,  varname + "_Open",  varname + "_Close", [5, 10, 20, 50], varname)

    if not varname in ["TBill1M", "TBill3M", "TBill6M", "Treasury1Y", "Treasury5Y", "Treasury10Y"]:
        # add exponential moving average of closes to data
        df_new = calculate_exponential_moving_average(df_new, varname + "_relative_change_perc_1", [5, 10, 20, 50], varname)
        # add moving average of closes to data
        df_new = calculate_moving_average(df_new, varname + "_relative_change_perc_1", [5, 10, 20, 50], varname)
        # add x week high to data
        df_new = calculate_weeks_high(df_new, varname + "_High", [1, 10, 52], varname)
        # add x week low to data
        df_new = calculate_weeks_low(df_new, varname + "_Low", [1, 10, 52], varname)
        # add average true range to data
        df_new = calculate_average_true_range(df_new, varname + "_Open", varname + "_High", varname + "_Low", varname + "_Close", [5, 10, 20, 50], varname)
        # add RSI to data -- 14 by definition
        df_new = calculate_relative_strength_index(df_new, varname + "_Open", varname + "_Close", [14, 28], varname)
        # add stochastic K% to data
        df_new = calculate_stochastic_k(df_new, varname + "_High", varname + "_Low", varname + "_Close", [5, 10, 20, 50], varname)
        # add stochastic D% to data (K% required)
        df_new = calculate_stochastic_d(df_new, varname + "_High", varname + "_Low", varname + "_Close", [5, 10, 20, 50], varname)
        # add momentum to data
        df_new = calculate_momentum(df_new, varname + "_Close", [4, 8, 16], varname)
        # add williams R% to data
        df_new = calculate_williams_r(df_new, varname + "_High", varname + "_Low", varname + "_Close", [5, 10, 20, 50], varname)
        # add A/D oscillator to data
        df_new = calculate_ad_oscillator(df_new, varname + "_High", varname + "_Low", varname + "_Close", varname)
        # add disparity to data
        df_new = calculate_disparity(df_new, varname + "_Close", [5, 10, 20, 50], varname)
        # add bollinger bands to data
        df_new = calculate_bollinger_bands(df_new, varname + "_Close", [5, 10, 20, 50], varname)
        # add moving average convergence divergence to data
        df_new = calculate_moving_average_convergence_divergence(df_new, varname + "_Close", [[12, 26]], varname)
        # add on-balance volume to data
        df_new = calculate_on_balance_volume(df_new, varname + "_Open", varname + "_Close", varname + "_Volume", varname)
        # add standard deviation in on-balance volume to data
        df_new = calculate_stdev_on_balance_volume(df_new, varname + "_Open", varname + "_Close", varname + "_Volume", [5, 10, 20, 50], varname)
    
    df_new = df_new[(df_new['Date'] >= datetime(2009, 7, 1)) & (df_new['Date'] <= datetime(2019, 12, 31))]
    df_new = df_new.drop(varname + "_Open", 1)
    df_new = df_new.drop(varname + "_High", 1)
    df_new = df_new.drop(varname + "_Low", 1)
    df_new = df_new.drop(varname + "_Close", 1)
    
    if df.empty:
        df = df_new
    else:
        df = pd.merge(df, df_new, on='Date', how='outer')
    return df

def retrieve_data(df, filename, varname):
    df_new = pd.read_csv(filename)
    df_new["Date"] = pd.to_datetime(df_new["Date"])
    try:
        df_new.columns = ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]
    except ValueError:
        try:
            df_new.columns = ["Date", "Close", "Open", "High", "Low", "Change%"]
        except ValueError:
            df_new.columns = ["Date", "Close", "Open", "High", "Low"]

    rows = []
    for index, row in df_new.iterrows():
        date = row['Date']
        p_close = float(str(row['Close']).replace(',', ''))
        p_open = float(str(row['Open']).replace(',', ''))
        p_high = float(str(row['High']).replace(',', ''))
        p_low = float(str(row['Low']).replace(',', ''))
        if df_new.columns.tolist() == ["Date", "Close", "Open", "High", "Low", "Vol.", "Change%"]:
            if str(row['Vol.']) == "-":
                volume = 0
            else:
                if row['Vol.'][-1] == "B": # volume in billions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000000
                elif row['Vol.'][-1] == "M": # volume in millions
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000000
                elif row['Vol.'][-1] == "K": # volume in thousands
                    volume = float(row['Vol.'][:len(row['Vol.']) - 1]) * 1000
                else:
                    print(varname, row['Date'], row['Vol.'])
            rows.append([date, p_close, p_open, p_high, p_low, volume])
        else:
            rows.append([date, p_close, p_open, p_high, p_low])
    df_new = pd.DataFrame(rows)
    if len(rows[0]) == 6:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low", varname + "_Volume"]
    else:
        df_new.columns = ["Date", varname + "_Close", varname + "_Open", varname + "_High", varname + "_Low"]
    
    df_new = df_new.sort_values(by='Date').reset_index(drop=True)
    
    # add relative difference between open and close to data per day
    df_new = calculate_daily_relative_difference(df_new,  varname + "_Open",  varname + "_Close", varname)
    # add the average relative difference between open and close to data per day
    df_new = calculate_average_relative_difference(df_new,  varname + "_Open",  varname + "_Close", [5, 10, 20, 50], varname)
    
    df_new = df_new.drop(varname + "_Open", 1)
    df_new = df_new.drop(varname + "_High", 1)
    df_new = df_new.drop(varname + "_Low", 1)
    df_new = df_new.drop(varname + "_Close", 1)
    
    if df.empty:
        df = df_new
    else:
        df = pd.merge(df, df_new, on='Date', how='outer')
    return df

def create_US_dataset(focus_file, files, holidays):
    print(f"Creating {focus_file} dataset")
    print(f"Retrieving {len(files)} files")
    i = 1
    df = pd.DataFrame()
    for file in files:
        print(f"Retrieving {file} - {i}/{len(files)}")
        if file == focus_file:
            df = retrieve_full_data(df, files[file], file)
        else:
            df = retrieve_data(df, files[file], file)
        i += 1
    df = df.sort_values(by='Date').reset_index(drop=True)
    df = df[(df['Date'] >= datetime(2009, 7, 1)) & (df['Date'] <= datetime(2019, 12, 31))]
    display(df)
    return df


df_nasdaq = create_US_dataset("NASDAQ", files, US_holidays)

Creating NASDAQ dataset
Retrieving 38 files
Retrieving SP500 - 1/38
Retrieving US30 - 2/38
Retrieving US2000 - 3/38
Retrieving NASDAQ - 4/38
Retrieving GER30 - 5/38
Retrieving CAC40 - 6/38
Retrieving UK100 - 7/38
Retrieving SSE50 - 8/38
Retrieving HS50 - 9/38
Retrieving NIKKEI225 - 10/38
Retrieving SP500_F - 11/38
Retrieving US30_F - 12/38
Retrieving US2000_F - 13/38
Retrieving NASDAQ_F - 14/38
Retrieving GER30_F - 15/38
Retrieving CAC40_F - 16/38
Retrieving UK100_F - 17/38
Retrieving SSE50_F - 18/38
Retrieving HS50_F - 19/38
Retrieving NIKKEI225_F - 20/38
Retrieving AAPL - 21/38
Retrieving AMZN - 22/38
Retrieving MSFT - 23/38
Retrieving BrentOil_F - 24/38
Retrieving Copper_F - 25/38
Retrieving WTIOil_F - 26/38
Retrieving NaturalGas_F - 27/38
Retrieving Corn_F - 28/38
Retrieving Gold_F - 29/38
Retrieving Silver_F - 30/38
Retrieving AUDUSD - 31/38
Retrieving EURUSD - 32/38
Retrieving GBPUSD - 33/38
Retrieving NZDUSD - 34/38
Retrieving USDCAD - 35/38
Retrieving USDCHF - 36/38
Retrieving 

Unnamed: 0,Date,SP500_Volume,SP500_relative_change_perc_1,SP500_relative_change_perc_5,SP500_relative_change_perc_10,SP500_relative_change_perc_20,SP500_relative_change_perc_50,US30_Volume,US30_relative_change_perc_1,US30_relative_change_perc_5,...,USDHKD_relative_change_perc_1,USDHKD_relative_change_perc_5,USDHKD_relative_change_perc_10,USDHKD_relative_change_perc_20,USDHKD_relative_change_perc_50,USDJPY_relative_change_perc_1,USDJPY_relative_change_perc_5,USDJPY_relative_change_perc_10,USDJPY_relative_change_perc_20,USDJPY_relative_change_perc_50
1173,2009-07-01,0.0,0.002726,0.005099,0.001479,-0.000382,0.001847,184600000.0,0.006692,0.004993,...,0.000090,0.000085,0.000081,0.000087,0.000088,0.002910,0.001785,0.000867,0.000530,-0.000260
1174,2009-07-02,0.0,-0.026942,-0.004917,-0.002039,-0.002264,0.001396,157800000.0,-0.026139,-0.004408,...,0.000077,0.000083,0.000081,0.000090,0.000088,-0.007252,0.000209,-0.000608,-0.000166,-0.000370
1175,2009-07-03,,,,,,,,,,...,0.000000,0.000062,0.000079,0.000080,0.000087,0.002087,0.001817,0.000014,-0.001221,-0.000077
1176,2009-07-06,0.0,0.004976,-0.003935,-0.001680,-0.001720,0.001323,206900000.0,0.005504,-0.002595,...,0.000077,0.000065,0.000071,0.000077,0.000088,-0.006676,-0.001599,0.000023,-0.001458,-0.000130
1177,2009-07-07,0.0,-0.019553,-0.009448,-0.000902,-0.002752,0.000643,210880000.0,-0.019381,-0.008586,...,0.000077,0.000065,0.000075,0.000079,0.000090,-0.006301,-0.003047,-0.000502,-0.001230,-0.000291
1178,2009-07-08,0.0,-0.002653,-0.008289,-0.001351,-0.002995,0.000713,325250000.0,0.002622,-0.006140,...,0.000116,0.000070,0.000077,0.000079,0.000090,-0.020082,-0.007645,-0.002930,-0.002645,-0.000911
1179,2009-07-09,0.0,0.001589,-0.008517,-0.001709,-0.002726,0.000729,192660000.0,0.000509,-0.007377,...,0.000026,0.000059,0.000071,0.000071,0.000089,0.004637,-0.005267,-0.002529,-0.002067,-0.001086
1180,2009-07-10,0.0,-0.001023,-0.003333,-0.004125,-0.003088,0.000317,174260000.0,-0.004396,-0.003028,...,0.000026,0.000065,0.000063,0.000075,0.000088,-0.006655,-0.007016,-0.002599,-0.002846,-0.001361
1181,2009-07-13,0.0,0.024421,0.000556,-0.001689,-0.002014,0.000891,253520000.0,0.022691,0.000409,...,0.000052,0.000059,0.000062,0.000070,0.000087,0.006814,-0.004318,-0.002958,-0.002088,-0.001112
1182,2009-07-14,0.0,0.005629,0.005592,-0.001928,-0.000739,0.000894,189170000.0,0.003375,0.004960,...,0.000026,0.000049,0.000057,0.000070,0.000086,0.005800,-0.001897,-0.002472,-0.000969,-0.000974


In [6]:
def remove_holidays(df, holidays):
    df_new = pd.DataFrame()
    for row in df.iterrows():
        if row[1]['Date'].weekday() == 5 or row[1]['Date'].weekday() == 6 or row[1]['Date'] in holidays:
            pass
        else:
            df_new = df_new.append(row[1])
    return df_new

df_nasdaq = remove_holidays(df_nasdaq, US_holidays)
df_nasdaq

Unnamed: 0,AAPL_Volume,AAPL_relative_change_perc_1,AAPL_relative_change_perc_10,AAPL_relative_change_perc_20,AAPL_relative_change_perc_5,AAPL_relative_change_perc_50,AMZN_Volume,AMZN_relative_change_perc_1,AMZN_relative_change_perc_10,AMZN_relative_change_perc_20,...,USDJPY_relative_change_perc_10,USDJPY_relative_change_perc_20,USDJPY_relative_change_perc_5,USDJPY_relative_change_perc_50,WTIOil_F_Volume,WTIOil_F_relative_change_perc_1,WTIOil_F_relative_change_perc_10,WTIOil_F_relative_change_perc_20,WTIOil_F_relative_change_perc_5,WTIOil_F_relative_change_perc_50
1173,414270000.0,-0.003906,0.000911,-0.000532,0.007047,0.001238,6980000.0,-0.033404,-0.002406,-0.002648,...,0.000867,0.000530,0.001785,-0.000260,304640.0,-0.016182,-0.002302,0.002222,0.000783,0.008412
1174,370620000.0,-0.007937,0.000323,-0.002229,-0.000314,0.001261,7220000.0,-0.022069,-0.003140,-0.003810,...,-0.000608,-0.000166,0.000209,-0.000370,243250.0,-0.035972,-0.006676,-0.001635,-0.012093,0.007622
1176,498690000.0,0.000000,-0.000691,-0.002036,-0.004322,0.001438,7370000.0,-0.004461,-0.004572,-0.004769,...,0.000023,-0.001458,-0.001599,-0.000130,256990.0,-0.036697,-0.007823,-0.003259,-0.016077,0.006422
1177,461780000.0,-0.022222,-0.000722,-0.003148,-0.006813,0.001129,6490000.0,-0.036929,-0.004309,-0.006391,...,-0.000502,-0.001230,-0.003047,-0.000291,262360.0,-0.019629,-0.005551,-0.004072,-0.026473,0.005264
1178,575930000.0,0.010309,0.001952,-0.002243,-0.004751,0.001062,8540000.0,0.011771,-0.001498,-0.005888,...,-0.002930,-0.002645,-0.007645,-0.000911,334960.0,-0.034826,-0.012146,-0.006774,-0.028661,0.005077
1179,343150000.0,-0.010163,0.000522,-0.002062,-0.006002,0.000768,6360000.0,-0.001917,-0.003318,-0.005419,...,-0.002529,-0.002067,-0.005267,-0.001086,314800.0,0.000497,-0.012271,-0.007331,-0.025326,0.005159
1180,445270000.0,0.016427,-0.000722,-0.001442,-0.001130,0.001051,5980000.0,0.001419,-0.007095,-0.004995,...,-0.002599,-0.002846,-0.007016,-0.001361,264750.0,-0.009264,-0.016039,-0.008647,-0.019984,0.004275
1181,483770000.0,0.020080,-0.000718,0.000268,0.002886,0.001542,8050000.0,0.043551,-0.005158,-0.002258,...,-0.002958,-0.002088,-0.004318,-0.001112,301910.0,-0.002840,-0.014645,-0.008520,-0.013212,0.004088
1182,347500000.0,0.001972,0.000456,0.000367,0.007725,0.001358,4550000.0,0.005768,-0.003579,-0.002181,...,-0.002472,-0.000969,-0.001897,-0.000974,305950.0,-0.010309,-0.018911,-0.007942,-0.011348,0.002999
1183,485580000.0,0.013514,0.001808,0.001145,0.008366,0.001017,6340000.0,0.018675,-0.001760,-0.000322,...,-0.002176,-0.000654,0.003294,-0.000838,279730.0,0.029269,-0.013595,-0.006592,0.001471,0.002881


In [7]:
df_nan = df_nasdaq.isna().sum()
for index, value in df_nan.items():
    print(f"Index : {index}\t Value : {value}")

Index : AAPL_Volume	 Value : 0
Index : AAPL_relative_change_perc_1	 Value : 0
Index : AAPL_relative_change_perc_10	 Value : 0
Index : AAPL_relative_change_perc_20	 Value : 0
Index : AAPL_relative_change_perc_5	 Value : 0
Index : AAPL_relative_change_perc_50	 Value : 0
Index : AMZN_Volume	 Value : 0
Index : AMZN_relative_change_perc_1	 Value : 0
Index : AMZN_relative_change_perc_10	 Value : 0
Index : AMZN_relative_change_perc_20	 Value : 0
Index : AMZN_relative_change_perc_5	 Value : 0
Index : AMZN_relative_change_perc_50	 Value : 0
Index : AUDUSD_relative_change_perc_1	 Value : 0
Index : AUDUSD_relative_change_perc_10	 Value : 0
Index : AUDUSD_relative_change_perc_20	 Value : 0
Index : AUDUSD_relative_change_perc_5	 Value : 0
Index : AUDUSD_relative_change_perc_50	 Value : 0
Index : BrentOil_F_Volume	 Value : 0
Index : BrentOil_F_relative_change_perc_1	 Value : 0
Index : BrentOil_F_relative_change_perc_10	 Value : 0
Index : BrentOil_F_relative_change_perc_20	 Value : 0
Index : BrentOil

In [8]:
# display(df)

df = df_nasdaq.copy()

df = df.set_index(['Date'])
# display(df)
df = df.interpolate(method='time')
df = df.fillna(method='bfill')
# display(df)
df_nan = df.isna().sum()
for index, value in df_nan.items():
    if value > 0:
        print(f"Index : {index}\t Value : {value}")
        
df_nasdaq = df.copy()

In [9]:
df_nasdaq.to_csv("Dataset v3/nasdaq_combined_data_20220422.csv")