In [126]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
from datetime import timedelta

In [117]:
# get data in desirved format
dow_files_addr = "./Data/Dow_30_1_min/"
etf_files_addr = "./Data/50_ETFs_1min/"

dow_files_names = [f for f in listdir(dow_files_addr)]
etf_files_names = [f for f in listdir(etf_files_addr)]

dow_names = [f.split(".")[0] for f in dow_files_names]
etf_names = [f.split(".")[0] for f in etf_files_names]

dow_dfs = []
etf_dfs = []
headers = ["date", "time", "open_price", "close_price", "highest_price", "lowest_price", "volumn"]
dtypes={"date": "str", 
    "time":"str",
    "open_price": "float", 
    "highest_price": "float",
    "lowest_price": "float", 
    "close_price": "float",
    "volumn": "int"}

print("Start read dow file...")
for dow_name in dow_files_names:
    df = pd.read_csv(dow_files_addr + dow_name, sep=",",names=headers,dtype=dtypes, header=None)
    df["datetime"] = df["date"] + " " + df["time"]
    df["datetime"] = pd.to_datetime(df["datetime"], format="%m/%d/%Y %H:%M")
    df["hour"] = df["time"].str.split(":", expand=True)[0].astype(int)
    df["minute"] = df["time"].str.split(":", expand=True)[1].astype(int)
    df = df.sort_values(by=["datetime"])
    df.drop(columns=["date"])
    df.drop(columns=["time"])
    dow_dfs.append(df)
print("Finish read dow file.")
print("---------------------")
print("Start read etf file...")
for etf_name in etf_files_names:
    df = pd.read_csv(etf_files_addr + etf_name, sep=",",names=headers,dtype=dtypes, header=None)
    df["datetime"] = df["date"] + " " + df["time"]
    df["datetime"] = pd.to_datetime(df["datetime"], format="%m/%d/%Y %H:%M")
    df["hour"] = df["time"].str.split(":", expand=True)[0].astype(int)
    df["minute"] = df["time"].str.split(":", expand=True)[1].astype(int)
    df = df.sort_values(by=["datetime"])
    df.drop(columns=["date"])
    df.drop(columns=["time"])
    etf_dfs.append(df)
print("Finish read etf file.")

Start read dow file...
Finish read dow file.
---------------------
Start read etf file...
Finish read etf file.


In [118]:
# get start and end for both dow and etf that cove evey df

dow_time_slice = [min(dow_dfs[0]["datetime"]), max(dow_dfs[0]["datetime"])]
etf_time_slice = [min(etf_dfs[0]["datetime"]), max(etf_dfs[0]["datetime"])]

print("Start check dow date...")
i = 0
for df in dow_dfs:
    start = min(df["datetime"])
    if start > dow_time_slice[0]:
        print("start", dow_files_names[i])
        print(start)
        dow_time_slice[0] = start
    end = max(df["datetime"])
    if end < dow_time_slice[1]:
        print("end", dow_files_names[i])
        print(end)
        dow_time_slice[1] = end
    i += 1
print("Finish check dow date...")
print("---------------------")
print("Start check etf date...")
i = 0
for df in etf_dfs:
    start = min(df["datetime"])
    if start > etf_time_slice[0]:
        print("start", dow_files_names[i])
        print(start)
        etf_time_slice[0] = start
    end = max(df["datetime"])
    if end < etf_time_slice[1]:
        print("end", dow_files_names[i])
        print(end)
        etf_time_slice[1] = end
    i += 1
print("Finish check etf date...")

Start check dow date...
start GM.txt
2010-11-18 09:36:00
end GM.txt
2020-06-05 19:44:00
Finish check dow date...
---------------------
Start check etf date...
start AAPL.txt
2003-04-14 09:32:00
end AAPL.txt
2020-06-05 19:41:00
end AIG.txt
2020-06-05 17:38:00
start AXP.txt
2008-11-19 09:30:00
start T.txt
2009-06-25 09:40:00
Finish check etf date...


In [119]:
# limit all of the df to the same time frames
for i in range(len(dow_dfs)):
    dow_dfs[i] = dow_dfs[i].loc[(dow_dfs[i]["datetime"] >= dow_time_slice[0]) & (dow_dfs[i]["datetime"] <= dow_time_slice[1])]
for i in range(len(etf_dfs)):
    etf_dfs[i] = etf_dfs[i].loc[(etf_dfs[i]["datetime"] >= etf_time_slice[0]) & (etf_dfs[i]["datetime"] <= etf_time_slice[1])]

In [62]:
# fill in the missing minutes
for i in range(len(dow_dfs)):
    cur_open_price = dow_dfs[i].iloc[0]["open_price"]
    cur_close_price = dow_dfs[i].iloc[0]["close_price"]
    cur_highest_price = dow_dfs[i].iloc[0]["highest_price"]  
    cur_lowest_price = dow_dfs[i].iloc[0]["lowest_price"]
    # cur_volumn = dow_dfs[i].iloc[0]["volumn"]
    
    start_time = dow_time_slice[0]
    end_time = dow_time_slice[1]
    cur_time = start_time
    
    df = pd.DataFrame(columns=["open_price", "close_price", "highest_price", "lowest_price", "volumn", "datetime", "hour", "minute"])
    while cur_time < end_time:
        # if exist
        if cur_time in pd.DatetimeIndex(dow_dfs[i]["datetime"]):
            i = pd.DatetimeIndex(dow_dfs[i]["datetime"]).to_numpy().tolist().index(cur_time.to_numpy().tolist())
            df = df.append(df.iloc[i], ignore_index=True)
            cur_open_price = dow_dfs[i].iloc[i]["open_price"]
            cur_close_price = dow_dfs[i].iloc[i]["close_price"]    
            cur_lowest_price = dow_dfs[i].iloc[i]["lowest_price"]
            # cur_volumn = dow_dfs[i].iloc[i]["volumn"]
        # if not exist
        else:
            pd.DataFrame(np.array([[cur_open_price, cur_close_price, cur_highest_price, cur_lowest_price, 0, cur_time, 
                                    dow_time_slice[0].to_pydatetime().hour, dow_time_slice[0].to_pydatetime().minute]]))
        # update time
        cur_time += timedelta(minutes=1)
        if cur_time.to_pydatetime().hour >= 6 and cur_time.to_pydatetime().minute != 0:
            cur_time += timedelta(days=1)
            cur_time.replace(hour=9, minute=0)

In [122]:
type(dow_time_slice[0])

pandas._libs.tslibs.timestamps.Timestamp

In [123]:
t = dow_time_slice[0]

In [133]:
t

Timestamp('2010-11-19 13:36:00')

In [132]:
t += timedelta(hours=28)

In [131]:
t.replace(hour=5)

Timestamp('2010-11-18 05:36:00')

In [140]:
etf_dfs[1]["hour"].unique()

array([ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  6,  7,  8,  4,  5, 20,
        3, 23, 21, 22,  0,  1])

In [116]:
dow_time_slice[0].to_pydatetime().year

2010

In [109]:
pd.DatetimeIndex(df["datetime"]).to_numpy().tolist().index(dow_time_slice[0].to_numpy().tolist())

0

In [96]:
df.iloc[0]["datetime"]

Timestamp('2010-11-18 09:36:00')

In [93]:
df = dow_dfs[0]

In [98]:
tmp = pd.DataFrame(columns=["date", "time", "open_price", "close_price", "highest_price", "lowest_price", "volumn", "datetime", "hour", "minute"])

In [100]:
tmp

Unnamed: 0,date,time,open_price,close_price,highest_price,lowest_price,volumn,datetime,hour,minute


In [106]:
tmp = tmp.append(df.iloc[0], ignore_index=True)

In [107]:
tmp

Unnamed: 0,date,time,open_price,close_price,highest_price,lowest_price,volumn,datetime,hour,minute
0,11/18/2010,09:36,29.85,29.87,29.84,29.85,169113,2010-11-18 09:36:00,9,36


In [80]:
dow_time_slice[0] in pd.DatetimeIndex(df["datetime"])

True

In [83]:
dow_time_slice[0].to_numpy()

numpy.datetime64('2010-11-18T09:36:00.000000000')

In [86]:
pd.DatetimeIndex(df["datetime"]).to_numpy().tolist().index(dow_time_slice[1].to_numpy().tolist())

1052569

In [70]:
pd.DatetimeIndex(dow_time_slice[0])

TypeError: DatetimeIndex() must be called with a collection of some kind, Timestamp('2010-11-18 09:36:00') was passed

In [5]:
dow_time_slice

[Timestamp('2010-11-18 09:36:00'), Timestamp('2020-06-05 19:44:00')]

In [8]:
etf_time_slice

[Timestamp('2009-06-25 09:40:00'), Timestamp('2020-06-05 17:38:00')]

In [64]:
[Timestamp('2010-11-18 09:36:00'), Timestamp('2020-06-05 19:44:00')]
[Timestamp('2009-06-25 09:40:00'), Timestamp('2020-06-05 17:38:00')]

NameError: name 'Timestamp' is not defined