In [2]:
import datetime
import time
import numpy as np
import pandas as pd
import logging
import glob
import os
%matplotlib inline

In [3]:
# One format shared by the console output (root logger) and the log file.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
LOG_FILE = 'barchart_data_cleaner.log'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger("barchart")
formatter = logging.Formatter(LOG_FORMAT)

# Loggers are process-wide singletons, so re-running this cell would otherwise
# attach one more FileHandler each time and duplicate every line in the file.
# Reuse the handler that already points at LOG_FILE when there is one.
# (FileHandler.baseFilename is the absolute path of the file it writes to.)
_log_file_path = os.path.abspath(LOG_FILE)
_existing_file_handlers = [
    h for h in logger.handlers
    if isinstance(h, logging.FileHandler) and h.baseFilename == _log_file_path
]
if _existing_file_handlers:
    handler = _existing_file_handlers[0]
else:
    handler = logging.FileHandler(LOG_FILE)
    logger.addHandler(handler)
handler.setFormatter(formatter)

In [4]:
def timestamp_barchart_to_est(barchart_timestamp_str):
    """
    Convert a barchart historical timestamp string to a naive EST datetime.

    The barchart format is "2008-05-05T13:42:01-04:00", i.e.
    year-month-dayThour:minute:second followed by the UTC offset of that
    wall-clock time as a sign, two-digit hours, ":" and two-digit minutes.

    The wall-clock time is first moved to UTC (local - offset) and then to
    fixed EST (UTC-05:00, no daylight saving), so
    "2008-05-05T13:42:01-04:00" -> 2008-05-05 12:42:01 and any "-05:00"
    timestamp is returned unchanged.

    :param barchart_timestamp_str: timestamp string in the format above.
    :return: datetime.datetime without tzinfo, expressed in EST.
    :raises ValueError: if the string does not match the expected format.
    """
    # The offset always occupies the last six characters: "+HH:MM" / "-HH:MM".
    # Slicing from the end avoids confusing the offset sign with the dashes
    # inside the date part.
    wall_clock_str = barchart_timestamp_str[:-6]
    offset_str = barchart_timestamp_str[-6:]
    if len(offset_str) != 6 or offset_str[0] not in "+-" or offset_str[3] != ":":
        raise ValueError(
            "unexpected barchart timestamp format: %r" % (barchart_timestamp_str,)
        )

    wall_clock_time = datetime.datetime.strptime(wall_clock_str, "%Y-%m-%dT%H:%M:%S")
    utc_offset = datetime.timedelta(hours=int(offset_str[1:3]),
                                    minutes=int(offset_str[4:6]))
    if offset_str[0] == "-":
        utc_offset = -utc_offset

    # local = UTC + offset  =>  UTC = local - offset;  EST = UTC - 5h.
    utc_time = wall_clock_time - utc_offset
    return utc_time - datetime.timedelta(hours=5)

In [44]:
def generate_barchart_clean_dataframe(raw_df):
    """
    Given a raw dataframe from barchart, generate a clean, gap-free
    one-minute dataframe.

    Steps, in order:
      1. Convert the "timestamp" strings with timestamp_barchart_to_est and
         use the result as the index.
      2. Drop the "tradingDay" column.
      3. Reindex onto every minute between the earliest and the latest
         timestamp.
      4. Minutes that had no row get volume 0; every other column is
         forward-filled from the previous row.

    :param raw_df: dataframe with at least the columns "timestamp",
        "tradingDay" and "volume". It is not modified.
    :return: new dataframe indexed by timestamp, "volume" stored as int32.
        An empty input yields an empty frame.
    """
    # pd.to_datetime guarantees a datetime64 column even when apply() has
    # nothing to infer the dtype from (empty input).
    est_timestamps = pd.to_datetime(raw_df["timestamp"].apply(timestamp_barchart_to_est))
    clean_df = (
        raw_df
        .drop("tradingDay", axis=1)
        .assign(timestamp=est_timestamps)
        .set_index("timestamp", drop=True)
    )

    if clean_df.empty:
        # Nothing to reindex; date_range cannot be built without endpoints.
        clean_df["volume"] = clean_df["volume"].astype(np.int32)
        return clean_df

    # min()/max() instead of first/last row so the range does not depend on
    # the rows arriving in chronological order.
    all_timestamps = pd.date_range(start=clean_df.index.min(),
                                   end=clean_df.index.max(),
                                   freq='1min')
    # NOTE(review): reindex raises on duplicate timestamps -- confirm the raw
    # files never repeat a minute.
    clean_df = clean_df.reindex(all_timestamps)

    # Volume must be filled before the generic forward-fill: an inserted
    # minute traded nothing, it did not repeat the previous volume.
    # Whole-column assignment (not .loc[:, ...]) so the int32 dtype replaces
    # the float column that reindex produced.
    clean_df["volume"] = clean_df["volume"].fillna(0).astype(np.int32)
    return clean_df.ffill()

In [6]:
# Folder that is searched for the raw input "*.csv" files.
# NOTE(review): absolute, machine-specific Windows path -- adjust before running elsewhere.
raw_data_folder = r"D:\data\barchart_1min_future_raw"

In [7]:
# Folder the cleaned CSV files are written to.
# NOTE(review): absolute, machine-specific Windows path -- adjust before running elsewhere.
clean_data_folder = r"D:\data\barchart_1min_future_clean"

In [8]:
# All CSV files directly inside raw_data_folder (no sub-folders), sorted so the
# processing order is the same on every run. os.path.join supplies the
# separator instead of a "\*" literal, which is an invalid escape sequence.
raw_data_path_list = sorted(glob.glob(os.path.join(raw_data_folder, "*.csv")))

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(clean_data_folder, exist_ok=True)

for raw_data_path in raw_data_path_list:
    logger.info("Cleaning %s", raw_data_path)
    raw_df = pd.read_csv(raw_data_path, dtype={"volume": np.int32})
    clean_df = generate_barchart_clean_dataframe(raw_df)
    # NOTE(review): the inputs are matched with "*.csv", so appending ".csv"
    # yields "<name>.csv.csv". Kept as-is so existing consumers of the clean
    # folder keep working -- confirm whether the doubled extension is wanted.
    clean_data_path = os.path.join(clean_data_folder,
                                   os.path.basename(raw_data_path) + ".csv")
    clean_df.to_csv(clean_data_path,
                    index_label="timestamp",
                    float_format="%.4f")
    logger.info("Wrote %d rows to %s", len(clean_df), clean_data_path)