In [1]:
import os
import zipfile
import pandas as pd

In [None]:
# Unzip every archive found directly inside `root`, then delete the archive.
root = "LOB_Data/"
for file in os.listdir(root):
    if not file.endswith(".zip"):
        continue
    archive_path = os.path.join(root, file)
    # The context manager closes the archive even when extraction raises, and
    # the zip is only removed after extractall() has completed successfully.
    with zipfile.ZipFile(archive_path) as zip_ref:
        zip_ref.extractall(root)
    os.remove(archive_path)

In [2]:
# transform all files to LOBSTER format
def transform_to_lobster(target, filename, new_root, market_open=34200, market_close=57600):
    """Convert one raw order-message text file into a LOBSTER-style CSV for ``target``.

    Every input line is split on whitespace. The last character of the first
    token selects the message type; the remainder of that token is the
    timestamp, which is divided by 1000 before it is stored (presumably
    milliseconds -> seconds after midnight; confirm against the data spec).

    ORDER_TYPE codes written to the output:
        1  'A' add order          (third token must end with ``target``)
        2  'X' cancel             (only for order ids added earlier in the file)
        3  'D' delete             (only for order ids added earlier in the file)
        4  'E' execute            (only for order ids added earlier in the file)
        5  'P' hidden execution   (third token must end with ``target``)

    Lines with any other type, with fewer than two tokens, or with too few
    tokens for their message type are skipped.

    Parameters
    ----------
    target : str
        Symbol suffix that the third token of 'A'/'P' lines must end with.
    filename : str
        Path of the input text file. Characters ``[-13:-7]`` of this path are
        used as the tag in the output file name.
    new_root : str
        Prefix (directory path including trailing separator) for the output.
    market_open, market_close : float, optional
        Inclusive bounds applied to TIMESTAMP; rows outside are dropped.
        Defaults 34200 and 57600 (presumably 09:30 and 16:00 in seconds).

    Returns
    -------
    None
        Writes ``new_root + target + "_" + filename[-13:-7] + ".csv"`` without
        header or index, and prints a completion message.
    """
    columns = ["TIMESTAMP", "ORDER_TYPE", "ORDER_ID", "SIZE", "PRICE", "BUY_SELL_FLAG"]
    type_codes = {"A": 1, "X": 2, "D": 3, "E": 4, "P": 5}
    suffix_len = len(target)

    # Rows are collected in a plain list and turned into a DataFrame once;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = []
    # order_id -> (price, buy/sell flag, size) captured when the order was added.
    # NOTE(review): entries are never updated or removed, so a delete row
    # reports the size at add time, not what remains after partial
    # cancels/executions. Confirm this is the intended output.
    id_to_info = {}

    with open(filename) as txt_file:
        for line in txt_file:
            cur = line.split()
            if len(cur) < 2:
                continue
            msg_type = cur[0][-1]
            code = type_codes.get(msg_type)
            if code is None:
                continue

            if msg_type in ("A", "P"):
                # These messages carry their own id/side, size+symbol and price.
                if len(cur) < 4 or cur[2][-suffix_len:] != target:
                    continue
                order_id = cur[1][:-1]
                # The price token may carry one trailing non-numeric character.
                price = int(cur[3]) if cur[3][-1].isnumeric() else int(cur[3][:-1])
                flag = 1 if cur[1][-1] == "B" else -1
                size = int(cur[2][:-suffix_len])
                if msg_type == "A":
                    # Only visible adds are remembered for later X/D/E lookups.
                    id_to_info[order_id] = (price, flag, size)
            else:
                # X / D / E refer to an order that must already be known.
                order_id = cur[1]
                if order_id not in id_to_info:
                    continue
                price, flag, added_size = id_to_info[order_id]
                if msg_type == "D":
                    size = added_size
                elif len(cur) < 3:
                    # Cancel/execute without a size token: malformed, skip.
                    continue
                else:
                    size = int(cur[2])

            rows.append((float(cur[0][:-1]) / 1000, code, order_id, size, price, flag))

    data = pd.DataFrame(rows, columns=columns)
    # Series.between is inclusive on both ends by default in every pandas
    # version; the boolean form inclusive=True is rejected by pandas 2.x.
    data = data[data["TIMESTAMP"].between(market_open, market_close)]
    new_filename = new_root + target + "_" + filename[-13:-7] + ".csv"
    data.to_csv(new_filename, header=False, index=False)
    print(filename + " to " + new_filename + " done!")

# Convert every raw .txt file under `root` into a per-day CSV for `target`.
target = "IBM"
root = "LOB_Data/"
new_root = "NEW_LOBSTER_" + target + "/"
# exist_ok keeps this cell re-runnable: os.mkdir raises FileExistsError on the
# second run once the output directory is already there.
os.makedirs(new_root, exist_ok=True)
# sorted() gives a deterministic processing order (os.listdir order is
# arbitrary); the suffix filter skips leftover archives, hidden files and
# sub-directories that the parser cannot open as text input.
for file in sorted(os.listdir(root)):
    if file.endswith(".txt"):
        transform_to_lobster(target, root + file, new_root)

LOB_Data/S011303-v2.txt to NEW_LOBSTER_IBM/IBM_011303.csv done!
LOB_Data/S011403-v2.txt to NEW_LOBSTER_IBM/IBM_011403.csv done!
LOB_Data/S011503-v2.txt to NEW_LOBSTER_IBM/IBM_011503.csv done!
LOB_Data/S011603-v2.txt to NEW_LOBSTER_IBM/IBM_011603.csv done!
LOB_Data/S011703-v2.txt to NEW_LOBSTER_IBM/IBM_011703.csv done!
LOB_Data/S012003-v2.txt to NEW_LOBSTER_IBM/IBM_012003.csv done!
LOB_Data/S012103-v2.txt to NEW_LOBSTER_IBM/IBM_012103.csv done!
LOB_Data/S012203-v2.txt to NEW_LOBSTER_IBM/IBM_012203.csv done!
LOB_Data/S012303-v2.txt to NEW_LOBSTER_IBM/IBM_012303.csv done!
LOB_Data/S012403-v2.txt to NEW_LOBSTER_IBM/IBM_012403.csv done!
LOB_Data/S012703-v2.txt to NEW_LOBSTER_IBM/IBM_012703.csv done!
LOB_Data/S012803-v2.txt to NEW_LOBSTER_IBM/IBM_012803.csv done!
LOB_Data/S012903-v2.txt to NEW_LOBSTER_IBM/IBM_012903.csv done!
LOB_Data/S013003-v2.txt to NEW_LOBSTER_IBM/IBM_013003.csv done!
LOB_Data/S013103-v2.txt to NEW_LOBSTER_IBM/IBM_013103.csv done!
LOB_Data/S020303-v2.txt to NEW_LOBSTER_I