In [57]:
import os 
import pandas as pd
import tarfile
from collections import Counter
import gzip
import shutil
from helpers import *
pd.set_option("display.max_colwidth", None)

## 1) Data Exploration

In [19]:
# Resolve every data location relative to the directory the notebook is launched from.
current_dir = os.getcwd()
src_path = os.path.join(current_dir, "src")
dataLoadingDirectory = os.path.join(current_dir, "data", "raw",
                                    "flash_crash_DJIA", "tar_files")
dataSavingDirectory  = os.path.join(current_dir, "data", "raw",
                                    "flash_crash_DJIA", "csv_files")

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(dataSavingDirectory, exist_ok=True)

#### Checking .tar file names
# List the folder once and keep only the .tar archives: stray entries (e.g. the
# .DS_Store file macOS drops into folders) must neither be counted as a stock
# nor be handed to the tar-reading code. Sorting makes the order reproducible.
tar_file_names = sorted(name for name in os.listdir(dataLoadingDirectory)
                        if name.endswith(".tar"))
tar_files = [os.path.join(dataLoadingDirectory, name) for name in tar_file_names]
print( f"We have data for {len(tar_file_names)} stocks")
print(tar_file_names)

#### Checking the different types of files in the .tar files
file_types_counter = Counter()
for tar_file in tar_files:
    # list_file_types_in_tar comes from helpers; its result is fed to
    # Counter.update, so it is presumably an iterable of file extensions -- TODO confirm.
    file_types = list_file_types_in_tar(tar_file)
    file_types_counter.update(file_types)
print(f"The Different types of files given are : {file_types_counter}")

We have data for 31 stocks
['WMT.N-2010.tar', 'AMGN.OQ-2010.tar', 'RTX.N-2010.tar', 'IBM.N-2010.tar', 'UTX.N-2010.tar', 'NKE.N-2010.tar', 'VZ.N-2010.tar', 'KO.N-2010.tar', 'XOM.N-2010.tar', 'GS.N-2010.tar', 'JPM.N-2010.tar', 'AXP.N-2010.tar', 'MRK.N-2010.tar', 'WBA.OQ-2010.tar', 'CAT.N-2010.tar', 'DOW.N-2010.tar', 'V.N-2010.tar', 'CVX.N-2010.tar', 'PFE.N-2010.tar', 'JNJ.N-2010.tar', 'MMM.N-2010.tar', 'TRV.N-2010.tar', 'CSCO.OQ-2010.tar', 'PG.N-2010.tar', 'HD.N-2010.tar', 'BA.N-2010.tar', 'MSFT.OQ-2010.tar', 'UNH.N-2010.tar', 'MCD.N-2010.tar', 'INTC.OQ-2010.tar', 'AAPL.OQ-2010.tar']
The Different types of files given are : Counter({'.gz': 16182})


### a) Data Extraction and cleaning:

In [20]:
# For every archive: create <csv_files>/<archive name without extension>/ and
# unpack all of its regular files directly into that folder.
for tar_file in tar_files:
    # The archive's base name, minus its extension, names the output folder
    archive_basename = os.path.basename(tar_file)
    archive_stem = os.path.splitext(archive_basename)[0]
    target_dir = os.path.join(dataSavingDirectory, archive_stem)

    # Make the output folder on first use
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    with tarfile.open(tar_file, "r") as archive:
        # Only regular files are unpacked; every other member type is skipped
        regular_members = (entry for entry in archive.getmembers() if entry.isfile())
        for entry in regular_members:
            # Dropping the directory part of the member name flattens the
            # archive's internal folder structure
            entry.name = os.path.basename(entry.name)
            archive.extract(entry, target_dir)


In [33]:
#### Example of Reading a csv.gz files
# Keep only sub-directories: a loose file in csv_files (e.g. .DS_Store) could
# otherwise end up as directory[0] and make os.listdir(dir_folder) fail.
directory = [entry for entry in os.listdir(dataSavingDirectory)
             if os.path.isdir(os.path.join(dataSavingDirectory, entry))]

dir_folder = os.path.join(dataSavingDirectory, directory[0])
# List the folder once so the count and the preview describe the same snapshot.
dir_folder_files = os.listdir(dir_folder)
print("-"*50)
print(f"folder {dir_folder} has {len(dir_folder_files)} files")
print(dir_folder_files[:5])

--------------------------------------------------
folder /Users/yacine/Desktop/Flash_crash/data/raw/flash_crash_DJIA/csv_files/GS.N-2010 has 524 files
['2010-08-26-GS.N-bbo.csv.gz', '2010-01-27-GS.N-trade.csv.gz', '2010-07-06-GS.N-trade.csv.gz', '2010-11-25-GS.N-bbo.csv.gz', '2010-07-16-GS.N-bbo.csv.gz']


In [37]:
### Making trade and bbo folder for each stock
# A stock folder is any sub-directory of csv_files. Testing with isdir (rather
# than excluding the single name .DS_Store) also skips every other loose file,
# under which the trade/bbo sub-folders could not be created.
directories = [os.path.join(dataSavingDirectory, directory)
               for directory in os.listdir(dataSavingDirectory)
               if os.path.isdir(os.path.join(dataSavingDirectory, directory))]
trade_directories = [os.path.join(directory, "trade") for directory in directories]
bbo_directories = [os.path.join(directory, "bbo") for directory in directories]

### Creating these directories if not existed yet :
# exist_ok=True makes the creation idempotent, so the cell can be re-run.
for sub_dir in trade_directories + bbo_directories:
    os.makedirs(sub_dir, exist_ok=True)

### We will iterate over each directory :
for directory in directories:
    trade_dir = os.path.join(directory, "trade")
    bbo_dir = os.path.join(directory, "bbo")
    for filename in os.listdir(directory):
        # Only the compressed csv files are moved; this also skips the
        # "trade" and "bbo" sub-folders themselves.
        if not filename.endswith(".csv.gz"):
            continue
        source_path = os.path.join(directory, filename)
        if 'trade' in filename:
            # Move trade files to the trade directory
            shutil.move(source_path, os.path.join(trade_dir, filename))
        elif 'bbo' in filename:
            # Move bbo files to the bbo directory
            shutil.move(source_path, os.path.join(bbo_dir, filename))


In [54]:
## Example of trade file :
# Take the first stock's trade folder and the first file listed in it.
trade_dir = trade_directories[0]
trade_dir_content = os.listdir(trade_dir)
print(f"Number of files in {trade_dir} = ", len(trade_dir_content))

trade_file = trade_dir_content[0]
trade_file_path = os.path.join(trade_dir, trade_file)
# gzip.open in text mode ('rt') hands pandas an already-decompressed text stream
with gzip.open(trade_file_path, 'rt') as file:
    df = pd.read_csv(file)
print(trade_file)
df.head()

Number of files in /Users/yacine/Desktop/Flash_crash/data/raw/flash_crash_DJIA/csv_files/GS.N-2010/trade =  261
2010-01-27-GS.N-trade.csv.gz


Unnamed: 0,xltime,trade-price,trade-volume,trade-stringflag,trade-rawflag
0,40205.604499,150.75,122600,auction,[CTS_QUAL ]O [GV1_TEXT ] O [USER ]Open [USER ]High [USER ]Low
1,40205.6045,150.8,100,uncategorized,[CTS_QUAL ] [GV1_TEXT ]@ [USER ]High
2,40205.6045,150.8,100,uncategorized,[CTS_QUAL ] [GV1_TEXT ]@
3,40205.6045,150.8,100,uncategorized,[CTS_QUAL ] [GV1_TEXT ]@
4,40205.6045,150.8,100,uncategorized,[CTS_QUAL ] [GV1_TEXT ]@


In [55]:
## Example of bbo file :
# Take the first stock's bbo folder and the first file listed in it.
bbo_dir = bbo_directories[0]
bbo_dir_content = os.listdir(bbo_dir)
bbo_file = bbo_dir_content[0]
print(f"Number of files in {bbo_dir} = ", len(bbo_dir_content))

bbo_file_path = os.path.join(bbo_dir, bbo_file)
# gzip.open in text mode ('rt') hands pandas an already-decompressed text stream
with gzip.open(bbo_file_path, 'rt') as file:
    df = pd.read_csv(file)
df.head(4)

Number of files in /Users/yacine/Desktop/Flash_crash/data/raw/flash_crash_DJIA/csv_files/GS.N-2010/bbo =  261


Unnamed: 0,xltime,bid-price,bid-volume,ask-price,ask-volume
0,40416.562628,144.7,1,144.93,1
1,40416.562628,144.76,1,144.93,1
2,40416.562628,144.76,2,144.93,1
3,40416.562628,144.76,2,144.93,2


### b) Regrouping csv files by month (optional): 

In [60]:
# Regroup by month: first the per-stock trade folders, then the bbo folders.
# createFoldersGroupingYnMonthfiles is provided by the helpers module.
for stock_directories in (trade_directories, bbo_directories):
    createFoldersGroupingYnMonthfiles(stock_directories)

## Conclusion of this Notebook:
 
In this notebook we started from the .tar files located in the tar_files folder. We first explored the contents of the tar files and found that they contain only .gz files. For each tar file we created a corresponding folder inside the csv_files folder. After exploring the names of the .gz files, we noticed that there are trade and bbo files, so as a second step, for each stock, we created a trade directory grouping the csv.gz trade files and a bbo directory grouping the csv.gz bbo (best bid and offer) files. We then made an initial exploration of one arbitrary trade file and one arbitrary bbo file. As a final step, we regrouped the files by year and month. Thus the final structure is:    
csv_files directory --> one directory per stock --> trade and bbo directories --> year & month directories --> csv.gz files 

https://chat.openai.com/share/0f635485-122d-41d3-8d32-7416801992b2