In [1]:
import concurrent.futures
import pathlib
import tarfile
import re
import sys

import pandas as pd
import tqdm

sys.path.insert(0, '../src/')
import utils

# Create a map of files and archives

Currently, propensity scores are stored as individual files for each bootstrap iteration of each drug. These files are grouped and stored in roughly 220 `.tgz` (`.tar.gz`) archive files, alongside log files, which give the AUROC values for each bootstrap iteration's propensity scores. That is, how well the propensity scores predict exposure.

Since I was at first not sure whether I could be sure that all files for a given drug would be in a single archive (it turns out that they are), I decided it would be best to create a map of in which archives all files are stored. The relevant data for a drug (for now) are 20 `score` files and 20 `log` files, one of each for each bootstrap iteration.

Archives are between 0.5 - 1.5 GB per file, while the uncompressed `score` files are roughly 11 MB each. `log` files are tiny, as they are simply of the format `{'auc': 0.5, 'acc', 0.6}`.

In [2]:
# Path to where the `.tgz` archives are stored
archives_path = pathlib.Path('../data/archives/')

In [3]:
def get_subfiles(archive_file_path):
    file_locations = list()
    tar = tarfile.open(archive_file_path, mode='r:gz')
    subfiles = tar.getnames()
    
    for subfile in subfiles:
        if 'interaction' in subfile:
            drug = re.match(r'(?:interactions__)([0-9]+)(?:\.npy)', subfile)
            if not drug:
                raise ValueError(f'{archive_file_path.name} contained {subfile} not matched')
            drug = drug.group(1)
            bootstrap = None
            file_type = 'interaction'
        else:
            drug, bootstrap = utils.extract_indices(subfile)
            file_type = re.match('^[a-z]+(?=_.+)', subfile).group()
        file_locations.append([drug, bootstrap, file_type, subfile, 
                               archive_file_path.name])
    return file_locations

In [4]:
archive_files = list(archives_path.glob('scores_*.tgz'))

with concurrent.futures.ProcessPoolExecutor() as executor:
    all_file_locations = list(
        tqdm.tqdm(executor.map(get_subfiles, archive_files),
                  total=len(archive_files))
    )

100%|██████████| 220/220 [11:08<00:00,  3.04s/it]


In [5]:
# Flatten the list of lists of tuples to a list of tuples
all_file_locations = [i for l in all_file_locations for i in l]
files_map = pd.DataFrame(all_file_locations, 
                         columns=['drug', 'bootstrap', 
                                  'file_type', 'file_name', 
                                  'archive_file'])
files_map.to_csv('../data/meta/file_map_offsides.csv', index=False)
files_map.head(2)

Unnamed: 0,drug,bootstrap,file_type,file_name,archive_file
0,2782,10.0,scores,scores_lrc_10__2782.npy,scores_140.tgz
1,2787,10.0,scores,scores_lrc_10__2787.npy,scores_140.tgz


In [6]:
# See whether the files for any drug are split across multiple files
#  Because the value is 1, this indicates that no drug has its files split
#  across multiple archive files.
(
    files_map
    .groupby('drug')['archive_file']
    .nunique()
    .max()
)

1