# Train with bytestring data when the input is very large

In [1]:
#import sys < MIDWAYU-SPECIFIC IMPORTS
#sys.path.insert(0, "/project2/ishanu/DMYTRO/QNET/quasinet")
from quasinet.qnet import *
from quasinet.qsampling import *

In [30]:
import pandas as pd
import numpy as np
import subprocess
from tqdm import tqdm

def count_lines(filepath):
    """
    Count the newline characters in a file (the number `wc -l` reports).

    The file is scanned in binary chunks in pure Python, so no external
    `wc` executable or subprocess is needed and the function works on
    any platform.

    Parameters
    ----------
    filepath : str or path-like
        File to scan.

    Returns
    -------
    int
        Number of newline bytes in the file. A final line that has no
        trailing newline is not counted, matching `wc -l`.

    Raises
    ------
    Exception
        If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # fixed-size chunks keep memory flat for huge files
    newline_count = 0
    try:
        with open(filepath, 'rb') as handle:
            # iter(callable, sentinel) stops at the empty bytes object read() returns at EOF
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                newline_count += chunk.count(b'\n')
    except OSError as err:
        raise Exception(f"Error counting lines: {err}") from err
    return newline_count

def read_csv_into_bytearray(filepath, nrows = 0):
    """
    Load a csv of single-character values into a 2-D numpy "S1" array.

    When you know for sure that every cell is either empty or a string of
    length 1, it is much faster to load the csv lines straight into
    bytestring arrays than to go through a general csv parser.

    Assumes the file has a csv-style header row and an identifier
    (patient_id) as the first column; both are dropped.

    Parameters
    ----------
    filepath : str
        Path to the csv file.
    nrows : int, optional
        Maximum number of data rows to read. 0 (the default) reads
        the whole file.

    Returns
    -------
    numpy.ndarray
        Array of dtype "S1" with one row per data line and one column per
        csv field after the first.

    Raises
    ------
    ValueError
        If the file contains no data rows, or if the rows do not all have
        the same number of fields (raised by np.vstack).

    Notes
    -----
    Fields are split on ',' only (no quoting support). Values longer than
    one character are truncated to their first byte by the "S1" dtype.
    """
    rows = []

    # count_lines() also counts the header line, so take it off the
    # progress total, and cap the total at nrows when a limit is given.
    total = max(count_lines(filepath) - 1, 0)
    if nrows and nrows > 0:
        total = min(total, nrows)

    with open(filepath, 'r') as file:
        # Skip the header row; the default keeps an empty file from
        # raising a bare StopIteration here.
        next(file, None)

        for line in tqdm(
                file,
                position = 0,
                leave = True,
                total = total):
            if nrows and len(rows) >= nrows:
                break
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing one) would yield a
                # zero-width row and break np.vstack below.
                continue
            # Split the line by comma and ignore the first value
            values = line.split(',')[1:]
            rows.append(np.array(values, dtype = "S1"))

    if not rows:
        raise ValueError(f"No data rows found in {filepath!r}")

    return np.vstack(rows)

In [10]:
# Training csv read by the cells below (header for column names, rows for data).
# Relative path: resolved against the notebook's working directory.
INPUT_PATH = "QNET_TRAINING_SET.csv"

In [15]:
"""
    Load the tree names directly from the input
"""
with open(INPUT_PATH, "r") as f:
    # Strip the line terminator before splitting; otherwise the last
    # column name keeps a trailing "\n" (or "\r\n") and no longer matches
    # the real feature name.
    ALL_COLS = f.readline().rstrip("\r\n").split(',')
    # The identifier column is not a feature, so drop it when present.
    if ALL_COLS[0] == 'patient_id':
        ALL_COLS = ALL_COLS[1:]
    print(ALL_COLS[:5])

['I10_Y75-1-1', 'I10_Y76-1-1', 'I10_Y77-1-1', 'I10_Y78-1-1', 'I10_Y79-1-1']


In [12]:
%%time
# Parse the whole training file (nrows left at its default) into X.
X = read_csv_into_bytearray(INPUT_PATH)

100%|█████████▉| 762/763 [00:00<00:00, 14553.21it/s]

CPU times: user 61.8 ms, sys: 29.6 ms, total: 91.4 ms
Wall time: 117 ms





In [36]:
%%time
"""
    Also load as usual, just to see the difference
"""
DF = pd.read_csv(
    INPUT_PATH,
    na_filter = False,  # do not convert empty cells to NaN
)
# Drop the first column (patient_id) and cast every cell to a 1-char unicode string.
Xold = DF.iloc[:, 1:].values.astype("<U1")

CPU times: user 143 ms, sys: 4.8 ms, total: 148 ms
Wall time: 145 ms


In [39]:
"""
    COMPARE SIZES
"""
import sys

# Both loaders should report the same (rows, columns) shape.
print(X.shape)
print(Xold.shape)

# sys.getsizeof returns bytes; divide by 10^6 to report megabytes.
print(f"X (bytestring) SIZE: {sys.getsizeof(X) / 1_000_000:.3f} MB")
print(f"X (old style) SIZE: {sys.getsizeof(Xold) / 1_000_000:.3f} MB")

(762, 528)
(762, 528)
X (bytestring) SIZE: 0.402 MB
X (old style) SIZE: 1.609 MB


In [23]:
N_JOBS = 28       # passed as n_jobs to every Qnet built below
BATCH_SIZE = 28   # number of tree columns handled per batch
TREE_COLUMNS = ALL_COLS

# Chop the column list into consecutive chunks of BATCH_SIZE names;
# the last chunk may be shorter.
TREE_BATCHES = [
    TREE_COLUMNS[start:start + BATCH_SIZE]
    for start in range(0, len(TREE_COLUMNS), BATCH_SIZE)
]

print(len(TREE_BATCHES))

19


In [25]:
# Sanity check: show the first two column names of the first batch.
TREE_BATCHES[0][:2]

['I10_Y75-1-1', 'I10_Y76-1-1']

In [26]:
# One partially trained Qnet per batch of tree columns.
QNET_PARTS = []

for i, BATCH in tqdm(
        enumerate(TREE_BATCHES),
        desc = "Train QNet trees",
        total = len(TREE_BATCHES),
        unit = 'batches',
        position = 0,
        leave = True):

    # Positions in ALL_COLS of the columns that belong to this batch.
    INDICES = list(map(ALL_COLS.index, BATCH))

    # A fresh Qnet per batch, always constructed with the full feature list.
    QNET_PART = Qnet(
        feature_names = ALL_COLS,
        alpha = 0.1,
        n_jobs = N_JOBS,
        max_depth = -1,
        max_feats = -1,
        min_samples_split = 2,
        random_state = None,
        early_stopping = False,
        verbose = 0,
    )

    # NOTE(review): presumably index_array restricts fitting to the trees
    # at these column indices; confirm against the quasinet API.
    QNET_PART.fit(X, index_array = INDICES)
    QNET_PARTS.append(QNET_PART)

Train QNet trees: 100%|██████████| 19/19 [01:42<00:00,  5.39s/batches]


In [28]:
# Merge the per-batch QNets into a single QNet and save it to disk.
if not QNET_PARTS:
    # Fail with a clear message instead of an AttributeError further down.
    raise ValueError("QNET_PARTS is empty: train the QNet parts before combining them")

# None (not []) marks "nothing merged yet", so the check below does not
# depend on how a Qnet object evaluates in a boolean context.
FULL_QNET = None

for QNET_PART in tqdm(
        QNET_PARTS,
        "Combine QNet trees into one QNet",
        position = 0,
        unit = 'parts',
        leave = True):
    if FULL_QNET is None:
        # NOTE(review): FULL_QNET is the same object as QNET_PARTS[0]. If
        # mix() modifies its receiver in place, re-running this cell without
        # retraining would mix into an already merged part; confirm.
        FULL_QNET = QNET_PART
    else:
        FULL_QNET.mix(QNET_PART, ALL_COLS)

# Attach the training matrix to the combined QNet before saving.
FULL_QNET.training_data = X

save_qnet(
    FULL_QNET,
    "FULL_QNET.joblib",
    low_mem = True, gz = True
)

Cobmine QNet trees into one QNet: 100%|██████████| 19/19 [00:00<00:00, 74.04parts/s]
