In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from config import CLINICAL_MBV_FILES, PROCESSED_DIR
from utils import Barcode2

# 02 Clinical

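This notebook reads in the clinical information provided by FIND, assigns samples to the training and validation cohorts, aligns them with the processed sample results, and writes out the arrays used for machine learning.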

## Clinical data for training and validation (but not blinded test) cohorts

In [2]:
# Read data files and tag each row with the cohort implied by its source file
mb_100 = pd.read_excel(CLINICAL_MBV_FILES[0], sheet_name="URINE SAMPLES")
mb_100["Cohort"] = "training"
val_320 = pd.read_excel(CLINICAL_MBV_FILES[1])
val_320["Cohort"] = "validation"

# Concatenate (pd.concat already returns a new frame, so no extra copy is needed)
mbv_420 = pd.concat([mb_100, val_320], ignore_index=True)

# Key every row by the value returned from Barcode2(...).any_aliquot()
# (presumably the barcode with the aliquot part normalised -- confirm in utils).
mbv_420["Barcode Any Aliquot"] = mbv_420["barcode"].map(
    lambda b: Barcode2(b).any_aliquot()
)
mbv_420.index = pd.Index(mbv_420["Barcode Any Aliquot"], name="")

# Drop duplicates (same OS_PatientID, different row).
# Before dropping, share information between rows of the same barcode so the
# surviving (first) row is as complete as possible.  Both fills must be done
# per barcode: `groupby(...).ffill()` returns a plain DataFrame, so a bare
# `.bfill()` chained after it would back-fill across the whole frame and copy
# values from the next, unrelated sample into this one.  The frame is indexed
# by barcode, so `groupby(level=0)` re-groups on the same key.
mbv = (
    mbv_420.groupby("Barcode Any Aliquot")
    .ffill()
    .groupby(level=0)
    .bfill()
    .drop_duplicates(subset="OS_PatientID")
)


## Separate training and validation cohorts

In [3]:
# Take the rows whose p_cat is "S-C+" and split them 50/50 with a fixed seed.
sm_neg_urine = mbv.loc[mbv["p_cat"] == "S-C+"]

# Stratification label combining HIV status and sex into a single value.
# NOTE(review): this yields one distinct label per (HIV_status, SEX) pair only
# if both columns are 0/1 coded -- confirm against the clinical spreadsheet.
y_sn = sm_neg_urine["HIV_status"].mul(2).add(sm_neg_urine["SEX"])

sm_neg_ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=74)
# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional indices into sm_neg_urine.
[(sm_neg_train, sm_neg_val)] = sm_neg_ssp.split(sm_neg_urine, y_sn)

# Only the first half is relabelled; rows in sm_neg_val keep whatever Cohort
# value they already had.
sm_neg_train_barcodes = sm_neg_urine.index[sm_neg_train]
mbv.loc[sm_neg_train_barcodes, "Cohort"] = "training"


## Align with sample results


In [4]:
# Load the processed sample-result tables.
# X_med is indexed by its first column; X_rep and X_all use a two-level index
# built from their first two columns.
X_med = pd.read_csv(PROCESSED_DIR / "X_med.csv", index_col=0)
X_rep, X_all = (
    pd.read_csv(PROCESSED_DIR / csv_name, index_col=[0, 1])
    for csv_name in ("X_rep.csv", "X_all.csv")
)

# Clinical rows whose index label has no matching entry in X_med.
barcodes_without_results = mbv.index.difference(X_med.index)
missing_samples = mbv.loc[barcodes_without_results]

## Create arrays for machine learning

In [5]:
# Keep only the index labels present in both the sample results and the
# clinical table, so X and y are aligned row for row.
shared_barcodes = X_med.index.intersection(mbv.index)
X = X_med.loc[shared_barcodes]

# Binary target derived from p_cat: "NonTB_NonLTBI" -> 0, both "S+C+" and
# "S-C+" -> 1.  Any p_cat value not listed here becomes NaN.
p_cat_to_label = {"NonTB_NonLTBI": 0, "S+C+": 1, "S-C+": 1}
y = mbv.loc[shared_barcodes, "p_cat"].map(p_cat_to_label)

# Write the aligned arrays and the full clinical table to the processed dir.
X.to_csv(PROCESSED_DIR / "X.csv")
y.to_csv(PROCESSED_DIR / "y.csv")
mbv.to_csv(PROCESSED_DIR / "mbv.csv")