Generate two new samples of data from the basic_avg_cnt sample:
1. A sample where the symptoms are randomly selected, weighted by their probabilities
2. A sample where the symptoms are selected uniformly at random

In [None]:
# Paths to the parsed, sparse-format train and test CSVs found under the
# output_basic_avg_cnt_15k directory.
avg_csv_train_file = "/home/oagba/bulk/data/output_basic_avg_cnt_15k/symptoms/csv/parsed/train.csv_sparse.csv"
avg_csv_test_file = "/home/oagba/bulk/data/output_basic_avg_cnt_15k/symptoms/csv/parsed/test.csv_sparse.csv"

In [None]:
# Path to the JSON file of condition definitions; it is loaded further down
# and read as {condition: {"symptoms": {symptom: {"probability": ...}}}}.
conditions_file = "/home/oagba/bulk/data/definitions/conditions.json"

In [None]:
from glob import glob
import json
import pandas as pd
import os
import numpy as np
import hashlib

In [None]:
# Build {sha224(condition key): {sha224(symptom key): probability}} from the
# condition definitions file.
condition_prob_hash = {}
# Read the JSON as UTF-8 explicitly: the keys are hashed from their UTF-8 bytes
# below, so decoding the file with a locale-dependent default encoding could
# silently change the hashes of non-ASCII names.
with open(conditions_file, encoding="utf-8") as fp:
    conditions_data = json.load(fp)

for condition, values in conditions_data.items():
    condition_hash = hashlib.sha224(condition.encode("utf-8")).hexdigest()
    symptoms = values.get("symptoms")
    symptom_prob_hash = {}
    for symptom, symptom_data in symptoms.items():
        symptom_hash = hashlib.sha224(symptom.encode("utf-8")).hexdigest()
        # dict.get: a symptom without a "probability" entry is stored as None.
        prob = symptom_data.get("probability")
        symptom_prob_hash[symptom_hash] = prob
    condition_prob_hash[condition_hash] = symptom_prob_hash

In [None]:
# Persist the condition -> symptom -> probability mapping as a single JSON
# document so it can be reloaded without re-parsing the definitions.
cnd_prob_hash_file = "/home/oagba/bulk/data/definitions/condition_prob_hash.json"
with open(cnd_prob_hash_file, "w") as fp:
    serialized = json.dumps(condition_prob_hash)
    fp.write(serialized)

In [None]:
# Load the condition and symptom databases and derive stable integer ids for
# their codes from the sorted key order.
conditions_db_file = "/home/oagba/bulk/data/definitions/condition_db.json"
symptom_db_file = "/home/oagba/bulk/data/definitions/symptom_db.json"

# Explicit UTF-8 so the decoded keys do not depend on the platform locale.
with open(conditions_db_file, encoding="utf-8") as fp:
    conditions_db = json.load(fp)
with open(symptom_db_file, encoding="utf-8") as fp:
    symptoms_db = json.load(fp)

# Sort once and reuse: position in the sorted list is the id of each code.
sorted_conditions = sorted(conditions_db)
sorted_symptoms = sorted(symptoms_db)
# condition code -> integer label
condition_labels = {code: idx for idx, code in enumerate(sorted_conditions)}
# symptom code -> label rendered as a string (it is later joined with ",")
symptom_map = {code: str(idx) for idx, code in enumerate(sorted_symptoms)}

In [None]:
# Load the test CSV and keep only the two columns that weighted_selection
# reads from each row: the condition label and the symptom list.
test_csv = pd.read_csv(avg_csv_test_file)
cnd_symp = test_csv[['LABEL', 'SYMPTOMS']]

In [None]:
# First rows only (DataFrame.head() defaults to 5) -- presumably a small
# slice for trying the selection out before running it on all of cnd_symp.
dd = cnd_symp.head()

In [None]:
def weighted_selection(item, cnd_code_list, symp_map, cnd_symp_hash, is_random=False, rng=None):
    """Redraw the symptoms of one record from its condition's symptom pool.

    The record keeps its condition and its number of symptoms; only which
    symptoms are present is redrawn, without replacement, from the symptoms
    listed for that condition in ``cnd_symp_hash``.

    Args:
        item: Row-like object exposing ``SYMPTOMS`` (comma-separated string)
            and ``LABEL`` (used to index ``cnd_code_list``).
        cnd_code_list: Lookup from a record's ``LABEL`` to its condition code.
        symp_map: Mapping from symptom code to the string emitted in the
            result.
        cnd_symp_hash: Mapping ``condition code -> {symptom code: probability}``.
        is_random: If true, draw uniformly and ignore the probabilities;
            otherwise weight the draw by the normalised probabilities.
        rng: Optional ``numpy.random.Generator`` to draw from. Passing a seeded
            generator makes the output reproducible; when omitted a fresh,
            unseeded generator is created (the previous behaviour).

    Returns:
        The selected symptom ids joined with ``","``.

    Raises:
        ValueError: Propagated from ``Generator.choice`` when more symptoms
            are requested than can be drawn without replacement.
        ZeroDivisionError: In weighted mode, when the condition's
            probabilities sum to zero.
    """
    if rng is None:
        rng = np.random.default_rng()

    # Preserve the record's symptom count; only the identities are redrawn.
    num_symptoms = len(item.SYMPTOMS.split(","))
    cnd_symptoms_map = cnd_symp_hash[cnd_code_list[item.LABEL]]

    # A fixed (sorted) order keeps candidates and weights aligned and makes
    # draws from a seeded generator repeatable.
    symptom_codes = sorted(cnd_symptoms_map)
    candidates = [symp_map[code] for code in symptom_codes]

    # Probabilities are only touched in weighted mode, so uniform sampling
    # still works when they are missing or sum to zero.
    probabilities = None
    if not is_random:
        weights = [cnd_symptoms_map[code] for code in symptom_codes]
        total = sum(weights)
        probabilities = [weight / total for weight in weights]

    # p=None makes Generator.choice sample uniformly.
    selected = rng.choice(candidates, num_symptoms, replace=False, p=probabilities)
    return ",".join(selected.tolist())

In [None]:
# Lookup tables handed unchanged to weighted_selection for every row.
# is_random is not passed, so the draw is probability-weighted.
selection_kwargs = {
    "cnd_code_list": sorted_conditions,
    "symp_map": symptom_map,
    "cnd_symp_hash": condition_prob_hash,
}
# axis=1 calls the function once per row of dd.
res = dd.apply(weighted_selection, axis=1, **selection_kwargs)