In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

In [3]:
from dask_jobqueue import SLURMCluster
import dask.dataframe as dd
from dask.distributed import Client

In [4]:
# Dataset directory analysed by this notebook.
# The 15k export used to be assigned here and then immediately overwritten by
# the 45k path below; it is kept only as a commented-out alternative.
# data_dir = "/home/oagba/bulk/data/output_basic_15k/symptoms/csv"
data_dir = "/home/oagba/bulk/data/output_basic_45k/symptoms/csv"

In [5]:
# Options passed to SLURMCluster describing the SLURM job(s) behind the dask cluster.
slurm_job_options = {
    "queue": 'general',
    # "project": 'medvice_parse',
    "cores": 16,
    "memory": '80 GB',
    "walltime": '01:00:00',
}
cluster = SLURMCluster(**slurm_job_options)

In [6]:
# Attach a distributed client to the SLURM-backed cluster; it is used later
# to persist the filtered dataframe.
client = Client(cluster)
# Ask the cluster to scale to a size of 1.
cluster.scale(1)

In [7]:
# Full path of the symptoms CSV inside the selected dataset directory.
symptoms_csv_name = "symptoms.csv"
symptoms_file = os.path.join(data_dir, symptoms_csv_name)

In [8]:
# Open the symptoms CSV as a dask dataframe.
df = dd.read_csv(symptoms_file)



In [9]:
# Keep only the rows whose NUM_SYMPTOMS is strictly positive.
has_symptoms = df["NUM_SYMPTOMS"] > 0
df = df.loc[has_symptoms]

In [10]:
# Persist the filtered dataframe through the distributed client and keep
# working with the persisted collection from here on.
df = client.persist(df)

In [11]:
# Mean of NUM_SYMPTOMS as a dask expression; it is not evaluated in this cell.
mean_num_symptoms = df["NUM_SYMPTOMS"].mean()

In [12]:
# Remaining summary statistics of NUM_SYMPTOMS, all built from one column handle.
num_symptoms_col = df["NUM_SYMPTOMS"]
std_num_symptoms = num_symptoms_col.std()
min_num_symptoms = num_symptoms_col.min()
max_num_symptoms = num_symptoms_col.max()

In [None]:
# Evaluate the four summary statistics in a single dd.compute call.
lazy_stats = (
    mean_num_symptoms,
    std_num_symptoms,
    min_num_symptoms,
    max_num_symptoms,
)
mean, std, min_num, max_num = dd.compute(*lazy_stats)

In [None]:
# One count expression per possible symptom count in [min_num, max_num]:
# how many rows have exactly that NUM_SYMPTOMS value.
num_symptoms_col = df.NUM_SYMPTOMS
symptom_dist = [
    num_symptoms_col[num_symptoms_col == n_symptoms].count()
    for n_symptoms in range(min_num, max_num + 1)
]

In [None]:
# Evaluate every per-value count at once. The result of dd.compute is indexed
# with [0] in the next cell to get the list of counts back.
symptom_dist = dd.compute(symptom_dist)

In [None]:
# Unwrap the first (only) entry of the dd.compute result.
# NOTE(review): this rebinds the same name, so running this cell twice would
# index into the counts themselves - run it exactly once after the compute cell.
symptom_dist = symptom_dist[0]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Directory that receives the figures and JSON summaries written by this notebook.
fig_dir = "/shares/bulk/oagba/thesis-notebooks/zz_qce_server/figures/explore_symptom_dist_45k"
# makedirs(..., exist_ok=True) also creates missing parent directories and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(fig_dir, exist_ok=True)

In [None]:
# Bar chart of the number of rows per NUM_SYMPTOMS value, saved as a PDF.
fig, axes = plt.subplots()
axes.set_ylabel("Num. Conditions")
axes.set_xlabel("Num. Symptoms")
# Format the evaluated statistics (`mean`, `std`) returned by dd.compute.
# The title previously used `mean_num_symptoms`, which is the un-evaluated
# dask expression rather than a number.
axes.set_title("Symptom Histogram. Mean: %.2f. Std: %.2f" % (mean, std))
# One bar per symptom count in [min_num, max_num], matching symptom_dist's order.
axes.bar(range(min_num, max_num + 1), symptom_dist)

fig.set_size_inches(8, 8)
figname = os.path.join(fig_dir, "symptom_hist_min_1.pdf")
fig.savefig(figname)

In [None]:
# Quantify data imbalance: number of rows per PATHOLOGY value, evaluated into
# an in-memory result.
rows_per_pathology = df.groupby("PATHOLOGY").size()
condition_counts = rows_per_pathology.compute()

In [None]:
# The ten conditions with the highest row counts, largest first.
top_10_frequent = condition_counts.nlargest(n=10)

In [None]:
# The ten conditions with the lowest row counts, smallest first.
least_10_frequent = condition_counts.nsmallest(n=10)

In [None]:
# Mean number of rows per condition.
average_count = condition_counts.mean()

In [None]:
# Standard deviation of the per-condition row counts.
std_count = condition_counts.std()

In [None]:
import json

In [None]:
# Load the condition definitions; later cells look entries up by PATHOLOGY value.
# The path has its own name so that `condition_db` only ever refers to the
# parsed JSON. Previously the path and the parsed data shared one name, so
# re-running this cell passed the parsed data to open().
condition_db_path = "/shares/bulk/oagba/data/definitions/condition_db.json"
with open(condition_db_path) as fp:
    condition_db = json.load(fp)

In [None]:
def _name_and_count(counts):
    """Map each label in `counts` to its condition_db entry and its integer count."""
    return {
        label: {"name": condition_db[label], "count": int(count)}
        for label, count in counts.items()
    }


top_10_names = _name_and_count(top_10_frequent)
least_10_names = _name_and_count(least_10_frequent)

In [None]:
# Write the most / least frequent conditions to a JSON file next to the figures.
top_10_least_10_file = os.path.join(fig_dir, "top_10_least_10_min_symp_1.json")
frequency_summary = {"top_10": top_10_names, "least_10": least_10_names}
with open(top_10_least_10_file, "w") as fp:
    json.dump(frequency_summary, fp, indent=4)

In [None]:
# Bar heights for the summary plot: top-10 counts, then the median count,
# then the least-10 counts in reversed order.
top_counts = top_10_frequent.values
median_count = np.array([condition_counts.median()])
least_counts_reversed = least_10_frequent.values[::-1]
data = np.hstack((top_counts, median_count, least_counts_reversed))

In [None]:
# Scratch check of list repetition (the same idiom builds the colour list in
# the plotting cell below); the result is only displayed, never assigned.
['r']*2

In [None]:
# We can't plot all 801 conditions, so plot the ten most frequent and the ten
# least frequent ones, with the median count between them, to give an idea of
# the range.
fig, axes = plt.subplots()
axes.set_xlabel("Condition Label")
axes.set_ylabel("Sample Count")
axes.set_title("Top 10 and Least 10 conditions by occurrence. Mean: %.0f" % average_count)

# One colour per bar: red for the top 10, blue for the median, black for the
# least 10. The list was previously built but never passed to the bar calls.
colors = ['r'] * 10 + ['b'] + ['k'] * 10
# Three separate bar() calls so that each group gets its own legend entry.
axes.bar(range(1, 11), data[:10], color=colors[:10], label='Top 10')
axes.bar(range(11, 12), data[10], color=colors[10], label='Median')
axes.bar(range(12, len(data) + 1), data[11:], color=colors[11:], label='Least 10')
axes.legend()

fig.set_size_inches(8, 8)
figname = os.path.join(fig_dir, "top_10_least_10_median_min_1.pdf")
fig.savefig(figname)

In [None]:
# Every condition count, ordered from most to least frequent.
sorted_conditions = condition_counts.nlargest(len(condition_counts))

In [None]:
# Summary statistics of the per-condition counts, used in the plot title below.
conditions_mean, conditions_std, conditions_median = (
    sorted_conditions.mean(),
    sorted_conditions.std(),
    sorted_conditions.median(),
)

In [None]:
# Bar chart of every condition's sample count, ordered from most to least
# frequent, saved as a PDF.
fig, axes = plt.subplots()
axes.bar(range(1, sorted_conditions.shape[0] + 1), sorted_conditions.values)
axes.set_xlabel("Condition Label")
axes.set_ylabel("Sample Count")
# The arguments follow the order of the labels in the format string
# (mean, median, std); previously the median and std values were swapped.
# "%.0f" rounds to a whole number, whereas "%0f" only sets the zero-pad flag
# and still prints six decimals.
axes.set_title(
    "Conditions orderd by Occurence. Mean: %.0f, Median: %.0f, Std: %.2f"
    % (conditions_mean, conditions_median, conditions_std)
)
fig.set_size_inches(8, 8)
figname = os.path.join(fig_dir, "condition_distribution_min_1.pdf")
fig.savefig(figname)

In [None]:
# Display the sum of the per-condition counts, i.e. the total number of rows
# that were grouped.
condition_counts.sum()