In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from utils import *


In [None]:
# Read the analysed cohort CSV into memory and show how many rows it contains.
file_path_analysed = './cvd/cvd_main+ado/final_cohort_cvd.csv'
df = pd.read_csv(file_path_analysed)
df.shape[0]

In [None]:
# Build the per-participant dictionary from the cohort file via the project helper.
# NOTE(review): presumably keyed by eid with an 'icd_codes' entry per participant
# (that is how a later cell reads it) — confirm in utils.get_data_dict.
data_dict = get_data_dict(file_path_analysed, 'cancer', 'cancer')
# Only the second return value is kept. Judging by the message printed in the next
# cell these are day offsets between the first imaging visit and the diagnosis —
# TODO confirm unit and sign in utils.get_main_infos.
_, deltas = get_main_infos(data_dict, 'first_imaging_visit_date')

In [None]:
# Cumulative number of participants whose delta falls below 1, 2, ..., 9 years.
# Assumes `deltas` holds day counts (the threshold is 365 * years) — TODO confirm.
deltas = pd.DataFrame(deltas, columns=['deltas'])
for i in range(1, 10):
    # Sum the boolean mask to get a plain integer; DataFrame.count() would return
    # a Series and its multi-line repr would end up inside the printed message.
    n_within = int((deltas['deltas'] < 365 * i).sum())
    print("{} diagnosed within {} year(s) after first imaging visit".format(n_within, i))

In [None]:
# Histogram of the delta values using 100 bins.
deltas.hist(bins=100)

In [None]:
# Analysis mode; passed to get_patterns() below.
MODE = 'cancer'

# Column names of the covariates read from the cohort file.
sex_field = ["31-0.0"]
sex_field_genetic = ['22001-0.0']
age_field = [f"21003-{instance}.0" for instance in range(4)]
ethnicity = [f"21000-{instance}.0" for instance in range(3)]
bmi = [f"21001-{instance}.0" for instance in range(4)]
visit_dates_fields = [f"53-{instance}.0" for instance in range(4)]

# Death information: only the "-0.0" columns are used
# ("40000-1.0", "40001-1.0" and the secondary cause "40002-*" are left out).
death_date = ["40000-0.0"]
death_cause_primary = ["40001-0.0"]

# Diagnosis code / date columns are discovered by prefix in the loaded frame.
icd_diagnosis_field, icd_diagnosis_date_field = get_icd_infos('cancer')
icd_code_cols = [name for name in df.columns if name.startswith(icd_diagnosis_field)]
icd_date_cols = [name for name in df.columns if name.startswith(icd_diagnosis_date_field)]
patterns, PATTERN_SELECTION_MODE = get_patterns(MODE)

# Every column needed by the cells below, in read order.
columns_to_read = [
    'eid',
    *sex_field,
    *sex_field_genetic,
    *age_field,
    *ethnicity,
    *bmi,
    *death_date,
    *death_cause_primary,
    *icd_code_cols,
    *icd_date_cols,
    *visit_dates_fields,
]

In [None]:
# TODO: unfinished note ("find ...") — complete it or remove this cell

In [None]:
# Reload the cohort restricted to `columns_to_read`. This replaces the full
# DataFrame that was loaded earlier under the same name `df`.
# Alternative input files, kept for reference:
#df = pd.read_csv('./cvd+ado_all.csv', usecols=columns_to_read)
#df = pd.read_csv('./cvd+ado_all_after_first_imaging.csv', usecols=columns_to_read)
df = pd.read_csv(file_path_analysed, usecols=columns_to_read)

In [None]:
# Sanity check: is a death date present for the first row of the frame?
pd.notna(df[death_date[0]].iloc[0])

In [None]:
# Parse every date-like column (visits, diagnosis dates, death date) from
# 'YYYY-MM-DD' strings; values that cannot be parsed are coerced to NaT.
date_columns = visit_dates_fields + icd_date_cols + death_date
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d', errors='coerce')

In [None]:
# Earliest diagnosis date of the first participant.
# The pandas reduction skips NaT. The builtin min() does not: every comparison
# with NaT is False, so it returns NaT whenever the first date column is empty.
# Selecting the date columns before the row keeps the datetime64 dtype.
df[icd_date_cols].iloc[0].min()

In [None]:
# Days between the first recorded diagnosis date and death, for every
# participant that has a death date. Requires the date columns to have been
# converted with pd.to_datetime (see the parsing cell).

# Row-wise earliest diagnosis date. DataFrame.min skips NaT, whereas the builtin
# min() over a row returns NaT as soon as the first date column is NaT.
first_diagnosis_date = df[icd_date_cols].min(axis=1)

has_death_date = df[death_date[0]].notna()
survival = df.loc[has_death_date, death_date[0]] - first_diagnosis_date[has_death_date]

# Flag implausible records where death precedes the first diagnosis.
died_before_diagnosis = survival < pd.Timedelta(0)
flagged_index = died_before_diagnosis[died_before_diagnosis].index
for eid, delta in zip(df.loc[flagged_index, 'eid'], survival[flagged_index]):
    print("ALERT", eid, delta)

# Single-column frame (column 0) of whole days; NaN where no diagnosis date exists.
deltas = pd.DataFrame(survival.dt.days.tolist())
print("Died in days after\nfirst cancer diagnosis")
deltas.describe()

In [None]:
# Days between the first recorded diagnosis date and death, restricted to
# participants whose primary cause of death is contained in `patterns`.
# Requires the date columns to have been converted with pd.to_datetime.

# Row-wise earliest diagnosis date. DataFrame.min skips NaT, whereas the builtin
# min() over a row returns NaT as soon as the first date column is NaT.
first_diagnosis_date = df[icd_date_cols].min(axis=1)

deceased = df[df[death_date[0]].notna()]
# NOTE(review): this is an exact membership test against `patterns`; if
# `patterns` holds code prefixes or regexes, full ICD codes will never match —
# confirm against utils.get_patterns.
cause_matches = pd.Series(
    [cause in patterns for cause in deceased[death_cause_primary[0]]],
    index=deceased.index,
    dtype=bool,
)
selected = deceased[cause_matches]
survival = selected[death_date[0]] - first_diagnosis_date[selected.index]

# Single-column frame (column 0) of whole days; NaN where no diagnosis date exists.
deltas = pd.DataFrame(survival.dt.days.tolist())
print("Died in days after\nfirst cancer diagnosis\nand cause is cancer")
deltas.describe()

In [None]:
# Apply seaborn's "whitegrid" theme to the plots created from here on.

sns.set_theme(style="whitegrid")


In [None]:
# Age distribution of male participants (sex code 1), first age column.
males = df[df[sex_field[0]] == 1]
male_age = males[age_field[0]]
male_age.plot.hist()
print("Males mean age: {:.1f} +- {:.1f}".format(male_age.mean(), male_age.std()))
print("Males min and max age:", male_age.min(), "and", male_age.max())

In [None]:
# Age distribution of female participants (sex code 0), first age column.
females = df[df[sex_field[0]] == 0]
female_age = females[age_field[0]]
female_age.plot.hist()
print("Females mean age: {:.1f} +- {:.1f}".format(female_age.mean(), female_age.std()))
print("Females min and max age:", female_age.min(), "and", female_age.max())

In [None]:
# Label helpers (sex and ethnicity codes) and the sex distribution pie chart
def decode_sex(lst):
    """Map sex codes to labels: 0 becomes "female", any other value "male".

    Returns a new list with one label per input element, in input order.
    """
    labels = []
    for code in lst:
        labels.append("female" if code == 0 else "male")
    return labels

def decode_eth(lst, coding_path='./coding1001.tsv'):
    """Translate ethnicity codes into their human-readable meanings.

    Parameters
    ----------
    lst : iterable
        Codes to translate.
    coding_path : str, optional
        Tab-separated coding table with a 'coding' and a 'meaning' column.
        Defaults to './coding1001.tsv', the file used so far.

    Returns
    -------
    list
        The meaning of each code, in input order.

    Raises
    ------
    KeyError
        If a code does not appear in the coding table.
    """
    coding_table = pd.read_csv(coding_path, sep='\t')
    # Lookup table: coding -> meaning
    coding_dict = dict(zip(coding_table['coding'], coding_table['meaning']))
    return [coding_dict[code] for code in lst]
    
        
# Pie chart of the sex distribution across the cohort.
value_counts = df[sex_field[0]].value_counts()
keys = decode_sex(value_counts.index.tolist())
values = value_counts.tolist()

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(values, labels=keys, autopct='%1.1f%%')
ax.set_title('Sex distribution in the dataset')
plt.show()

In [None]:
# Ethnicity distribution: share of the cohort in the most frequent categories.
# decode_eth() is defined in the previous cell; the identical copy that used to
# live here was removed so that there is a single definition.
TOP_N_ETHNICITIES = 5

value_counts = df[ethnicity[0]].value_counts()  # sorted by frequency, NaN excluded
top_counts = value_counts.head(TOP_N_ETHNICITIES)
# Decode only the plotted categories, so a rare code that is missing from the
# coding table cannot break the cell.
keys = decode_eth(top_counts.index.tolist())
values = top_counts.tolist()
df_to_plot = pd.DataFrame({
    'Category': keys,
    'Count': values,
    # Percent of all participants with a recorded ethnicity. Normalizing by the
    # plotted categories alone would overstate every share.
    'Percentage': (top_counts / value_counts.sum() * 100).round(2).tolist(),
})

# One bar per category; the percentage is precomputed, so no estimator is needed.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x='Category',
    y='Percentage',
    data=df_to_plot,
    orient='v'
)

# Write the percentage on top of each bar.
for container in ax.containers:
    ax.bar_label(container)

# Add labels and title
plt.xlabel('Ethnicity')
plt.ylabel('Percentage')
plt.title('Ethnicity distribution')

# Show the plot
plt.show()

In [None]:
# Histogram, mean and standard deviation of BMI, all taken from the same
# column (bmi[0]). The standard deviation used to come from bmi[2], so the
# printed "mean +- std" mixed two different measurements.
bmi_values = df[bmi[0]]
bmi_values.plot.hist()
print(" mean BMI: {:.1f} +- {:.1f}".format(bmi_values.mean(), bmi_values.std()))


In [None]:
# Tally, per cancer type, how many (ICD code, pattern) prefix matches occur
# across all participants. A code is counted once for every pattern it starts
# with, so it can contribute to several types or several times to one type.
cancer_types = get_cancer_types()
cancer_types_distribution = {}
for eid in data_dict:
    participant_codes = data_dict[eid]['icd_codes']
    for icd_code in participant_codes:
        for cancer_type in cancer_types:
            for prefix in cancer_types[cancer_type]:
                if not icd_code.startswith(prefix):
                    continue
                previous = cancer_types_distribution.get(cancer_type, 0)
                cancer_types_distribution[cancer_type] = previous + 1
                    

In [None]:
# Total number of counted matches over all cancer types.
sum(cancer_types_distribution.values())

In [None]:
# Display the per-type counts.
cancer_types_distribution

In [None]:
# Remove the 'c44' entry from the distribution before plotting.
# pop() with a default keeps the cell re-runnable: `del` raises KeyError once
# the key is already gone. The trailing semicolon suppresses the cell output.
cancer_types_distribution.pop('c44', None);

In [None]:
# Pie chart of the cancer type counts: one row per type, a single column 0.
distribution_frame = pd.DataFrame(cancer_types_distribution, index=[0]).T
distribution_frame.plot.pie(subplots=True, figsize=(10, 10), autopct='%1.1f%%', legend=False)

In [None]:
# Total number of counted matches again, now without the removed 'c44' entry.
sum(cancer_types_distribution.values())

In [None]:
# List the loaded columns whose name starts with '40021'.
# NOTE(review): `df` was reloaded with only `columns_to_read`, so this is empty
# unless one of those columns carries that prefix — confirm this is intended.
[col for col in df.columns if col.startswith('40021')]