In [None]:
import pandas as pd
import numpy as np
from helpers import *

In [None]:
# Read the four SAS tables: the "pam" and "all" tables for 2014 and 2016.
pam14_df = pd.read_sas("PAM/hn14_pam.sas7bdat")
pam16_df = pd.read_sas("PAM/hn16_pam.sas7bdat")

all14_df = pd.read_sas("ALL/hn14_all.sas7bdat")
all16_df = pd.read_sas("ALL/hn16_all.sas7bdat")

Keep only the columns that are used for exploratory data analysis (EDA).

In [None]:
# Columns kept from both "all" tables. The identifier column is selected as
# "ID" from the 2016 table and as "id" from the 2014 table.
shared_columns = ["year", "sex", "age", "BP_PHQ_9",
                  "mh_PHQ_S", "HE_BMI", "mh_stress", "EQ5D"]

all16_df = all16_df[["ID"] + shared_columns]
all14_df = all14_df[["id"] + shared_columns]

`process_data` replaces all NaN values with the corresponding mean value, except in the `sex` column, which is handled separately.

In [None]:
# Run the helper's cleaning step on each "all" table.
all14_df = process_data(all14_df)
all16_df = process_data(all16_df)

# Last expression of the cell: the shapes of all four tables, shown as one tuple.
tuple(frame.shape for frame in (all14_df, all16_df, pam14_df, pam16_df))

Convert the raw feature values into meaningful, human-readable values.

In [None]:
# The two PHQ conversions below are left disabled (commented out).
#all14_df['BP_PHQ_9'], all16_df['BP_PHQ_9'] = all14_df['BP_PHQ_9'].apply(Depression_Severity_), all16_df['BP_PHQ_9'].apply(Depression_Severity_)
#all14_df['mh_PHQ_S'], all16_df['mh_PHQ_S'] = all14_df['mh_PHQ_S'].apply(Depression_Severity), all16_df['mh_PHQ_S'].apply(Depression_Severity)

# HE_BMI is recoded in the two "all" tables.
for frame in (all14_df, all16_df):
    frame['HE_BMI'] = frame['HE_BMI'].apply(BMI_range)

# 'sex' and then 'age' are recoded in all four tables with the matching helper.
for column, recode in (('sex', Sex_name), ('age', Age_range)):
    for frame in (pam14_df, pam16_df, all14_df, all16_df):
        frame[column] = frame[column].apply(recode)

In [None]:
# Bare last expression: the notebook renders the preview as a table
# instead of the plain-text dump that print() produces.
all14_df.head()

In [None]:
# Bare last expression: the notebook renders the preview as a table
# instead of the plain-text dump that print() produces.
all16_df.head()

Convert all column names to uppercase and concatenate the 2014 and 2016 dataframes.

In [None]:
def func(df):
    """Return a copy of ``df`` whose column labels are upper-cased."""
    return df.rename(columns=str.upper)


# Upper-case the column labels of all four tables so both years share one naming.
pam14_df, pam16_df, all14_df, all16_df = [
    func(frame) for frame in (pam14_df, pam16_df, all14_df, all16_df)
]

# Stack the two years of each table family on top of each other with a fresh index.
pam_combined = pd.concat([pam14_df, pam16_df], ignore_index=True)
all_combined = pd.concat([all14_df, all16_df], ignore_index=True)

# Remove the MOD_D column from the combined PAM table.
pam_combined = pam_combined.drop(columns='MOD_D')

In [None]:
# Preview, dimensions and total number of NaN cells of the combined PAM table.
pam_preview = pam_combined.head()
pam_nan_total = pam_combined.isna().sum().sum()
print(pam_preview, pam_combined.shape, pam_nan_total)

In [None]:
# Distinct AGE values and the number of missing AGE entries.
age_column = all_combined['AGE']
print(age_column.unique())
print(age_column.isna().sum())

# Preview, dimensions and total number of NaN cells of the combined "all" table.
all_preview = all_combined.head()
all_nan_total = all_combined.isna().sum().sum()
print(all_preview, all_combined.shape, all_nan_total)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

pam_combined_grouped = pam_combined.groupby('ID')

# One figure with a single axes; only the first ID group is drawn.
fig, ax = plt.subplots(figsize=(10, 6))
first_entry = next(iter(pam_combined_grouped), None)
if first_entry is not None:
    name, group = first_entry
    # x is the row index of the group, y its PAXINTEN values.
    ax.plot(group.index, group['PAXINTEN'], label=name)

# Title and axis labels
ax.set_title('PAXINTEN by ID')
ax.set_xlabel('Timestamp')
ax.set_ylabel('PAXINTEN')
#ax.legend(title='ID')
plt.show()

Create combinations (pairs) of IDs within groups defined by SEX -> AGE -> HE_BMI.

In [None]:
from itertools import combinations

# Minimum absolute BP_PHQ_9 difference at which a pair is labelled 1 in 'Depression'.
threshold = 4

# Work on a slice of the combined table.
# NOTE(review): iloc[1:100] starts at position 1, so the first row is skipped and
# 99 rows are kept -- confirm whether iloc[:100] was intended.
all_combined_to_group = all_combined[['ID', 'SEX', 'AGE', 'HE_BMI']].iloc[1:100]

# BP_PHQ_9 of the first row of every ID, indexed by ID. Building this lookup once
# replaces a full scan of all_combined for both members of every pair.
phq9_by_id = all_combined.drop_duplicates(subset='ID').set_index('ID')['BP_PHQ_9']

# Parallel lists, one entry per emitted pair.
id_pairs = []
group_names = []
sex_names = []
age_names = []
bmi_names = []
phq9_differences = []
# Unordered pairs already emitted (across all groups); set membership is O(1).
seen_pairs = set()

# Pair up the IDs that share the same (SEX, AGE, HE_BMI) combination.
for name, group in all_combined_to_group.groupby(['SEX', 'AGE', 'HE_BMI']):
    sex, age, bmi = name
    # De-duplicate the IDs (keeping first-seen order) so that a repeated ID can
    # neither pair with itself nor produce the same pair twice.
    ids = list(dict.fromkeys(group['ID'].tolist()))
    # combinations() yields every unordered pair once, in the same order the
    # nested loop over `ids` would first encounter it.
    for id_1, id_2 in combinations(ids, 2):
        pair_key = frozenset((id_1, id_2))
        if pair_key in seen_pairs:
            continue
        seen_pairs.add(pair_key)

        id_pairs.append((id_1, id_2))
        # str() keeps the label usable even if a group key is not a string.
        group_names.append('_'.join(map(str, name)))
        sex_names.append(sex)
        age_names.append(age)
        bmi_names.append(bmi)
        # Target value: rounded absolute difference of the two BP_PHQ_9 values.
        phq9_differences.append(round(abs(phq9_by_id.at[id_1] - phq9_by_id.at[id_2])))

# Float array, the dtype that repeated np.append on an empty array produced.
PHQ9_value = np.array(phq9_differences, dtype=float)

# Assemble one row per pair.
id_pairs_df = pd.DataFrame(id_pairs, columns=['ID_1', 'ID_2'])
id_pairs_df['group_id'] = group_names
id_pairs_df['SEX'] = sex_names
id_pairs_df['AGE'] = age_names
id_pairs_df['HE_BMI'] = bmi_names
# NOTE(review): `+` concatenates string-like IDs but adds numeric ones -- confirm
# the ID dtype if ID_COMBINED is meant to be a unique key.
id_pairs_df['ID_COMBINED'] = id_pairs_df['ID_1'] + id_pairs_df['ID_2']
id_pairs_df['PHQ9_abs'] = PHQ9_value
# 1 when the absolute difference reaches the threshold, otherwise 0.
id_pairs_df['Depression'] = (id_pairs_df['PHQ9_abs'] >= threshold).astype(int)

# Show up to the first 100 pairs and the overall shape.
print(id_pairs_df.head(100), id_pairs_df.shape)

Match ID_1 and ID_2 with their respective actigraphy data and create a synthetic dataset with abs(PHQ9P2 - PHQ9P1) as the target variable.

In [None]:
pam_synthetic = pd.DataFrame()

# PAXINTEN series of every ID, collected in a single groupby pass. Grouping by the
# plain label 'ID' (not the list ['ID']) keeps the group keys scalar, so they can
# be matched directly against the values stored in ID_1 / ID_2.
paxinten_by_id = {patient_id: patient_rows['PAXINTEN']
                  for patient_id, patient_rows in pam_combined.groupby('ID')}

# iterrows() yields one row per ID pair; iterating the DataFrame itself would
# yield its column labels, and indexing those with 'ID_1' raises a TypeError.
for pair_index, synthetic_patient in id_pairs_df.iterrows():
    first_id = synthetic_patient['ID_1']
    second_id = synthetic_patient['ID_2']
    # Skip pairs that lack PAXINTEN data for either member instead of silently
    # reusing the series left over from a previous pair.
    if first_id not in paxinten_by_id or second_id not in paxinten_by_id:
        continue
    data_patient_1 = paxinten_by_id[first_id]
    data_patient_2 = paxinten_by_id[second_id]

    # pam_synthetic = data_patient_1 - data_patient_2