In [None]:
import pandas as pd
import numpy as np
from helpers import *
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Route INFO-and-above log records to log.log; filemode='w' starts a fresh
# file on every run.
logging.basicConfig(
    level=logging.INFO,
    filename='log.log',
    filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

In [None]:
# Read the 2014 and 2016 releases of the PAM and ALL SAS datasets.
pam14_df, pam16_df = (
    pd.read_sas(sas_path)
    for sas_path in ("PAM/hn14_pam.sas7bdat", "PAM/hn16_pam.sas7bdat")
)
all14_df, all16_df = (
    pd.read_sas(sas_path)
    for sas_path in ("ALL/hn14_all.sas7bdat", "ALL/hn16_all.sas7bdat")
)

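The frames loaded above can be used for exploratory data analysis (EDA); the next cell keeps only the survey columns needed for this notebook.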

In [None]:
# Columns kept from both survey releases. The identifier column is selected
# as "ID" in the 2016 frame and as "id" in the 2014 frame.
survey_columns = ["year", "sex", "age", "BP_PHQ_9",
                  "mh_PHQ_S", "HE_BMI", "mh_stress", "EQ5D"]
all16_df = all16_df[["ID"] + survey_columns]
all14_df = all14_df[["id"] + survey_columns]

`process_data` replaces every NaN value with the mean value, except in the `sex` column, which is handled separately.

In [None]:
# Run the shared cleaning helper on each survey frame, then show the shapes
# of all four frames as the cell output.
all14_df = process_data(all14_df)
all16_df = process_data(all16_df)
(all14_df.shape, all16_df.shape, pam14_df.shape, pam16_df.shape)

Convert the raw feature values into meaningful categories (BMI range, sex name, age range).

In [None]:
# BP_PHQ_9 and mh_PHQ_S are left untouched here: mapping them through
# Depression_Severity_ / Depression_Severity is currently disabled.

# BMI is only present in the survey (ALL) frames.
for survey_frame in (all14_df, all16_df):
    survey_frame['HE_BMI'] = survey_frame['HE_BMI'].apply(BMI_range)

# Sex and age are mapped in all four frames.
for frame in (pam14_df, pam16_df, all14_df, all16_df):
    frame['sex'] = frame['sex'].apply(Sex_name)
for frame in (pam14_df, pam16_df, all14_df, all16_df):
    frame['age'] = frame['age'].apply(Age_range)

In [None]:
# Preview the first five rows of the processed 2014 survey frame.
print(all14_df.head(n=5))

In [None]:
# Preview the first five rows of the processed 2016 survey frame.
print(all16_df.head(n=5))

Convert all column names to uppercase and concatenate the 2014 and 2016 dataframes.

In [None]:
def func(df):
    """Return ``df`` with every column name converted to upper case."""
    return df.rename(columns=str.upper)


def _decode_if_bytes(value):
    """Decode ``bytes`` values as UTF-8; return any other value unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


# Upper-case the column names so both years share the same schema.
pam14_df, pam16_df, all14_df, all16_df = (
    func(frame) for frame in (pam14_df, pam16_df, all14_df, all16_df)
)

# Stack the 2014 and 2016 releases with a fresh 0..n-1 index.
pam_combined = pd.concat([pam14_df, pam16_df], ignore_index=True)
all_combined = pd.concat([all14_df, all16_df], ignore_index=True)
pam_combined = pam_combined.drop(columns='MOD_D')

# Some ID values arrive as bytes; normalise them to str.
pam_combined['ID'] = pam_combined['ID'].apply(_decode_if_bytes)
all_combined['ID'] = all_combined['ID'].apply(_decode_if_bytes)

In [None]:
# Sanity check: first rows, shape, and total number of missing cells.
pam_preview = pam_combined.head()
pam_missing_total = pam_combined.isna().sum().sum()
print(pam_preview, pam_combined.shape, pam_missing_total)

In [None]:
# Inspect the AGE column: its distinct values and how many are missing.
age_column = all_combined['AGE']
print(age_column.unique())
print(age_column.isna().sum())

# Sanity check on the whole frame: first rows, shape, total missing cells.
all_preview = all_combined.head()
all_missing_total = all_combined.isna().sum().sum()
print(all_preview, all_combined.shape, all_missing_total)

In [None]:
pam_combined_grouped = pam_combined.groupby('ID')

fig, ax = plt.subplots(figsize=(10, 6))

# Only the first ID group is drawn: the loop exits after one iteration.
for participant_id, participant_rows in pam_combined_grouped:
    ax.plot(participant_rows.index, participant_rows['PAXINTEN'], label=participant_id)
    print(participant_id)
    break

ax.set_title('PAXINTEN by ID')
ax.set_xlabel('Timestamp')
ax.set_ylabel('PAXINTEN')
ax.legend()
plt.show()

Create combinations (ordered pairs) of IDs within groups defined by SEX -> AGE -> HE_BMI.

In [None]:
# Build every ordered pair (ID_1, ID_2) of distinct participants that share the
# same SEX / AGE / HE_BMI group and that both have actigraphy data, together
# with the difference of their MH_PHQ_S scores.

# Actigraphy rows grouped by participant; later cells reuse pam_grouped to
# fetch each participant's PAXINTEN series.
pam_grouped = pam_combined.groupby('ID')
# IDs that have at least one actigraphy row; all other participants are skipped.
ids_with_actigraphy = set(pam_grouped.groups)
# First MH_PHQ_S value recorded for each ID, built once so that the pair loop
# does not have to re-scan all_combined for every generated pair.
phq_s_by_id = (
    all_combined.drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['MH_PHQ_S']
    .to_dict()
)

id_pairs = []
group_names = []
sex_names = []
age_names = []
bmi_names = []
phq_differences = []

for name, group in all_combined.groupby(['SEX', 'AGE', 'HE_BMI']):
    sex_label, age_label, bmi_label = name
    group_label = sex_label + '_' + age_label + '_' + bmi_label

    # Keep only the members of this group that have actigraphy data.
    valid_ids = [
        participant_id
        for participant_id in group['ID'].tolist()
        if participant_id in ids_with_actigraphy
    ]

    # Ordered pairs: both (a, b) and (b, a) are generated, so d_PHQ takes
    # both signs for every unordered pair.
    for id_1 in valid_ids:
        for id_2 in valid_ids:
            if id_1 == id_2:
                continue
            id_pairs.append((id_1, id_2))
            group_names.append(group_label)
            sex_names.append(sex_label)
            age_names.append(age_label)
            bmi_names.append(bmi_label)
            phq_differences.append(int(phq_s_by_id[id_1] - phq_s_by_id[id_2]))

# Float array, built once instead of growing it with np.append per pair.
PHQ_value = np.array(phq_differences, dtype=float)

# Convert the list of ID pairs to a DataFrame
# Goes from -27 to 27 so with absolute from 0 - 27 possible thresholds = [12, 15, 18, 20, 22, 24]
threshold = 15
id_pairs_df = pd.DataFrame(id_pairs, columns=['ID_1', 'ID_2'])
id_pairs_df['group_id'] = group_names
id_pairs_df['SEX'] = sex_names
id_pairs_df['AGE'] = age_names
id_pairs_df['HE_BMI'] = bmi_names
id_pairs_df['ID_COMBINED'] = id_pairs_df['ID_1'] + id_pairs_df['ID_2']
id_pairs_df['d_PHQ'] = PHQ_value
# Binary target: 1 when the absolute score difference reaches the threshold.
id_pairs_df['Depression'] = (id_pairs_df['d_PHQ'].abs() >= threshold).astype(int)
# Print the DataFrame containing pairs of IDs
print(id_pairs_df.head(100), id_pairs_df.shape)

Match ID_1 and ID_2 with their respective actigraphy data and create a synthetic dataset that uses the absolute PHQ score difference between the two participants, abs(PHQSP1 - PHQSP2), as the basis for the target variable.

In [None]:
# For every distinct d_PHQ value, print the value followed by the per-column
# non-null counts of the pairs that have it.
id_pairs_grouped = id_pairs_df.groupby('d_PHQ')
for d_phq_value, pairs_with_value in id_pairs_grouped:
    print(d_phq_value)
    print(pairs_with_value.count())

In [None]:
# Build one synthetic actigraphy recording per ID pair by averaging the two
# participants' PAXINTEN series, then discard pairs whose recording is all
# zero.

samples_per_participant = 10080  # number of samples for a single patient
synthetic_array = np.zeros((id_pairs_df.shape[0], samples_per_participant))
id_combined = []
for number, (id_1, id_2) in enumerate(zip(id_pairs_df['ID_1'], id_pairs_df['ID_2'])):
    data_participant_1 = pam_grouped.get_group(id_1)['PAXINTEN'].to_numpy()
    data_participant_2 = pam_grouped.get_group(id_2)['PAXINTEN'].to_numpy()
    # Synthetic recording = element-wise mean of the two series.
    synthetic_array[number] = (data_participant_1 + data_participant_2) / 2
    id_combined.append(id_1 + id_2)
    logging.info(f"Participant_1 {id_1} and Participant_2 {id_2} added with {synthetic_array[number]}")

# A row whose maximum is exactly 0 is discarded.
keep_rows = synthetic_array.max(axis=1) != 0
synthetic_array = synthetic_array[keep_rows]

# Drop the SAME rows from the pair table. Deleting rows only from the array
# would shift every later recording onto the wrong ID pair and leave the
# trailing pairs without data.
id_pairs_df = id_pairs_df.loc[keep_rows].reset_index(drop=True)
kept_ids = [combined for combined, keep in zip(id_combined, keep_rows) if keep]

# One object cell per pair, each holding that pair's 1-D recording.
actigraphy_column = np.empty(synthetic_array.shape[0], dtype=object)
for row in range(synthetic_array.shape[0]):
    actigraphy_column[row] = synthetic_array[row]

pam_synthetic = pd.DataFrame({'ID': kept_ids, 'ACTIGRAPHY_DATA': actigraphy_column})
id_pairs_df['ACTIGRAPHY_DATA'] = pam_synthetic['ACTIGRAPHY_DATA']

In [None]:
import pywt
import numpy as np
from sklearn.preprocessing import StandardScaler
# Define functions to compute statistical features

def compute_energy(coeff):
    """Return the energy of ``coeff``: the sum of its squared entries."""
    squared = coeff ** 2
    total_energy = np.sum(squared)
    return total_energy

def compute_mean(coeff):
    """Return the arithmetic mean of the entries of ``coeff``."""
    average = np.mean(coeff)
    return average

def compute_std(coeff):
    """Return the standard deviation of ``coeff`` (numpy default, ddof=0)."""
    spread = np.std(coeff)
    return spread

def compute_entropy(coeff):
    """Return the Shannon entropy (in bits) of the normalised magnitudes of ``coeff``.

    The absolute values of ``coeff`` are divided by their sum to form a
    probability vector ``p``; the result is ``-sum(p * log2(p + eps))``.

    An all-zero (or empty) input has no magnitude to distribute, so 0.0 is
    returned instead of the NaN that dividing by a zero sum would produce.
    """
    magnitudes = np.abs(coeff)
    total = np.sum(magnitudes)
    if total == 0:
        return 0.0
    p = magnitudes / total
    return -np.sum(p * np.log2(p + np.finfo(float).eps))  # eps to avoid log(0)

def compute_features(data):
    """Extract wavelet-domain summary statistics from a 1-D signal.

    ``data`` is decomposed with ``pywt.wavedec(data, 'db1')``. For every
    coefficient array returned, four statistics are appended in this order:
    energy, mean, standard deviation, entropy. The resulting flat vector is
    then standardised with ``StandardScaler``.

    Parameters
    ----------
    data : array-like
        Signal passed straight to ``pywt.wavedec``.

    Returns
    -------
    numpy.ndarray
        1-D array holding four values per coefficient array.
    """
    coeffs = pywt.wavedec(data, 'db1')

    # Four statistics per decomposition level, in a fixed order.
    features = []
    for coeff in coeffs:
        features.extend((
            compute_energy(coeff),
            compute_mean(coeff),
            compute_std(coeff),
            compute_entropy(coeff),
        ))
    features = np.array(features)

    # NOTE(review): the scaler is fitted on this single sample's own feature
    # vector (reshaped to one column), so energies, means, stds and entropies
    # are standardised against each other rather than across participants.
    # Behaviour kept as-is; confirm this is intended.
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, 1)).flatten()

    return features

# Compute one feature vector per ID pair from its synthetic actigraphy signal
# and store the vectors in a new FEATURES column.
feature_list = [
    compute_features(actigraphy_signal)
    for actigraphy_signal in id_pairs_df['ACTIGRAPHY_DATA']
]
id_pairs_df['FEATURES'] = feature_list


In [None]:
# Preview the first five pairs, including the new FEATURES column.
print(id_pairs_df.head(n=5))

In [None]:
# Remove the actigraphy column before the frame is previewed again.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second run is a no-op instead of raising KeyError.
id_pairs_df = id_pairs_df.drop(columns='ACTIGRAPHY_DATA', errors='ignore')
print(id_pairs_df.head())

In [None]:
# Write the pair table to a CSV file named after the threshold used for the
# Depression label.
# NOTE(review): FEATURES holds one numpy array per row; CSV output presumably
# stores each array as its string form, which a reader must parse back.
# Confirm that downstream code expects this.
output_path = f'data/data_threshold_{threshold}.csv'
id_pairs_df.to_csv(output_path, index=False)