In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
import wandb
import statistics
from sklearn.model_selection import train_test_split
import random

Helper Functions

In [44]:
def process_row(row):
    """Parse one session row's comma-separated domain ids and domain scores.

    Reads ``row['domain_ids']`` and ``row['domain_scores']``, converts each
    value to ``str``, splits it on commas and converts every
    whitespace-stripped token to ``float``.

    Returns:
    - tuple: ``(ids, scores)``, two lists of floats in field order.
    """
    def _to_floats(field):
        # str() lets a field that holds a single number be parsed like text.
        return list(map(float, (token.strip() for token in str(field).split(','))))

    return _to_floats(row['domain_ids']), _to_floats(row['domain_scores'])

In [45]:
def create_training_data(data : pd.DataFrame, n_domains : int = 14):
    """Append carried-forward per-domain score columns to a session table.

    Rows are ordered by ``start_time_min``. A running score vector starts at
    all zeros; for every row, the domains listed in that row (parsed by
    ``process_row``) overwrite their slots and a snapshot of the vector is
    stored for the row. A domain that a row does not list therefore keeps the
    value from the most recent earlier row that listed it, and stays 0 until
    it first appears.

    Parameters:
    - data (pd.DataFrame): sessions with ``start_time_min``, ``domain_ids``
      and ``domain_scores`` columns. The caller's frame is not modified.
    - n_domains (int): number of domains. Ids are 1-based, so id ``d`` fills
      column ``"domain d score"``. Defaults to 14.

    Returns:
    - pd.DataFrame: the time-sorted rows with a fresh 0..n-1 index and
      ``n_domains`` extra columns ``"domain 1 score"`` .. ``"domain N score"``.
    """
    ordered = data.sort_values(by=["start_time_min"]).reset_index(drop=True)

    running = np.zeros(n_domains)
    snapshots = np.zeros((len(ordered), n_domains))
    for position, (_, row) in enumerate(ordered.iterrows()):
        domains, domain_scores = process_row(row)
        for j, domain in enumerate(domains):
            # NOTE: ids are expected to lie in 1..n_domains; an id of 0 would
            # wrap around to the last column through negative indexing.
            running[int(domain - 1)] = domain_scores[j]
        # Assigning into the row copies the values, so later updates to
        # ``running`` do not alter snapshots that were already stored.
        snapshots[position] = running

    score_columns = ["domain %d score" % d for d in range(1, n_domains + 1)]
    scores_df = pd.DataFrame(snapshots, columns=score_columns)
    return pd.concat([ordered, scores_df], axis=1)

In [46]:
def filter_nonzero_rows(df, max_zeros):
    """Keep only the rows of ``df`` that contain at most ``max_zeros`` zeros.

    A cell counts as a zero when it compares equal to 0. Surviving rows keep
    their original index labels and order.
    """
    within_limit = df.eq(0).sum(axis=1).le(max_zeros)
    return df[within_limit]

In [47]:
def missing(arr):
    """Return whether the first two elements of ``arr`` compare equal."""
    first, second = arr[0], arr[1]
    return first == second

Data processing

In [48]:
## read the session table, order it by patient and start time, then expand
## each patient's sessions into per-domain score columns
df = (
    pd.read_csv("data/filtered_ds.csv")
    .sort_values(by=["patient_id", "start_time_min"])
    .groupby("patient_id")
    .apply(create_training_data)
    .reset_index(drop=True)
)

  df = df.groupby("patient_id").apply(create_training_data).reset_index(drop=True)


In [49]:
## keep only the 14 per-domain score columns, and only the sessions in which
## none of them is zero (zero is what create_training_data leaves for a domain
## that has not been scored yet)
model_data = filter_nonzero_rows(
    df[["domain %d score" % domain for domain in range(1, 15)]],
    0,
)
## independent copy that serves as the source of prediction targets
ground_truth = copy.deepcopy(model_data)

Training process

In [50]:
## define model
class AE(torch.nn.Module):
    """Feed-forward regressor that predicts one score from the other scores.

    Despite the name, the network does not reconstruct its input: it maps
    ``n_domains`` features to a single value through three linear layers
    (``n_domains -> hidden1 -> hidden2 -> 1``), each followed by a sigmoid,
    so every prediction lies in the range [0, 1].

    Parameters:
    - n_domains (int): number of input features. Defaults to 13.
    - hidden1 (int): width of the first hidden layer. Defaults to 50.
    - hidden2 (int): width of the second hidden layer. Defaults to 25.
    """
    def __init__(self, n_domains=13, hidden1=50, hidden2=25):
        super().__init__()

        # Layers are created in input-to-output order and kept under the
        # ``model`` attribute, so parameter names stay ``model.<idx>.*``.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_domains, hidden1),
            torch.nn.Sigmoid(),
            torch.nn.Linear(hidden1, hidden2),
            torch.nn.Sigmoid(),
            torch.nn.Linear(hidden2, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        """Map a ``(batch, n_domains)`` float tensor to ``(batch, 1)`` predictions."""
        return self.model(x)


Bad pipe message: %s [b'\x84,\x86\xfa-\xc0\x80\xe4\xfan\x88g\x8fr\x06\x87l_\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00=\x00>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00\x9e\x00\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xba\x00\xbb\x00', b"\xbd\x00\xbe\x00\xbf\x00\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x13\x01\x13\x02\x13\x03\x13\x04\x13\x05\xc0\x01\xc0\x02\xc0\x03\xc0\x04\xc0\x05\xc0\x06\xc0\x07\xc0\x08\xc0\t\xc0\n\xc0\x0b\xc0\x0c\xc0\r\xc0\x0e\xc0\x0f\xc0\x10\xc0\x11\xc0\x12\xc0\x13\xc0\x14\xc0\x15\xc0\x16\xc0\x17\xc0\x18\xc0\x19\xc0#\xc0$\xc0%\xc0&\xc0

In [None]:
def _match_target(values, like):
    """Build a float32 target tensor, shaped like ``like`` when the sizes agree."""
    target = torch.tensor(values).type(torch.float32)
    # A (1,) target against a (1, 1) prediction is silently broadcast by the
    # loss (PyTorch warns about it); give both the same shape instead.
    if target.shape != like.shape and target.numel() == like.numel():
        target = target.reshape(like.shape)
    return target


def train_model(x_train, x_val,y_train, y_val, epochs, model, optimizer, loss_function, n_features=13):
    """Train ``model`` one sample at a time and track per-epoch mean losses.

    Every epoch performs one optimizer step per training sample, then
    evaluates every validation sample with gradients disabled.

    Parameters:
    - x_train, x_val: indexable collections of NumPy feature rows; each row is
      reshaped to ``(-1, n_features)`` before it is fed to the model.
    - y_train, y_val: indexable collections of targets, aligned by position
      with ``x_train`` / ``x_val``.
    - epochs (int): number of passes over the training samples.
    - model (torch.nn.Module): network to train; it is updated in place.
    - optimizer: torch optimizer built over ``model``'s parameters.
    - loss_function: callable ``(prediction, target) -> scalar tensor``.
    - n_features (int): number of features per sample. Defaults to 13.

    Returns:
    - losses (list of float): mean training loss of each epoch.
    - val_losses (list of float): mean validation loss of each epoch (NaN for
      every epoch when ``x_val`` is empty).
    - outputs (list of tuple): one ``(epoch_index, last_input, last_prediction)``
      entry per epoch; the prediction is detached from the autograd graph.
    - model: the same model object that was passed in.

    Raises:
    - ValueError: if ``x_train`` is empty.
    """
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one sample")

    outputs = []
    losses = []
    val_losses = []

    for epoch in range(epochs):
        epoch_loss = []
        val_epoch_loss = []

        ## training
        model.train()
        for i in range(len(x_train)):
            session_t = torch.from_numpy(x_train[i].reshape(-1, n_features)).float()
            reconstructed = model(session_t)

            loss = loss_function(reconstructed, _match_target(y_train[i], reconstructed))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Keep the scalar only, so the graph of this step can be freed.
            epoch_loss.append(loss.item())

        losses.append(statistics.mean(epoch_loss))
        # Record the index of this epoch (not the total epoch count) and a
        # detached prediction, so the stored history holds no autograd graph.
        outputs.append((epoch, session_t, reconstructed.detach()))

        ## validation
        model.eval()
        with torch.no_grad():
            for k in range(len(x_val)):
                val_t = torch.from_numpy(x_val[k].reshape(-1, n_features)).float()
                answer = model(val_t)
                val_loss = loss_function(answer, _match_target(y_val[k], answer))
                val_epoch_loss.append(val_loss.item())
        # statistics.mean rejects an empty list; report NaN when there is no
        # validation data rather than aborting the training run.
        val_losses.append(statistics.mean(val_epoch_loss) if val_epoch_loss else float("nan"))
    return losses, val_losses, outputs, model

In [None]:
## take in two data arrays and return predictions for both in a tuple
def get_predictions(training, validation, model):
    """Run ``model`` in eval mode on two datasets and return both predictions.

    Each input is converted to a float32 tensor before the forward pass, and
    gradients are disabled for both passes.

    Returns:
    - tuple: ``(training_predictions, validation_predictions)``.
    """
    model.eval()
    with torch.no_grad():
        train_pred = model(torch.tensor(training).type(torch.float32))
        val_pred = model(torch.tensor(validation).type(torch.float32))
    return train_pred, val_pred

In [None]:
def plot_mean_and_std(data, color_choice="blue", setting=""):
    """
    Plots the mean and standard deviation across multiple lists of data.

    Parameters:
    - data (list of lists): A list where each element is a list of numbers;
      all inner lists must have the same length.
    - color_choice (str): Matplotlib color used for the line and the shading.
    - setting (str): Name of the series (e.g. "Training"). It appears in the
      title and, when non-empty, prefixes the legend entries so that several
      series drawn on the same axes can be told apart.

    The function computes the mean and the (population) standard deviation at
    each position across the lists and draws them on the current axes as a
    line for the mean and a shaded band for the standard deviation. Because
    the title is set on every call, the last call on a given axes wins.
    """
    # Convert data to a NumPy array for easier manipulation
    data_array = np.array(data)

    # Calculate mean and standard deviation across the lists
    means = np.mean(data_array, axis=0)
    stds = np.std(data_array, axis=0)

    # Create the x-axis values
    x_values = np.arange(len(means))

    # Without a prefix, overlaid series would all be listed as "Mean" /
    # "Standard Deviation" in the legend.
    prefix = "%s " % setting if setting else ""

    # Plotting
    plt.plot(x_values, means, label=prefix + 'Mean', color=color_choice)  # Mean line
    plt.fill_between(x_values, means - stds, means + stds, color=color_choice, alpha=0.2, label=prefix + 'Standard Deviation')

    plt.title('Mean and Standard Deviation Plot of %s' %setting)
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.legend()
    plt.grid(True)

In [None]:
## train a fresh model over multiple independent runs
def multi_run(training_data, target_data, num_runs=3, epochs=10, lr=0.005):
    """Train a fresh ``AE`` model ``num_runs`` times on new random splits.

    Every run builds a new model and Adam optimizer, draws a new 80/20
    train/validation split, measures the loss of the untrained model, and
    then trains with ``train_model`` using an MSE loss.

    Parameters:
    - training_data: array of input features, one row per session.
    - target_data: array of targets aligned with ``training_data``.
    - num_runs (int): number of independent runs (at least 1). Defaults to 3.
    - epochs (int): training epochs per run. Defaults to 10.
    - lr (float): Adam learning rate. Defaults to 0.005.

    Returns:
    - x_train, x_val, y_train, y_val: the split used by the LAST run.
    - losses_2d, val_losses_2d (list of lists): one loss curve per run; each
      curve has ``epochs + 1`` points, the first being the pre-training loss.
    - predictions_train, predictions_val: predictions of the LAST trained
      model on the last split.

    Raises:
    - ValueError: if ``num_runs`` is smaller than 1, since there would be no
      split or model to return.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    losses_2d = []
    val_losses_2d = []

    # Validation using MSE Loss function
    loss_function = torch.nn.MSELoss()

    for _ in range(num_runs):
        model = AE()

        # Using an Adam Optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        x_train, x_val, y_train, y_val = train_test_split(training_data, target_data, test_size=0.20)

        # Loss of the untrained model: the epoch-0 point of each curve.
        # The predictions are already float32 tensors, so they go straight to
        # the loss instead of being wrapped in torch.tensor() again.
        with torch.no_grad():
            initial_pred_train = model(torch.tensor(x_train).type(torch.float32))
            zero_loss = loss_function(initial_pred_train, torch.tensor(y_train).type(torch.float32))

            initial_pred_val = model(torch.tensor(x_val).type(torch.float32))
            zero_loss_val = loss_function(initial_pred_val, torch.tensor(y_val).type(torch.float32))

        losses, val_losses, outputs, model = train_model(x_train, x_val, y_train, y_val, epochs, model, optimizer, loss_function)

        losses_2d.append([zero_loss.item()] + losses)
        val_losses_2d.append([zero_loss_val.item()] + val_losses)

    ## use the last trained model to get predictions
    predictions_train, predictions_val = get_predictions(x_train, x_val, model)

    return x_train, x_val, y_train, y_val, losses_2d, val_losses_2d, predictions_train, predictions_val

Individual domains

In [None]:
## run
def train_domain(target_domain : int, model_data=model_data, ground_truth=ground_truth):
    """Train models that predict one domain's score from all other columns.

    The column ``"domain <target_domain> score"`` of ``ground_truth`` becomes
    the target (as an ``(n, 1)`` array) and the same column is dropped from
    ``model_data`` to form the inputs. Returns whatever ``multi_run`` returns.
    """
    column = "domain %d score" % target_domain
    # Work on a copy of the target column so ground_truth is left untouched.
    labels = copy.deepcopy(ground_truth[column]).to_numpy().reshape(-1, 1)
    features = model_data.drop(columns=[column]).to_numpy()
    return multi_run(features, labels)

Assign level groups

In [None]:
def find_level(points, point):
    """Return the 1-based level of ``point`` for the thresholds in ``points``.

    ``points`` is scanned from its last element to its first; the level is the
    position (counted from the end, starting at 1) of the first threshold
    that ``point`` does not exceed. When ``point`` exceeds every threshold,
    "error" is printed and ``None`` is returned.
    """
    for level, threshold in enumerate(reversed(points), start=1):
        if point <= threshold:
            return level
    print("error")
    return None

In [None]:
## separate sessions into x number of levels based on the 13 domains
def level_assignment(data, num_levels=5):
    """Assign every row of ``data`` to one of ``num_levels`` equal-width levels.

    The rows are averaged, the span between the smallest and largest row
    average is cut into ``num_levels`` bands of equal width, and each row gets
    the 1-based number of the band its average falls into (1 = lowest band,
    ``num_levels`` = highest band).

    Parameters:
    - data: 2-D array-like with one row per session.
    - num_levels (int): number of bands (at least 1). Defaults to 5.

    Returns:
    - list: one level per row, in row order, as returned by ``find_level``.

    Raises:
    - ValueError: if ``num_levels`` is smaller than 1.
    """
    if num_levels < 1:
        raise ValueError("num_levels must be at least 1")

    # per-row average
    avg = np.mean(data, axis=1)

    # upper bounds of the bands, listed from the highest band down
    high = avg.max()
    low = avg.min()
    step = (high - low) / num_levels
    seps = [high - step * i for i in range(num_levels)]

    return [find_level(seps, p) for p in avg]

In [None]:
def results_domain(target_domain : int, model_data=model_data, ground_truth=ground_truth):
    """Train for one target domain and collect the per-session results.

    Parameters:
    - target_domain (int): number of the domain to predict.
    - model_data, ground_truth: frames forwarded unchanged to ``train_domain``.

    Returns:
    - df_with_levels (pd.DataFrame): one row per session, training rows first
      and validation rows after them. Columns: the input scores
      (``"non-target domain score k"``, numbered by position among the inputs
      and not by original domain id), ``"target domain"``, ``"level"`` and
      ``"predicted score"``.
    - training_loss, validation_loss: per-run loss curves from ``train_domain``.
    """
    # run training functions to get the results
    x_train, x_val, y_train, y_val, training_loss, validation_loss, predictions_train, predictions_val = train_domain(target_domain, model_data=model_data, ground_truth=ground_truth)

    # stack the splits, keeping the train-then-validation row order
    features = np.vstack((x_train, x_val))
    targets = np.vstack((y_train, y_val))

    # Levels are assigned on the stacked rows so that training and validation
    # sessions share the same thresholds; computing them per split would let
    # the same level number cover different score ranges in one table.
    levels = np.array(level_assignment(features)).reshape(-1, 1)

    predictions = np.vstack((predictions_train.numpy(), predictions_val.numpy()))

    # one wide array: inputs | target | level | prediction
    data_with_levels = np.hstack((features, targets, levels, predictions))

    # name the input columns from the actual number of input features
    column_names = ["non-target domain score %d" % i for i in range(1, features.shape[1] + 1)] + ["target domain", "level", "predicted score"]
    df_with_levels = pd.DataFrame(data_with_levels, columns=column_names)
    return df_with_levels, training_loss, validation_loss

Visualization

In [None]:
# Sanity check: (number of sessions that passed the filter, number of score columns).
model_data.shape

In [None]:
# 1-based number of the domain whose score is predicted from the other domains.
domain_n = 1

In [None]:
# Train the models for the chosen domain; keep the per-session results table
# and the per-run training / validation loss curves.
results, traing_loss, val_loss = results_domain(domain_n, model_data=model_data, ground_truth=ground_truth)

In [None]:
# Overlay the training and validation loss curves on one new figure. The loss
# curves are plain Python floats, so no torch context is needed here.
plt.figure()
plot_mean_and_std(traing_loss, "blue", "Training")
plot_mean_and_std(val_loss, "orange", "Validation")

In [None]:
# Distribution of the predicted scores for the selected domain, stacked by level.
ax = plt.figure().add_subplot()
ax.set_title("Predicting Domain %d" % domain_n)
sns.histplot(data=results, x="predicted score", bins=50, kde=True, hue="level", palette = "tab10", multiple="stack", ax=ax)

In [None]:
# Distribution of the ground-truth scores for the selected domain, stacked by level.
ax = plt.figure().add_subplot()
ax.set_title("GT Domain %d" % domain_n)
sns.histplot(data=results, x="target domain", bins=50, kde=True, hue="level", palette = "tab10", multiple="stack", ax=ax)

In [None]:
# Share of sessions that falls into each level.
plt.figure()
results.groupby("level").size().plot.pie(autopct="%.2f")
plt.title("Level Distribution")
# Derive the session count from the data so the caption cannot go stale.
plt.figtext(.8, .8, "total {:,} sessions".format(len(results)))

In [None]:
# 2x2 summary figure for the selected domain.
fig = plt.figure(figsize=(12, 12))

# top-left: training and validation loss curves
plt.subplot(2, 2, 1)
plot_mean_and_std(traing_loss, "blue", "Training")
plot_mean_and_std(val_loss, "orange", "Validation")

# top-right: share of sessions per level
plt.subplot(2, 2, 2)
results.groupby("level").size().plot.pie(autopct="%.2f")
plt.title("Level Distribution")
# Derive the session count from the data so the caption cannot go stale.
plt.figtext(.8, .8, "total {:,} sessions".format(len(results)))

# bottom-left: distribution of predicted scores, stacked by level
plt.subplot(2, 2, 3)
plt.title("Predicting Domain %d" % domain_n)
sns.histplot(data=results, x="predicted score", bins=50, kde=True, hue="level", palette = "tab10", multiple="stack")

# bottom-right: distribution of ground-truth scores, stacked by level
plt.subplot(2, 2, 4)
plt.title("GT Domain %d" % domain_n)
sns.histplot(data=results, x="target domain", bins=50, kde=True, hue="level", palette = "tab10", multiple="stack")

fig.suptitle("Domain %d" % domain_n, size=14)
fig.tight_layout()
# leave room at the top for the figure-level title
fig.subplots_adjust(top=0.90)