## overview
- pipeline that finds the best suggestion, prototype

In [1]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import statistics
from datetime import datetime
from sklearn.model_selection import train_test_split
import random, sys, copy, os, json

In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Ensure deterministic algorithms
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Data

In [3]:
# df = pd.read_csv("data/filtered_model_data.csv")

In [4]:
# df["start_time"] = df["start_time"].astype('datetime64[ns]')
# df = df.sort_values(by=["patient_id", "start_time"])

Create training data

In [5]:
# given a row of sessions, take domain_ids and domain_scores, which are in string format separated by ",", and replace with a list of the values
def process_row(row):
    values_a = [int(x.strip()) for x in str(row['domain_ids']).split(',')]
    values_b = [float(x.strip()) for x in str(row['domain_scores']).split(',')]
    return values_a, values_b

In [6]:
# take in a dataframe of a patient's session, extract information useful for training
def create_training_data(data: pd.DataFrame):
    # Initialize variables
    session_row = [] # contents of a row (patient id, encoding, cur score, prev score, repeat)
    overall = [] # aggregate of everything (n sessions x 44)

    cur_score = np.empty((14)) # score for each session
    cur_score.fill(np.nan)
    prev_score = None

    seen = {} # dictionary for seen
    patient_id = data["patient_id"].iloc[0] # save patient_id

    # Sort data by session start time
    data = data.sort_values(by=["start_time"])

    # Process each row
    for idx, row in data.iterrows():
        domains, domain_scores = process_row(row)  # returns a list of domains : int and of domain_scores : float

        # Track repeat status and update scores
        repeat = False

        for j, domain in enumerate(domains):
            if domain not in seen:
                seen[domain] = True
            else:
                repeat = True
            
            cur_score[domain - 1] = domain_scores[j] # update score in the loop

        # Encode domains for this session
        domain_encoding = np.zeros(14)
        for domain in domains:
            domain_encoding[domain - 1] = 1
        
        

        # if the session does not contain the target domain or is the first (no prev score), continue in the loop without doing anything, do this before appending
        if prev_score is None:
            session_row = []
            prev_score = cur_score.copy()
            continue
        # assert np.sum(domain_encoding) != 1, "continue not working"

        # append everything in the row list
        session_row.append(patient_id)
        session_row.extend(domain_encoding.copy().tolist())
        session_row.extend(prev_score.copy().tolist())
        session_row.extend(cur_score.copy().tolist())
        session_row.append(repeat)
        assert len(session_row) == 44, "session row length weird"

        # append row to overall, reset
        overall.append(session_row)
        session_row = []
        prev_score = cur_score.copy()

    # Convert to numpy arrays
    if overall:
        overall = np.array(overall)
        assert len(overall.shape) == 2, "dimensions of overall wrong"
    else:
        # Handle case where scores is empty
        return pd.DataFrame(columns=["patient_id"] + ["domain %d encoding" % i for i in range(1, 15)] +
                                   ["domain %d score" % i for i in range(1, 15)] +
                                   ["domain %d target" % i for i in range(1, 15)] +
                                   ["repeat"])
    
        # Create column names
    column_names = (
        ["patient_id"]
        + [f"domain {i} encoding" for i in range(1, 15)]
        + [f"domain {i} score" for i in range(1, 15)]
        + [f"domain {i} target" for i in range(1, 15)]
        + ["repeat"]
    )

    # Create dataframe
    scores_df = pd.DataFrame(overall, columns=column_names)
    scores_df.reset_index(drop=True, inplace=True)
    return scores_df

In [7]:
# data = df.groupby("patient_id")[df.columns].apply(create_training_data).reset_index(drop=True)

In [8]:
# data.to_csv("data/next_step_data.csv", index=False)

In [9]:
data = pd.read_csv("data/next_step_data.csv")

In [10]:
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)
n_samples = 100000

## one sample for train, only to see if it learns that one example
train_data = train_data[:n_samples].copy()
test_data = test_data[:n_samples].copy()

In [11]:
score_columns = ["domain %d score" %i for i in range(1, 15)]
encoding_columns = ["domain %d encoding" %i for i in range(1, 15)]
target_columns = ["domain %d target" %i for i in range(1, 15)]
repeat_columns = ["repeat"]

In [12]:
# create missing indicator when given the score data
def create_missing_indicator(data):
    (l, w) = data.shape
    temp = np.zeros((l, w*2))
    for i in range(l):
        for d in range(w):
            p = data[i, d]
            # update output array
            if np.isnan(p):
                missing_ind = np.random.choice(2, 1)[0]
                temp[i, d*2] = missing_ind
                temp[i, d*2+1] = missing_ind
            else:
                temp[i, d*2] = p # score
                temp[i, d*2+1] = 1-p # 1-score
    return copy.deepcopy(temp)

In [13]:
# given a processed dataframe, return data and target tensors that can be put in the model
def create_model_data(data : pd.DataFrame):
    target = data[target_columns].copy().to_numpy() * data[encoding_columns].copy().to_numpy()
    data_scores = create_missing_indicator(data[score_columns].copy().to_numpy())
    final_data = np.hstack((data[encoding_columns].copy().to_numpy(), data_scores))
    return torch.from_numpy(final_data).float(), torch.from_numpy(target).float()

In [14]:
x_test, y_test = create_model_data(test_data)

Model

In [15]:
## input : 14 domain encodings + 14 domains (28 total features with missing indicator)
## output: 28 score (prediction for the scores after next domain)
class NN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        n_domains = 14
        
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_domains * 3, 100),
            torch.nn.Sigmoid(),
            torch.nn.Linear(100, n_domains)
        )

    def forward(self, x):
        return self.model(x)

# used for batch training
class customDataset(Dataset):
    def __init__(self, data, target):
        super().__init__()
        self.data = data
        self.target = target

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        return self.data[index, :], self.target[index, :]

In [16]:
model = NN()
model = torch.load("output/experiment4/model.pt")

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

Predict missing values

In [None]:
case_study_data_mp = create_missing_indicator(case_study_data[score_columns].copy().to_numpy()[-1, :])

In [None]:
mp_model.eval()
with torch.no_grad():
    mp_prediction = mp_model(torch.from_numpy(case_study_data_mp).float())

In [None]:
case_study_data

predict next step values

In [None]:
next_step = np.zeros(14)

# Select a random index to place the '1'
random_index = np.random.choice(14)
next_step[random_index] = 1
next_step

In [None]:
input = torch.cat((torch.from_numpy(next_step).float(), mp_prediction))

In [None]:
ns_model.eval()
with torch.no_grad():
    predicted_ns_score = ns_model(input)

In [None]:
predicted_ns_score_ = np.sum(predicted_ns_score.numpy() * next_step)

In [None]:
predicted_ns_score_

predict missing again

In [None]:
new_truth = case_study_data[score_columns].copy(deep=True).to_numpy()[-1, :]
new_truth[random_index] = predicted_ns_score_
new_truth = create_missing_indicator(new_truth)

In [None]:
mp_model.eval()
with torch.no_grad():
    mp_prediction_new = mp_model(torch.from_numpy(new_truth).float())

In [None]:
mp_prediction[::2]

In [None]:
mp_prediction_new[::2]

In [None]:
np.sum(mp_prediction_new[::2].numpy())/14 - np.sum(mp_prediction[::2].numpy())/ 14

masking

In [None]:
def missing(arr):
    assert(len(arr) == 2)
    # if two values are the same (the only three possible values pairs are (0, 0), (1, 1), and (0.5, 0.5))
    return arr[0] == arr[1] and (arr[0] == 0 or arr[0] == 1)

### Quantitative (overall baseline vs prediction view)
- compare our prediction to baseline in test set in a quantitative manner (rather than looking at an individual heatmap)

In [None]:
## take in dataframe, return known domain average for prediction, modified from baseline quant 11/04/2024
def known_domain_average(data : pd.DataFrame):
    """
    Imputes missing values (NaN) in a DataFrame with the average of the non-missing values in the same row.

    Parameters:
    - data (pd.DataFrame): The DataFrame to impute.

    Returns:
    - pd.DataFrame: The DataFrame with missing values imputed.
    """
    assert(data.shape[1] == 14 * 3) ## make sure that the dataframe is the right shape with encoding, current, and target

    # separate encoding, scores, and target (which we dont need so ignored in here)
    encoding = data.iloc[:, :14].copy()
    data_ = data.iloc[:, 14:28].copy()

    # Replace '0' with NaN to handle them as missing values
    data_ = data.replace(0, np.nan)
    # initialize the list we are going to use to store all known domain average
    known_domain_average_lst = []

    # Iterate over each row
    for index, row in data_.iterrows():
        # Calculate the mean of the non-NaN values in the row
        mean_value = row.mean()
        # if there are no known domains, we say known domain average is 0
        if np.isnan(mean_value):
            mean_value = 0
        # append the average onto the list
        known_domain_average_lst.append(mean_value)
        # in the encoding df, replace na with 
        encoding.loc[index] = encoding.loc[index].replace(1, mean_value)
    
    return encoding

In [None]:
def get_accuracy(y_pred, y_true, n, fn : str):
    diff = y_pred - y_true
    if fn == "mse":
        return np.sum(np.power(diff, 2)) / n
    elif fn == "mae":
        return np.sum(np.abs(diff)) / n

In [None]:
# get predictions from the model
def predict(data, model):
    model.eval()
    data_t = torch.tensor(data, dtype=torch.float32)
    with torch.no_grad():
        return model(data_t).clone().detach().numpy()

In [None]:
from scipy.stats import sem

accuracy_list_prediction_sq = []
accuracy_list_knownimputation_sq = []
accuracy_list_prediction_abs = []
accuracy_list_knownimputation_abs = []

for masked_number in range(14):
    ## create masked testing data so we can see how the model predicts and how imputation compares
    masked_test_data = test_data.copy() # make a copy of test data for masked
    masked_test_data[score_columns], n_missing, mask = generate_mask(test_data[score_columns].copy().to_numpy(), N=masked_number)
    assert n_missing == masked_number * test_data.shape[0], "n missing weird"
    assert n_missing == np.size(mask) - np.sum(mask), "n missing weird"

    # known domain average prediction for the next domains
    baseline_prediction = known_domain_average(masked_test_data).to_numpy()

    # predict
    masked_test_data_ = copy.deepcopy(masked_test_data[encoding_columns + score_columns])
    masked_test_data_ = masked_test_data_.to_numpy()
    test_data_scores = create_missing_indicator(masked_test_data_[:, -14:])
    masked_test_data_ = np.hstack((masked_test_data_[:, :14], test_data_scores))
    model_prediction = predict(masked_test_data_, model)
    model_prediction = np.multiply(model_prediction, test_data[encoding_columns].to_numpy()) # element wise multiply the two so we have the same form as baseline prediction
    # we only have the values we are predicting in the matrix, everything else is 0

    # get accuracy (abs)
    original = test_data[target_columns].copy().to_numpy() # ground truth
    original = np.multiply(original, test_data[encoding_columns].copy().to_numpy())# element wise multiply the two so we only have the scores we care about predicting
    assert original.shape == mask.shape and original.shape == model_prediction.shape and original.shape == baseline_prediction.shape
    n_predicting = np.sum(test_data[encoding_columns].copy().to_numpy())
    
    accuracy_list_prediction_abs.append(get_accuracy(model_prediction, original, n_predicting, "mae"))
    accuracy_list_knownimputation_abs.append(get_accuracy(baseline_prediction, original, n_predicting, "mae"))

    ## get accuracy (sq)
    accuracy_list_prediction_sq.append(get_accuracy(model_prediction, original, n_predicting, "mse"))
    accuracy_list_knownimputation_sq.append(get_accuracy(baseline_prediction, original, n_predicting, "mse"))


In [None]:
fig = plt.figure(figsize=(12, 12))
# plt.errorbar(range(0, 14), accuracy_list_prediction_sq, yerr=error_prediction_sq, label="model", linewidth=3)
# plt.errorbar(range(0, 14), accuracy_list_imputation_sq, yerr=error_imputation_sq, label="baseline")

x_values = range(0, 14)

plt.plot(x_values, accuracy_list_prediction_sq, label="model", marker="o")
plt.plot(x_values, accuracy_list_knownimputation_sq, label="baseline", marker="o")

for x, y1, y2 in zip(x_values, accuracy_list_prediction_sq, accuracy_list_knownimputation_sq):
    offset = 10

    label = "{:.4f}".format(y1)
    plt.annotate(label, # this is the text
                (x,y1), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center
    
    label = "{:.4f}".format(y2)
    plt.annotate(label, # this is the text
                (x,y2), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,-10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center



plt.xlabel("number of masked domains")
plt.ylabel("mean squared error")
plt.title("Mean squared error vs number of masked domains")
plt.legend()

In [None]:
fig = plt.figure(figsize=(12, 12))
# plt.errorbar(range(0, 14), accuracy_list_prediction_abs, yerr=error_prediction_abs, label="prediction", linewidth=3)
# plt.errorbar(range(0, 14), accuracy_list_imputation_abs, yerr=error_imputation_abs, label="imputation")
x_values = range(0, 14)

plt.plot(x_values, accuracy_list_prediction_abs, label="model", marker="o")
plt.plot(x_values, accuracy_list_knownimputation_abs, label="baseline", marker="o")

for x, y1, y2 in zip(x_values, accuracy_list_prediction_abs, accuracy_list_knownimputation_abs):

    label = "{:.4f}".format(y1)
    plt.annotate(label, # this is the text
                (x,y1), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center
    
    label = "{:.4f}".format(y2)
    plt.annotate(label, # this is the text
                (x,y2), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,-10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center
    

plt.xlabel("number of masked domains")
plt.ylabel("mean absolute error")
plt.title("Mean absolute erorr vs number of masked domains")
plt.legend()

### Visualization
- visualize individual patient's trajectory (ground truth vs prediction vs imputation?)
- look at overall quantitative picture

In [None]:
column_names = ["domain %d score" % i for i in range(1, 15)]
column_rename_dict = {}
for i in range(len(column_names)):
    column_rename_dict[column_names[i]] = i + 1

In [None]:
# takes in a matrix, returns a list mean along rows
def get_score(data):
    return np.mean(data, axis=1)

In [None]:
# get ground truth scores (overall)
GT_scores = get_score(case_study_data[column_names].to_numpy())

In [None]:
# get prediction scores
model.eval()
with torch.no_grad():
    test_data = case_study_data[encoding_columns + score_columns].to_numpy()
    test_data_scores = create_missing_indicator(test_data[:, -14:])
    test_data = np.hstack((test_data[:, :14], test_data_scores))
    predictions = model(torch.from_numpy(test_data).type(torch.float32)).detach().numpy()

In [None]:
n_missing = np.sum(case_study_data[encoding_columns].to_numpy())
target_ = np.multiply(case_study_data[target_columns].to_numpy(), case_study_data[encoding_columns].to_numpy()) # only the targets we want to predict
prediction_ = np.multiply(predictions, case_study_data[encoding_columns].to_numpy()) # only the scores we want to predict
mse_ = np.divide(np.sum(np.abs(target_ - prediction_)), n_missing)

In [None]:
mse_

In [None]:
plt.figure()
plt.scatter(range(case_study_data.shape[0]), np.divide(np.sum(np.abs(target_ - prediction_), axis=1), np.sum(case_study_data[encoding_columns].to_numpy(), axis=1)))

In [None]:
np.sum(case_study_data[encoding_columns].to_numpy(), axis=1)

In [None]:
case_study_data[encoding_columns]

In [None]:
df[df.patient_id == pid]

In [None]:
np.unique(df[df.patient_id == pid]["domain_ids"])

In [None]:
create_training_data(df[df.patient_id == pid])

In [None]:
# store model as 
torch.save(model, "model/next_step_1.pt")