## overview
- Prototype of a pipeline that finds the best suggestion.

In [29]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import statistics
from datetime import datetime
from sklearn.model_selection import train_test_split
import random, sys, copy, os, json

In [30]:
# Seed every random number generator the notebook uses (PyTorch, NumPy and the
# stdlib `random` module) with the same value so that reruns are repeatable.
SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

# Request deterministic cuDNN behaviour and disable its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Data

In [31]:
# Load the pre-filtered session table. Later cells read the columns
# patient_id, start_time, domain_ids and domain_scores from it.
df = pd.read_csv("data/filtered_model_data.csv")

In [32]:
# Parse the start times as datetimes, then order the sessions chronologically
# within each patient.
df = (
    df.assign(start_time=df["start_time"].astype("datetime64[ns]"))
      .sort_values(by=["patient_id", "start_time"])
)

Create training data

In [33]:
# Parse one session row: `domain_ids` and `domain_scores` are stored as
# comma-separated text; return them as (list of ints, list of floats).
def process_row(row):
    def _tokens(field):
        # split the field's text on commas and trim the whitespace around each piece
        return [piece.strip() for piece in str(row[field]).split(',')]

    domain_ids = list(map(int, _tokens('domain_ids')))
    scores = list(map(float, _tokens('domain_scores')))
    return domain_ids, scores

In [None]:
# take in a dataframe of one patient's sessions, extract information useful for training
def create_training_data(data: pd.DataFrame):
    """Build one training row per session (except the first) for a single patient.

    Sessions are walked in ``start_time`` order while a running vector with the
    latest score of each domain is maintained (NaN = no score yet). For every
    session one domain index is drawn uniformly at random; that single draw
    decides both the one-hot encoding and the ``repeat`` flag of the row, so the
    two always describe the same domain.

    Args:
        data: sessions of one patient with the columns ``patient_id``,
            ``start_time`` (timestamps), ``domain_ids`` and ``domain_scores``
            (the last two are parsed by ``process_row``).

    Returns:
        DataFrame with 45 columns: ``patient_id``, 14 ``domain i encoding``
        columns, 14 ``domain i score`` columns (scores before the session),
        14 ``domain i target`` columns (scores after the session), ``repeat``
        and ``start_time`` (``Timestamp.timestamp()`` of the session). Only the
        column names are returned when the patient has fewer than two sessions.
        Rows pass through ``np.array``, so all columns share one dtype.
    """
    n_domains = 14
    domain_numbers = range(1, n_domains + 1)
    column_names = (
        ["patient_id"]
        + [f"domain {i} encoding" for i in domain_numbers]
        + [f"domain {i} score" for i in domain_numbers]
        + [f"domain {i} target" for i in domain_numbers]
        + ["repeat"]
        + ["start_time"]
    )

    patient_id = data["patient_id"].iloc[0]  # save patient_id
    data = data.sort_values(by=["start_time"])  # process sessions chronologically

    cur_score = np.full(n_domains, np.nan)  # latest known score per domain
    prev_score = None  # scores as they were before the current session
    overall = []  # one list of 45 values per emitted session

    for _, row in data.iterrows():
        domains, domain_scores = process_row(row)  # list of domain ids : int, list of scores : float

        # Random 0-based domain index. It is drawn for every session (also the
        # skipped first one) so the random stream advances once per row.
        domain = np.random.choice(n_domains, 1)[0]

        # Fold this session's scores into the running score vector (ids are 1-based).
        # A separate loop variable is used on purpose: reusing `domain` here would
        # overwrite the random draw and make the encoding disagree with `repeat`.
        for j, session_domain in enumerate(domains):
            cur_score[session_domain - 1] = domain_scores[j]

        # The first session only provides the initial "previous" scores.
        if prev_score is None:
            prev_score = cur_score.copy()
            continue

        # The drawn domain is a repeat when it already had a score before this session.
        repeat = not np.isnan(prev_score[domain])

        # One-hot encoding of the drawn domain.
        domain_encoding = np.zeros(n_domains)
        domain_encoding[domain] = 1

        session_row = [
            patient_id,
            *domain_encoding.tolist(),  # encoding
            *prev_score.tolist(),       # score
            *cur_score.tolist(),        # target
            repeat,
            row["start_time"].timestamp(),
        ]
        assert len(session_row) == 45, "session row length incorrect"

        overall.append(session_row)
        prev_score = cur_score.copy()

    # No usable session: return an empty frame that still has the expected columns.
    if not overall:
        return pd.DataFrame(columns=column_names)

    overall = np.array(overall)
    assert len(overall.shape) == 2, "dimensions of overall wrong"

    scores_df = pd.DataFrame(overall, columns=column_names)
    scores_df.reset_index(drop=True, inplace=True)
    return scores_df

In [35]:
# Run create_training_data once per patient and stack the per-patient frames into one table.
# NOTE(review): the output saved under this cell (a debug print followed by an AttributeError
# about `.isna`) does not match the current code, which calls np.isnan; it looks stale, so
# re-run the cell to refresh it.
data = df.groupby("patient_id")[df.columns].apply(create_training_data).reset_index(drop=True)

here [0.429   nan   nan   nan   nan   nan   nan   nan   nan   nan   nan   nan
   nan   nan]
nan


AttributeError: 'numpy.float64' object has no attribute 'isna'

In [None]:
# Column-name groups of the training table; domains are numbered 1 through 14.
score_columns = [f"domain {i} score" for i in range(1, 15)]        # scores before the session
encoding_columns = [f"domain {i} encoding" for i in range(1, 15)]  # one-hot domain encoding
target_columns = [f"domain {i} target" for i in range(1, 15)]      # scores after the session
repeat_columns = ["repeat"]
time_columns = ["start_time"]

In [None]:
# Display the assembled training table (one row per session after each patient's first).
data

In [None]:
# keep only the sessions that have exactly `n_missing` missing (NaN) scores
def filter_n_missing(df, n_missing):
    """Return the rows of `df` with exactly `n_missing` NaN entries in the score columns.

    Only the columns named in `score_columns` are inspected; every other
    column is ignored when counting. Row order and index are preserved.
    """
    nan_per_row = df[score_columns].isna().sum(axis=1)
    keep = nan_per_row == n_missing
    return df[keep]

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Cap each split at its first n_samples rows and copy the result so the two
# frames are independent of `data`.
n_samples = 100000
train_data = train_data.iloc[:n_samples].copy()
test_data = test_data.iloc[:n_samples].copy()

In [None]:
# expand a (rows x domains) score matrix into (rows x 2*domains) model features
def create_missing_indicator(data):
    """Encode every score as a pair of features.

    A known score p becomes the pair (p, 1 - p). A missing score (NaN) becomes
    the pair (b, b), where b is a random 0 or 1 drawn with np.random.choice.
    All generators are re-seeded with 42 at the start of each call, so the same
    input always yields the same output (and the global random state is reset).

    `data` must be a 2-D array; the result has twice as many columns and
    contains no NaN.
    """
    # Re-seed so the random fill below is identical on every call.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(42)

    n_rows, n_cols = data.shape
    features = np.zeros((n_rows, 2 * n_cols))
    # ndenumerate walks the matrix row by row, i.e. in the same order as a
    # nested row/column loop, which keeps the sequence of random draws unchanged.
    for (r, c), score in np.ndenumerate(data):
        first, second = 2 * c, 2 * c + 1
        if np.isnan(score):
            fill = np.random.choice(2, 1)[0]
            features[r, first] = fill
            features[r, second] = fill
        else:
            features[r, first] = score       # score
            features[r, second] = 1 - score  # 1-score
    assert not np.isnan(features).any(), "nans exists!!!"
    return features.copy()

In [None]:
# turn a processed training table into (score features, targets) numpy arrays
def create_model_data(data : pd.DataFrame):
    """Return the model's score features and its targets for the rows of `data`.

    The targets are the target columns multiplied element-wise by the encoding
    columns, so entries whose encoding is 0 become 0 (or stay NaN when the
    target itself is NaN). The score features are the score columns expanded
    by `create_missing_indicator`.
    """
    target_values = data[target_columns].copy().to_numpy()
    encoding_values = data[encoding_columns].copy().to_numpy()
    target = target_values * encoding_values

    raw_scores = data[score_columns].copy().to_numpy()
    data_scores = create_missing_indicator(raw_scores)
    return data_scores, target

Model

In [None]:
## input : 14 domain-encoding values followed by 28 score features (two per domain) = 42 features
## output: 14 values, one per domain
## copied from next_step.py, which was used to train the model
class NN(torch.nn.Module):
    """Fully connected network: 42 inputs -> 100 sigmoid units -> 14 outputs."""

    def __init__(self):
        super().__init__()
        n_domains = 14
        in_features, hidden_units, out_features = 3 * n_domains, 100, n_domains

        # Keep the attribute name `model` and the layer order: the parameter
        # names (model.0.*, model.2.*) depend on both.
        layers = [
            torch.nn.Linear(in_features, hidden_units),
            torch.nn.Sigmoid(),
            torch.nn.Linear(hidden_units, out_features),
        ]
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        output = self.model(x)
        return output

# Dataset wrapper used for batch training: pairs each feature row with its target row
class customDataset(Dataset):
    """Index-aligned view over a 2-D feature array and a 2-D target array."""

    def __init__(self, data, target):
        super().__init__()
        self.data, self.target = data, target

    def __len__(self):
        # one sample per row of the feature array
        n_samples = self.data.shape[0]
        return n_samples

    def __getitem__(self, index):
        features = self.data[index, :]
        labels = self.target[index, :]
        return features, labels

In [None]:
# initialize the model, then replace it with the object stored in model.pt (mapped onto the CPU)
# NOTE(review): the NN() instance is discarded as soon as torch.load rebinds `model`. Later cells
# call model(x), so model.pt presumably stores the whole pickled module rather than a
# state_dict; confirm what was saved.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# Recent PyTorch releases default to weights_only=True, which may reject a fully pickled
# module; TODO confirm against the installed version.
model = NN()
model = torch.load("output/experiment4/model.pt", map_location=torch.device('cpu'))

In [None]:
# build the model input: encoding columns first, score features after, as one float32 tensor
def add_encoding(scores : np.ndarray, encoding : np.ndarray):
    """Horizontally stack `encoding` and `scores` and return the result as a float tensor."""
    stacked = np.hstack((encoding, scores))
    as_tensor = torch.from_numpy(stacked)
    return as_tensor.float()

In [None]:
# run the model on x without tracking gradients and compare the result with y
def predict(model, x, y):
    """Return (predictions as a numpy array, MSE loss, mean absolute error).

    `y` is reshaped to the shape of the predictions before both error
    measures are computed; loss and error are returned as Python floats.
    """
    mse = torch.nn.MSELoss()
    with torch.no_grad():
        predictions = model(x)
        expected = y.reshape(predictions.shape)
        loss_value = mse(predictions, expected).clone().item()
        mae_value = torch.mean(torch.abs(predictions - expected)).clone().item()
        return predictions.clone().numpy(), loss_value, mae_value

In [None]:
# bar chart of the average score change per domain; mode is used in the plot title
def plot_average_improvements(mode, cur_score, encoding, prev_score):
    """Plot, per column, the mean of (cur_score - prev_score) over the entries where encoding == 1.

    Columns in which `encoding` has no 1 at all are left out of the chart.
    Bars are labelled with their value (4 decimals), the x tick labels are the
    1-based column numbers, and the y-axis is fixed to [-0.1, 0.5]. The figure
    is shown with plt.show(); nothing is returned.
    """
    # Differences count only where the encoding selects the entry.
    selected = encoding == 1
    deltas = np.where(selected, cur_score - prev_score, 0)

    # Per-column sum of the selected differences and number of selected entries.
    per_column_total = np.sum(deltas, axis=0)
    per_column_count = np.sum(encoding, axis=0)

    # Average only over columns that were selected at least once.
    has_entries = per_column_count > 0
    averages = per_column_total[has_entries] / per_column_count[has_entries]
    column_indices = np.where(has_entries)[0]

    fig, ax = plt.subplots(figsize=(8, 6))
    tick_labels = [f"{i+1}" for i in column_indices]
    bars = ax.bar(range(len(averages)), averages, tick_label=tick_labels)
    ax.bar_label(bars, fmt='%.4f', label_type='edge')
    ax.set_ylim(-0.1, 0.5)

    title_s = "%s Data Domain Improvement Averages" % (mode)
    ax.set_xlabel("Domains", fontsize=12)
    ax.set_ylabel("Average Difference", fontsize=12)
    ax.set_title(title_s, fontsize=16)
    fig.tight_layout()
    plt.show()

In [None]:
# mean change (current - previous) over the entries selected by `encoding`
def overall_avg_improvement(cur_score, prev_score, encoding):
    """Average of encoding * cur_score - encoding * prev_score over the sum of `encoding`.

    NaNs in either score array are treated as 0. When `encoding` sums to 0,
    "no sessions" is printed and 0 is returned.
    """
    cur_score = np.nan_to_num(cur_score, nan=0)
    prev_score = np.nan_to_num(prev_score, nan=0)

    total = np.sum(encoding)
    if total == 0:
        print("no sessions")
        return 0
    return np.sum(encoding * cur_score - encoding * prev_score) / total

In [None]:
# recompute the repeat flag from the encoding columns, walking the rows in time order
def random_assignment(data):
    """Return a copy of `data` sorted by start_time with a freshly computed repeat column.

    A row's domain is the encoding column holding its maximum value. The row
    gets repeat = 0 the first time that domain appears and 1 on every later
    appearance. The old repeat column is dropped and the new one is appended
    as the last column; the frame passed in is not modified.
    """
    data = data.sort_values(by="start_time")

    seen_domains = set()  # encoding columns that already occurred
    repeat_flags = []
    for _, row in data.iterrows():
        domain = row[encoding_columns].idxmax()
        repeat_flags.append(1 if domain in seen_domains else 0)
        seen_domains.add(domain)

    repeat_name = repeat_columns[0]
    data = data.drop(repeat_name, axis=1)
    data[repeat_name] = repeat_flags
    return data

Calculate Ground Truth

### prepare data

In [None]:
# Split the test sessions by their repeat flag; copies keep the subsets independent of test_data.
ground_truth_test_data_repeat = test_data.loc[test_data["repeat"] == 1].copy()
ground_truth_test_data_nonrepeat = test_data.loc[test_data["repeat"] == 0].copy()

In [None]:
# Bucket the repeat sessions by how many of their score columns are NaN.
# NOTE: despite "n_zeros" in the name, the key counts missing (NaN) scores, not zeros.
ground_truth_test_data_n_zeros_repeat = dict() ## maps number of missing domain scores (0-13) -> sessions with exactly that many
for n in range(14):
    tmp = filter_n_missing(ground_truth_test_data_repeat, n_missing=n)
    ground_truth_test_data_n_zeros_repeat[n] = tmp.copy() # copy so the bucket is independent of the source frame

In [None]:
# Bucket the non-repeat sessions by how many of their score columns are NaN.
# NOTE: despite "n_zeros" in the name, the key counts missing (NaN) scores, not zeros.
ground_truth_test_data_n_zeros_nonrepeat = dict() ## maps number of missing domain scores (0-13) -> sessions with exactly that many
for n in range(14):
    tmp = filter_n_missing(ground_truth_test_data_nonrepeat, n_missing=n)
    ground_truth_test_data_n_zeros_nonrepeat[n] = tmp.copy() # copy so the bucket is independent of the source frame

### plot preparations

In [None]:
# Result holders for the repeat subset.
ground_truth_prediction_dict_repeat = {}  # number of missing scores -> prediction array
ground_truth_avg_improvement_lst_repeat = []  # one average improvement per number of missing scores

In [None]:
# loop through the number of missing domain scores (0-13) within the repeat subset
for n in range(14):
    tmp = ground_truth_test_data_n_zeros_repeat[n] # set tmp to the data used for this iteration
    x_tmp, y_tmp = create_model_data(tmp) # create scores with missing indicators and target

    encoding = tmp[encoding_columns].copy().to_numpy() # encoding
    tmp_single = add_encoding(x_tmp, encoding) # add encoding on x_tmp
    prediction, loss, mae = predict(model, tmp_single, torch.from_numpy(y_tmp).float()) # loss and mae are not used below
    
    ground_truth_prediction_dict_repeat[n] = prediction
    # improvement = predicted score minus previous score, averaged over the encoded entries (NaNs count as 0)
    ground_truth_avg_improvement_lst_repeat.append(overall_avg_improvement(prediction, ground_truth_test_data_n_zeros_repeat[n][score_columns].to_numpy(), encoding))

In [None]:
# Entry n of the list belongs to sessions with n missing scores, so reversing it
# lines the values up with 1..14 known domains on the x-axis.
fig, ax = plt.subplots()
ax.plot(range(1, 15), ground_truth_avg_improvement_lst_repeat[::-1])
ax.set_xlabel("number of known domains")
ax.set_ylabel("average best improvement in score")
ax.set_title("average best improvement for random repeats")
plt.show()

In [None]:
# Result holders for the non-repeat subset.
ground_truth_prediction_dict_nonrepeat = {}  # number of missing scores -> prediction array
ground_truth_avg_improvement_lst_nonrepeat = []  # one average improvement per number of missing scores

In [None]:
# loop through the number of missing domain scores (0-13) within the non-repeat subset
for n in range(14):
    tmp = ground_truth_test_data_n_zeros_nonrepeat[n] # set tmp to the data used for this iteration
    x_tmp, y_tmp = create_model_data(tmp) # create scores with missing indicators and target

    encoding = tmp[encoding_columns].copy().to_numpy() # encoding
    tmp_single = add_encoding(x_tmp, encoding) # add encoding on x_tmp
    prediction, loss, mae = predict(model, tmp_single, torch.from_numpy(y_tmp).float()) # loss and mae are not used below
    
    ground_truth_prediction_dict_nonrepeat[n] = prediction
    # improvement = predicted score minus previous score, averaged over the encoded entries (NaNs count as 0)
    ground_truth_avg_improvement_lst_nonrepeat.append(overall_avg_improvement(prediction, ground_truth_test_data_n_zeros_nonrepeat[n][score_columns].to_numpy(), encoding))

In [None]:
# Entry n of the list belongs to sessions with n missing scores, so reversing it
# lines the values up with 1..14 known domains on the x-axis.
fig, ax = plt.subplots()
ax.plot(range(1, 15), ground_truth_avg_improvement_lst_nonrepeat[::-1])
ax.set_xlabel("number of known domains")
ax.set_ylabel("average best improvement in score")
ax.set_title("average best improvement for random nonrepeats")
plt.show()
## TODO: double check that this matches the average

In [None]:
# Sanity check: report how many test sessions fall into each bucket and confirm that
# every row in bucket n really has n NaN scores (the second print shows True when it does).
for n in range(14):
    tmp = filter_n_missing(test_data, n_missing=n)
    print("# of missing = %d, # of sessions = %d" % (n, tmp.shape[0]))
    print(np.isnan(tmp[score_columns]).sum(axis=1).sum() == n * tmp.shape[0])