In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from run_logistic_simulations import sim_data
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import seaborn as sns
import warnings


In [None]:
# True probability of a toxicity event at each dose level, as given by the assignment instructions.
true_toxic_prob_s1 = (0.25, 0.30, 0.50, 0.60, 0.70)  # scenario 1
true_toxic_prob_s2 = (0.01, 0.05, 0.20, 0.30, 0.50)  # scenario 2

# Dose levels taken from figure 6, in mg/m^2 per day.
doses = np.array((0.5, 1.0, 3.0, 5.0, 6.0))

# Number of simulated trials per scenario.
num_sims = 1_000

In [None]:
def run_simulation(true_toxic_probabilities, num_sims=1000, target_toxicity=0.33, num_followup_cohorts=11):
    """Simulate `num_sims` dose-finding trials and record the dose each one ends on.

    Every trial starts with one cohort drawn by `sim_data` at dose index 0. After
    each cohort, a logistic regression of `toxicity_event` on `doses` is refitted
    to all data collected so far in that trial, and the next cohort is given the
    dose level (from the module-level `doses` array) whose predicted toxicity
    probability is closest to `target_toxicity`.

    Parameters
    ----------
    true_toxic_probabilities : sequence of float
        True toxicity probabilities passed through to `sim_data`; also used to
        determine the "correct" dose index.
    num_sims : int, default 1000
        Number of independent trials to simulate.
    target_toxicity : float, default 0.33
        Toxicity probability the selected dose should be closest to.
    num_followup_cohorts : int, default 11
        Number of cohorts simulated after the initial cohort at dose index 0.

    Returns
    -------
    results : list
        Dose index recommended at the end of each simulated trial.
    true_label_idx : int
        Index of the entry of `true_toxic_probabilities` closest to
        `target_toxicity`.
    final_model : list
        Last logistic regression fitted in each trial, aligned with `results`.
        An entry is ``None`` if every refit in that trial saw only one outcome
        class, so no model could be fitted.
    """
    results = []
    final_model = []
    for _sim_run in tqdm(range(num_sims)):
        # Reset per trial so a model from a previous simulation can never leak
        # into this one (and so the name is always bound).
        clf = None
        data = sim_data(0, true_toxic_probabilities)
        next_dose_idx, fitted = _choose_next_dose(data, target_toxicity)
        if fitted is not None:
            clf = fitted

        for _cohort in range(num_followup_cohorts):
            new_data = sim_data(next_dose_idx, true_toxic_probabilities)
            data = pd.concat([data, new_data], axis=0, ignore_index=True)
            next_dose_idx, fitted = _choose_next_dose(data, target_toxicity)
            if fitted is not None:
                clf = fitted

        results.append(next_dose_idx)
        final_model.append(clf)
    true_label_idx = np.argmin(np.abs(np.array(true_toxic_probabilities) - target_toxicity))
    return results, true_label_idx, final_model


def _choose_next_dose(data, target_toxicity):
    """Return ``(next_dose_idx, model)`` for the trial data observed so far.

    `model` is ``None`` when all outcomes are identical, because a logistic
    regression cannot be fitted to a single class.
    """
    y = data['toxicity_event']
    if (y == 0).all():
        # No toxicity observed yet: jump to the highest dose level.
        return len(doses) - 1, None
    if (y == 1).all():
        # Only toxicities observed: fall back to the lowest dose level.
        return 0, None
    X = data['doses'].values.reshape(-1, 1)  # single feature -> column vector
    model = LogisticRegression(random_state=0).fit(X, y)
    predicted_prob_toxic = model.predict_proba(doses.reshape(-1, 1))[:, 1]
    return np.argmin(np.abs(predicted_prob_toxic - target_toxicity)), model

In [None]:
results, true_label_idx, final_model_s1 = run_simulation(true_toxic_prob_s1, num_sims=num_sims)
# Count how often each dose index was the final recommendation. `bincount` with
# `minlength` keeps one slot per dose level (zero if a dose was never chosen), so
# position i always refers to dose index i. `np.unique` drops unobserved doses,
# which shifts positions and makes indexing by `true_label_idx` unreliable.
counts = np.bincount(np.asarray(results, dtype=int), minlength=len(doses))
percent_class_predictions = counts / counts.sum()
percent_class_predictions

In [None]:
# Fraction of simulations whose final dose index equals `true_label_idx`.
# Computed directly from `results` so it does not depend on how the per-dose
# counts were laid out, and formatted with `%` so the value really is a percentage.
accuracy = np.mean(np.asarray(results) == true_label_idx)
print(f"Accuracy for scenario 1: {accuracy:.1%}")

In [None]:
results, true_label_idx, final_model_s2 = run_simulation(true_toxic_prob_s2, num_sims=num_sims)
# Count how often each dose index was the final recommendation. `bincount` with
# `minlength` keeps one slot per dose level (zero if a dose was never chosen), so
# position i always refers to dose index i. `np.unique` drops unobserved doses,
# which shifts positions and makes indexing by `true_label_idx` unreliable.
counts = np.bincount(np.asarray(results, dtype=int), minlength=len(doses))
percent_class_predictions = counts / counts.sum()
percent_class_predictions

In [None]:
# Fraction of simulations whose final dose index equals `true_label_idx`.
# Computed directly from `results` so it does not depend on how the per-dose
# counts were laid out, and formatted with `%` so the value really is a percentage.
accuracy = np.mean(np.asarray(results) == true_label_idx)
print(f"Accuracy for scenario 2: {accuracy:.1%}")

# Now we try the revised approach where we reweight the samples

In [None]:
true_toxic_probabilities = true_toxic_prob_s1

target_toxicity = 0.33  # toxicity probability the selected dose should be closest to
# Patients assumed per cohort when computing the expected toxicity count (the
# original code used 3) -- TODO confirm this matches what `sim_data` returns.
cohort_size = 3
# Tentative sample weights for cohorts collected during the reweighting phase.
# Both are still 1, so the reweighting currently does not change the fit; they are
# floats so fractional weights can be assigned later without a dtype change.
weight_if_expected = 1.0    # observed toxicity count matches the model's expectation
weight_if_unexpected = 1.0  # observed toxicity count differs from the expectation

results = []
final_model = []
for sim_run in tqdm(range(num_sims)):
    # Reset per simulation so a model from a previous run can never leak into this one.
    clf = None
    data = None
    next_dose_idx = 0  # every trial starts at the lowest dose level

    # Phase 1: eight unweighted cohorts (the initial cohort plus seven follow-ups).
    for sample in range(8):
        new_data = sim_data(next_dose_idx, true_toxic_probabilities)
        data = new_data if data is None else pd.concat([data, new_data], axis=0, ignore_index=True)
        X = data['doses'].values.reshape(-1, 1)  # only one feature so needs to be reshaped to a column vector
        y = data['toxicity_event']
        if (y == 0).all():
            next_dose_idx = len(doses) - 1  # no toxicity seen yet: go to the highest dose
        elif (y == 1).all():
            next_dose_idx = 0  # only toxicities seen: go to the lowest dose
        else:
            # `warm_start` was dropped: it has no effect on a freshly constructed estimator.
            clf = LogisticRegression(solver='lbfgs', random_state=0).fit(X, y)
            predicted_prob_toxic = clf.predict_proba(doses.reshape(-1, 1))[:, 1]
            next_dose_idx = np.argmin(np.abs(predicted_prob_toxic - target_toxicity))

    # initialize the weights to 1
    data['sample_weight'] = 1.0

    # Phase 2: four more cohorts, each weighted by whether its toxicity count matched
    # what the current model expected at the administered dose.
    for sample in range(8, 12):
        if clf is None:
            # No model could be fitted so far (all outcomes identical): nothing to compare against.
            expected_num_toxic = None
        else:
            # use the model to get the predicted probability of a toxic event given current dose
            predict_prob_next_3 = clf.predict_proba(doses[next_dose_idx].reshape(1, -1))[0][1]
            # Expected number of toxicities: the mean of Binomial(n, p) is n * p, so no
            # Monte Carlo sampling is needed.
            expected_num_toxic = int(round(cohort_size * predict_prob_next_3))
        # simulate the true data
        new_data = sim_data(next_dose_idx, true_toxic_probabilities)
        true_num_toxic = new_data['toxicity_event'].sum()
        if expected_num_toxic is not None and expected_num_toxic == true_num_toxic:
            new_data['sample_weight'] = weight_if_expected  # just guessing the weight here. Tentative.
        else:
            new_data['sample_weight'] = weight_if_unexpected
        data = pd.concat([data, new_data], axis=0, ignore_index=True)
        X = data['doses'].values.reshape(-1, 1)  # only one feature so needs to be reshaped to a column vector
        y = data['toxicity_event']
        # Same single-class guard as in phase 1; fitting on one class would raise.
        if (y == 0).all():
            next_dose_idx = len(doses) - 1
        elif (y == 1).all():
            next_dose_idx = 0
        else:
            clf = LogisticRegression(solver='lbfgs', random_state=0).fit(X, y, sample_weight=data['sample_weight'])
            predicted_prob_toxic = clf.predict_proba(doses.reshape(-1, 1))[:, 1]
            next_dose_idx = np.argmin(np.abs(predicted_prob_toxic - target_toxicity))
    results.append(next_dose_idx)
    # Only store real models: later cells call `predict_proba` on every entry of `final_model`.
    if clf is not None:
        final_model.append(clf)  # can be used for checking the fitted curve
true_label_idx = np.argmin(np.abs(np.array(true_toxic_probabilities) - target_toxicity))

In [None]:
# Display the trial data left over from the last simulation run of the reweighting
# loop above (all cohorts of that run, including the `sample_weight` column).
data

In [None]:
# Share of simulations that ended on each dose index. `bincount` with `minlength`
# keeps one slot per dose level (zero if a dose was never chosen), so position i
# always refers to dose index i; `np.unique` would silently drop unobserved doses.
counts = np.bincount(np.asarray(results, dtype=int), minlength=len(doses))
percent_class_predictions = counts / counts.sum()
percent_class_predictions

# Let's visualize what the model is doing when reweighting the samples

In [None]:
# For every stored final model, collect the predicted toxicity probability
# (second column of `predict_proba`) at each dose level.
dose_grid = doses.reshape(-1, 1)
predicted_probabilities_reweighted = [
    fitted.predict_proba(dose_grid)[:, 1] for fitted in final_model
]

In [None]:
# looking at average over all simulations
# Explicit axes plus labels/legend so the figure can be read on its own.
fig, ax = plt.subplots()
mean_predicted = np.mean(predicted_probabilities_reweighted, axis=0)
sns.lineplot(x=doses, y=mean_predicted, ax=ax, label='mean predicted (reweighted models)')
ax.scatter(doses, true_toxic_prob_s1, color='red', label='true probability (scenario 1)')
ax.set(
    xlabel='dose (mg/m^2 per day)',
    ylabel='probability of toxicity',
    title='Average fitted dose-toxicity curve over all simulations',
)
ax.legend();

In [None]:
# looking at average over all simulations
# NOTE(review): this cell draws the same averaged curve as the preceding cell;
# consider keeping only one of them.
fig, ax = plt.subplots()
mean_predicted = np.mean(predicted_probabilities_reweighted, axis=0)
sns.lineplot(x=doses, y=mean_predicted, ax=ax, label='mean predicted (reweighted models)')
ax.scatter(doses, true_toxic_prob_s1, color='red', label='true probability (scenario 1)')
ax.set(
    xlabel='dose (mg/m^2 per day)',
    ylabel='probability of toxicity',
    title='Average fitted dose-toxicity curve over all simulations',
)
ax.legend();

In [None]:
# Down-weight rows labelled 30 through 35 (`.loc` slices include both ends).
# The column is cast to float first: writing 0.7 into an integer column relies on
# implicit upcasting, which recent pandas versions deprecate.
data['sample_weight'] = data['sample_weight'].astype(float)
data.loc[30:35, 'sample_weight'] = 0.7

In [None]:
# Display `data` again; the preceding cell changed `sample_weight` for rows 30-35.
data

In [None]:
# Subset of `data` containing only the rows whose `doses` value is below 5.
reduced_sample_data = data.query("doses < 5")

In [None]:
# Alternative 1: drop the rows with doses >= 5 and fit without weights. Kept under
# its own name so it is not immediately overwritten by the weighted fit below.
clf_reduced = LogisticRegression(solver='lbfgs', random_state=0).fit(
    reduced_sample_data['doses'].values.reshape(-1, 1),
    reduced_sample_data['toxicity_event'],
)

# Alternative 2 (the model used by the following cells): keep every row but apply
# the adjusted `sample_weight` column. The earlier fit on `X`/`y` was removed: it
# used the same rows and weights, and its result was discarded.
# `warm_start` was dropped: it has no effect on a freshly constructed estimator.
clf = LogisticRegression(solver='lbfgs', random_state=0).fit(
    data['doses'].values.reshape(-1, 1),
    data['toxicity_event'],
    sample_weight=data['sample_weight'],
)

In [None]:
# Predicted probability of the positive class (second `predict_proba` column) at
# each dose level, from the most recently fitted `clf`.
new_pred = clf.predict_proba(doses.reshape(-1, 1))[:, 1]

In [None]:
# looking at individual simulations
sns.lineplot(x=doses, y=predicted_probabilities_reweighted[-1])
sns.lineplot(x=doses, y=new_pred, color='k')
plt.scatter(x=doses, y=true_toxic_prob_s1, color='red')