In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from run_logistic_simulations import sim_data
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import seaborn as sns
import warnings


In [None]:
# True toxicity probability at each dose level, as given by the assignment instructions.
true_toxic_prob_s1 = (0.25, 0.3, 0.5, 0.6, 0.7)  # scenario 1
true_toxic_prob_s2 = (0.01, 0.05, 0.2, 0.3, 0.5)  # scenario 2

# Dose levels taken from figure 6, in mg/m^2 per day.
doses = np.array([0.5, 1.0, 3.0, 5.0, 6.0])

# Number of simulations passed to the simulation functions below.
num_sims = 5_000

In [None]:
def original_simulation(true_toxic_probabilities, num_sims=5000,
                        target_toxicity=0.33, num_followup_cohorts=11):
    """Simulate the baseline adaptive dose-selection procedure.

    Each simulation starts with data generated at dose index 0, then repeatedly
    (a) chooses the next dose index from all data gathered so far and
    (b) generates new data at that index via ``sim_data``.

    Dose choice rule (applied after every batch of data):
      * every ``toxicity_event`` is 0 -> dose index 4,
      * every ``toxicity_event`` is 1 -> dose index 0,
      * otherwise fit a logistic regression of ``toxicity_event`` on ``doses``
        and pick the index in the module-level ``doses`` array whose predicted
        toxicity probability is closest to ``target_toxicity``.

    Parameters
    ----------
    true_toxic_probabilities : sequence of float
        Passed straight through to ``sim_data``.
        NOTE(review): presumably the true toxicity probability per dose level;
        confirm against ``run_logistic_simulations.sim_data``.
    num_sims : int, default 5000
        Number of independent simulations to run.
    target_toxicity : float, default 0.33
        Predicted toxicity probability the chosen dose should be closest to.
    num_followup_cohorts : int, default 11
        Number of additional ``sim_data`` draws after the initial one.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with columns:
        ``selected_dose`` - the dose index chosen after the last draw;
        ``doses_sampled`` - ``np.ndarray`` holding index 0 followed by every
        subsequently chosen index. Its last entry is ``selected_dose`` itself,
        for which no data is generated inside this function.
    """
    dose_grid = doses.reshape(-1, 1)  # loop-invariant prediction grid

    def _choose_next_dose_idx(data, no_toxicity_idx):
        # Decide the next dose index from all data collected so far.
        X = data['doses'].values.reshape(-1, 1)  # single feature -> column vector
        y = data['toxicity_event']
        # A logistic regression cannot be fit when only one class is present,
        # so the two single-class cases are handled with fixed fallbacks.
        if (y == 0).all():
            return no_toxicity_idx
        if (y == 1).all():
            return 0
        clf = LogisticRegression(random_state=0).fit(X, y)
        predicted_prob_toxic = clf.predict_proba(dose_grid)[:, 1]
        return np.argmin(np.abs(predicted_prob_toxic - target_toxicity))

    results = []
    dose_idx_history = []
    for _sim in tqdm(range(num_sims)):
        data = sim_data(0, true_toxic_probabilities)
        next_dose_idx = _choose_next_dose_idx(data, no_toxicity_idx=4)
        selected_dose_idx = [0, next_dose_idx]
        for _cohort in range(num_followup_cohorts):
            new_data = sim_data(next_dose_idx, true_toxic_probabilities)
            data = pd.concat([data, new_data], axis=0, ignore_index=True)
            next_dose_idx = _choose_next_dose_idx(data, no_toxicity_idx=4)
            selected_dose_idx.append(next_dose_idx)
        results.append(next_dose_idx)  # final choice of this simulation
        dose_idx_history.append(np.array(selected_dose_idx))
    results_df = pd.DataFrame({"selected_dose": results, "doses_sampled": dose_idx_history})
    return results_df

In [None]:
# Baseline design on scenario 1. Uses the shared `num_sims` setting so both
# scenarios are simulated the same number of times.
results_df_s1 = original_simulation(true_toxic_prob_s1, num_sims=num_sims)

# Index of the dose whose true toxicity probability is closest to the 0.33 target
# (evaluates to 1 for scenario 1); derived so it stays in sync with the scenario.
true_dose_idx_s1 = np.argmin(np.abs(np.array(true_toxic_prob_s1) - 0.33))
accuracy_s1 = np.mean(results_df_s1['selected_dose'] == true_dose_idx_s1)

# Per simulation: fraction of entries in `doses_sampled` equal to the selected dose.
results_df_s1['proportion_selected_dose_sampled'] = [
    np.mean(sampled == selected)
    for sampled, selected in zip(results_df_s1['doses_sampled'], results_df_s1['selected_dose'])
]

In [None]:
# Report the scenario-1 accuracy as a percentage rounded to two decimals.
print(f"Accuracy for scenario 1: {round(accuracy_s1 * 100, 2)}%")

In [None]:
# Baseline design on scenario 2.
results_df_s2 = original_simulation(true_toxic_prob_s2, num_sims=num_sims)

# Index of the dose whose true toxicity probability is closest to the 0.33 target
# (evaluates to 3 for scenario 2); derived so it stays in sync with the scenario.
true_dose_idx_s2 = np.argmin(np.abs(np.array(true_toxic_prob_s2) - 0.33))
accuracy_s2 = np.mean(results_df_s2['selected_dose'] == true_dose_idx_s2)

# Per simulation: fraction of entries in `doses_sampled` equal to the selected dose.
results_df_s2['proportion_selected_dose_sampled'] = [
    np.mean(sampled == selected)
    for sampled, selected in zip(results_df_s2['doses_sampled'], results_df_s2['selected_dose'])
]
print(f"Accuracy for scenario 2: {round(accuracy_s2 * 100, 2)}%")

In [None]:
def run_simulation(true_toxic_probabilities, num_sims=5000,
                   target_toxicity=0.33, num_followup_cohorts=11):
    """Simulate the modified adaptive dose-selection procedure.

    Identical in structure to the baseline procedure, except for the fallback
    used right after the *initial* draw: when every initial ``toxicity_event``
    is 0 the next dose index is 2 (instead of 4). In all later steps the
    all-zero fallback is index 4.

    Dose choice rule (applied after every batch of data):
      * every ``toxicity_event`` is 0 -> index 2 after the initial draw,
        index 4 afterwards,
      * every ``toxicity_event`` is 1 -> dose index 0,
      * otherwise fit a logistic regression of ``toxicity_event`` on ``doses``
        and pick the index in the module-level ``doses`` array whose predicted
        toxicity probability is closest to ``target_toxicity``.

    Parameters
    ----------
    true_toxic_probabilities : sequence of float
        Passed straight through to ``sim_data``.
        NOTE(review): presumably the true toxicity probability per dose level;
        confirm against ``run_logistic_simulations.sim_data``.
    num_sims : int, default 5000
        Number of independent simulations to run.
    target_toxicity : float, default 0.33
        Predicted toxicity probability the chosen dose should be closest to.
    num_followup_cohorts : int, default 11
        Number of additional ``sim_data`` draws after the initial one.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with columns:
        ``selected_dose`` - the dose index chosen after the last draw;
        ``doses_sampled`` - ``np.ndarray`` holding index 0 followed by every
        subsequently chosen index. Its last entry is ``selected_dose`` itself,
        for which no data is generated inside this function.
    """
    dose_grid = doses.reshape(-1, 1)  # loop-invariant prediction grid

    def _choose_next_dose_idx(data, no_toxicity_idx):
        # Decide the next dose index from all data collected so far.
        X = data['doses'].values.reshape(-1, 1)  # single feature -> column vector
        y = data['toxicity_event']
        # A logistic regression cannot be fit when only one class is present,
        # so the two single-class cases are handled with fixed fallbacks.
        if (y == 0).all():
            return no_toxicity_idx
        if (y == 1).all():
            return 0
        clf = LogisticRegression(random_state=0).fit(X, y)
        predicted_prob_toxic = clf.predict_proba(dose_grid)[:, 1]
        return np.argmin(np.abs(predicted_prob_toxic - target_toxicity))

    results = []
    dose_idx_history = []
    for _sim in tqdm(range(num_sims)):
        data = sim_data(0, true_toxic_probabilities)
        # Only this first decision uses index 2 as the all-zero fallback.
        next_dose_idx = _choose_next_dose_idx(data, no_toxicity_idx=2)
        selected_dose_idx = [0, next_dose_idx]
        for _cohort in range(num_followup_cohorts):
            new_data = sim_data(next_dose_idx, true_toxic_probabilities)
            data = pd.concat([data, new_data], axis=0, ignore_index=True)
            next_dose_idx = _choose_next_dose_idx(data, no_toxicity_idx=4)
            selected_dose_idx.append(next_dose_idx)
        results.append(next_dose_idx)  # final choice of this simulation
        dose_idx_history.append(np.array(selected_dose_idx))
    results_df = pd.DataFrame({"selected_dose": results, "doses_sampled": dose_idx_history})
    return results_df

In [None]:
# Modified design on scenario 1. Uses the shared `num_sims` setting so both
# scenarios are simulated the same number of times.
results_df_s1 = run_simulation(true_toxic_prob_s1, num_sims=num_sims)

# Index of the dose whose true toxicity probability is closest to the 0.33 target
# (evaluates to 1 for scenario 1); derived so it stays in sync with the scenario.
true_dose_idx_s1 = np.argmin(np.abs(np.array(true_toxic_prob_s1) - 0.33))
accuracy_s1 = np.mean(results_df_s1['selected_dose'] == true_dose_idx_s1)

# Per simulation: fraction of entries in `doses_sampled` equal to the selected dose.
results_df_s1['proportion_selected_dose_sampled'] = [
    np.mean(sampled == selected)
    for sampled, selected in zip(results_df_s1['doses_sampled'], results_df_s1['selected_dose'])
]

In [None]:
print(f"Accuracy for scenario 1: {round(accuracy_s1 * 100, 2)}%")

# Keep only simulations in which the selected dose accounts for less than 90%
# of the entries in `doses_sampled`, then show how the selections are distributed.
below_threshold = results_df_s1['proportion_selected_dose_sampled'] < 0.9
reduced_df = results_df_s1[below_threshold]
print(f"rows removed: {len(results_df_s1) - len(reduced_df)}")
reduced_df['selected_dose'].value_counts(normalize=True)

In [None]:
# Modified design on scenario 2.
results_df_s2 = run_simulation(true_toxic_prob_s2, num_sims=num_sims)

# Index of the dose whose true toxicity probability is closest to the 0.33 target
# (evaluates to 3 for scenario 2); derived so it stays in sync with the scenario.
true_dose_idx_s2 = np.argmin(np.abs(np.array(true_toxic_prob_s2) - 0.33))
accuracy_s2 = np.mean(results_df_s2['selected_dose'] == true_dose_idx_s2)

# Per simulation: fraction of entries in `doses_sampled` equal to the selected dose.
results_df_s2['proportion_selected_dose_sampled'] = [
    np.mean(sampled == selected)
    for sampled, selected in zip(results_df_s2['doses_sampled'], results_df_s2['selected_dose'])
]

In [None]:
# Report the scenario-2 accuracy as a percentage rounded to two decimals.
print(f"Accuracy for scenario 2: {round(accuracy_s2 * 100, 2)}%")

In [None]:
# Keep only simulations in which the selected dose accounts for less than 90%
# of the entries in `doses_sampled`, then show how the selections are distributed.
below_threshold = results_df_s2['proportion_selected_dose_sampled'] < 0.9
reduced_df = results_df_s2[below_threshold]
print(f"rows removed: {len(results_df_s2) - len(reduced_df)}")
reduced_df['selected_dose'].value_counts(normalize=True)

In [None]:
# Compare the proportion of `doses_sampled` equal to the selected dose between
# simulations that selected dose index 3 and those that did not (scenario 2).
# Both histograms share the same axes and the same bin edges over [0, 1], and
# are semi-transparent, so the bars are directly comparable and neither hides
# the other.
fig, ax = plt.subplots()
hist_kwargs = dict(bins=7, range=(0, 1), alpha=0.6, ax=ax)
results_df_s2.query("selected_dose == 3")['proportion_selected_dose_sampled'].plot.hist(label='correct dose selected', **hist_kwargs)
results_df_s2.query("selected_dose != 3")['proportion_selected_dose_sampled'].plot.hist(label='incorrect dose selected', **hist_kwargs)
ax.set_xlabel("Proportion of Selected Dose Sampled in a Given Simulation")
ax.set_ylabel("Number of simulations")
ax.set_title("Scenario 2: sampling of the selected dose by selection outcome")
ax.legend();

In [None]:
# Inspect one scenario-1 simulation whose selected dose was not index 1.
# The row is taken by position (last match): looking up a fixed label such as
# 4995 raises KeyError whenever that particular simulation selected dose 1 or
# fewer simulations were run.
results_df_s1.query("selected_dose != 1").tail(1)

In [None]:
# Mean `proportion_selected_dose_sampled` among simulations that selected
# dose index 1 (scenario 1) and dose index 3 (scenario 2), printed in that order.
for scenario_results, correct_idx in ((results_df_s1, 1), (results_df_s2, 3)):
    correct_rows = scenario_results[scenario_results['selected_dose'] == correct_idx]
    print(correct_rows['proportion_selected_dose_sampled'].mean())

Possible ideas:
- Hard-code what proportion of the samples is drawn from our selected dose.
- Use cross-validation.
- 

# Testing ideas below

In [None]:
true_toxic_probabilities = true_toxic_prob_s1


def _next_dose_and_model(X, y, no_toxicity_idx):
    """Return ``(next dose index, fitted model or None)`` for the data seen so far.

    All outcomes 0 -> ``no_toxicity_idx``; all outcomes 1 -> index 0; otherwise a
    logistic regression is fit and the index in ``doses`` whose predicted
    toxicity probability is closest to 0.33 is returned together with the model.
    """
    if (y == 0).all():
        return no_toxicity_idx, None
    if (y == 1).all():
        return 0, None
    model = LogisticRegression(solver='lbfgs', random_state=0).fit(X, y)
    toxic_prob = model.predict_proba(doses.reshape(-1, 1))[:, 1]
    return np.argmin(np.abs(toxic_prob - 0.33)), model


results = []
final_model = []
proportion_sampled_of_chosen_dose = []
chosen_doses = []
for sim_run in tqdm(range(3000)):
    # Initial draw at dose index 0.
    data = sim_data(0, true_toxic_probabilities)
    X = data['doses'].values.reshape(-1, 1)  # single feature -> column vector
    y = data['toxicity_event']
    # First decision falls back to index 2 (previously 4) when every outcome is 0.
    next_dose_idx, fitted = _next_dose_and_model(X, y, no_toxicity_idx=2)
    if fitted is not None:
        clf = fitted
    chosen_doses_current_sim = [doses[next_dose_idx]]

    for sample in range(11):
        data = pd.concat(
            [data, sim_data(next_dose_idx, true_toxic_probabilities)],
            axis=0,
            ignore_index=True,
        )
        X = data['doses'].values.reshape(-1, 1)
        y = data['toxicity_event']
        next_dose_idx, fitted = _next_dose_and_model(X, y, no_toxicity_idx=4)
        if fitted is not None:
            clf = fitted
        chosen_doses_current_sim.append(doses[next_dose_idx])

    chosen_doses.append(chosen_doses_current_sim)
    results.append(next_dose_idx)
    # Share of all generated rows whose dose value equals the finally chosen dose.
    proportion_sampled_of_chosen_dose.append(np.mean(X == doses[next_dose_idx]))
    # `clf` is the most recent successful fit; kept for checking the fitted curve.
    # NOTE(review): if a simulation never reaches the fitting branch, this is the
    # model left over from an earlier simulation (or undefined on the very first).
    final_model.append(clf)

true_label_idx = np.argmin(np.abs(np.array(true_toxic_probabilities) - 0.33))
results_df = pd.DataFrame({
    "proportion_sampled_of_chosen_dose": proportion_sampled_of_chosen_dose,
    "selected_dose": results,
})

In [None]:
# Relative frequency of each selected dose index across the simulations in `results_df`.
results_df['selected_dose'].value_counts(normalize=True)

In [None]:
# Distribution of the proportion of rows sampled at the chosen dose, split by
# whether the chosen dose is the one closest to the 0.33 target
# (`true_label_idx`, computed from the scenario in use rather than hard-coded).
# Shared bin edges over [0, 1] and transparency keep the two groups comparable
# and visible.
fig, ax = plt.subplots()
hist_kwargs = dict(bins=9, range=(0, 1), alpha=0.6, ax=ax)
is_correct = results_df['selected_dose'] == true_label_idx
results_df.loc[is_correct, 'proportion_sampled_of_chosen_dose'].plot.hist(label='correct dose selected', **hist_kwargs)
results_df.loc[~is_correct, 'proportion_sampled_of_chosen_dose'].plot.hist(label='incorrect dose selected', **hist_kwargs)
ax.set_xlabel("Proportion of samples drawn at the chosen dose")
ax.set_ylabel("Number of simulations")
ax.legend();

In [None]:
# Selected-dose shares among simulations where less than 90% of the rows were
# sampled at the chosen dose.
results_df.query('proportion_sampled_of_chosen_dose < 0.90')['selected_dose'].value_counts(normalize=True)

In [None]:
# Share of simulations ending on each distinct dose index (ordered by index).
# The unique values themselves are not needed, so only the counts are kept;
# this avoids rebinding `_`, which IPython uses for the last cell output.
counts = np.unique(results, return_counts=True)[1]
percent_class_predictions = counts / counts.sum()
percent_class_predictions

In [None]:
# For every stored model, the predicted probability of class 1 at each dose level.
predicted_probabilities_reweighted = [
    fitted_model.predict_proba(doses.reshape(-1, 1))[:, 1]
    for fitted_model in final_model
]

In [None]:
# looking at average over all simulations
sns.lineplot(x=doses, y=np.mean(predicted_probabilities_reweighted, axis=0))
plt.scatter(x=doses, y=true_toxic_prob_s1, color='red')

In [None]:
# looking at average over all simulations
sns.lineplot(x=doses, y=np.mean(predicted_probabilities_reweighted, axis=0))
plt.scatter(x=doses, y=true_toxic_prob_s1, color='red')

In [None]:
# Give every row a weight before down-weighting rows labelled 30..35 (inclusive).
# Creating the column only through the partial `.loc` assignment would leave
# every other row with a NaN weight, which is then passed to `fit(sample_weight=...)`.
# NOTE(review): 1.0 is assumed to be the intended weight for the remaining rows.
data['sample_weight'] = 1.0
data.loc[30:35, 'sample_weight'] = 0.7

In [None]:
# Subset of the last simulation's data restricted to dose values below 5.
dose_below_5 = data['doses'] < 5
reduced_sample_data = data[dose_below_5]

In [None]:
# Three alternative refits on the data left over from the last simulation.
# NOTE(review): each assignment overwrites `clf`, so only the third fit
# (all rows, weighted by `sample_weight`) is what later cells see.
# NOTE(review): `warm_start=True` is set on freshly constructed estimators;
# presumably it has no effect here since there is no earlier fit to reuse - confirm.
# Fit 1: uses the `X` / `y` left by the simulation cell, weighted by `sample_weight`.
clf = LogisticRegression(solver='lbfgs',random_state=0, warm_start=True).fit(X, y, sample_weight=data['sample_weight'])
# Fit 2: unweighted fit on the rows with doses < 5 only.
clf = LogisticRegression(solver='lbfgs',random_state=0, warm_start=True).fit(reduced_sample_data['doses'].values.reshape(-1, 1), reduced_sample_data['toxicity_event'])
# Fit 3: all rows of `data`, weighted by `sample_weight`.
clf = LogisticRegression(solver='lbfgs',random_state=0, warm_start=True).fit(data['doses'].values.reshape(-1, 1), data['toxicity_event'], sample_weight=data['sample_weight'])

In [None]:
# Predicted probability of class 1 at each dose level from the most recently fitted `clf`.
new_pred = clf.predict_proba(doses.reshape(-1, 1))[:, 1]

In [None]:
# looking at individual simulations
sns.lineplot(x=doses, y=predicted_probabilities_reweighted[-1])
sns.lineplot(x=doses, y=new_pred, color='k')
plt.scatter(x=doses, y=true_toxic_prob_s1, color='red')

In [None]:
# Display the data frame left over from the last simulation (now including `sample_weight`).
data