In [194]:
import numpy as np
import pandas as pd
import random
import time
from itertools import product
from scipy.stats import norm, binomtest
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Lasso, Ridge

from sklearn import linear_model, svm, naive_bayes, ensemble
from sklearn.model_selection import cross_validate, train_test_split, RepeatedStratifiedKFold
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, roc_auc_score, matthews_corrcoef

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


## Read functions

In [195]:
%run functions_scenario2.ipynb

## Scenario 2, Case 2 with interaction

In [196]:
def invlogit(p_list):
    invlogit_values = []
    for p in p_list:
        invlogit_values.append(np.exp(p) / (1 + np.exp(p)))

    return invlogit_values

In [197]:
design_list = [5,8,10]
patient_n_list = [5400,680,170] # n_patient_per_plan

# def plan_true_rate(row, impact_rates):
#     tmp = row * impact_rates
#     tmp = tmp[tmp != 0]
#     return np.prod(tmp) * 0.05

def generate_data_step1(design_number = 5, n_rounds = 4, n_patient_per_plan = 5400, seed=42):

  # design_list = [5,8,10]

  # Record the start time
#   start_time = time.time()

  # scenario 2:
  random.seed(seed)

  # n_design = design_number
  n_patient_per_plan_round = n_patient_per_plan/n_rounds # each round is capped by this patient number

  # Create all possible combinations of design features
  design_combinations = list(product([1, 0], repeat=n_design))

  design_feature_df = pd.DataFrame(design_combinations, columns=[f'Design_Feature_{i+1}' for i in range(n_design)])
  # add interaction term:
  interaction = [1 if row['Design_Feature_1'] + row['Design_Feature_2'] == 2 else 0 for _, row in design_feature_df.iterrows()]
  design_feature_df['interaction'] = interaction

  design_feature_df['recruitment_plan'] = [x + 1 for x in range(2 ** n_design)]

  # design_feature_df
  random.seed(seed)
  # impact_rates = np.array([random.uniform(0.6, 1.5) for _ in range(n_design)])
  impact_rates = np.array([-0.5, 0, 0, 0, 0, 1]) # design 2 has the highest rr
  response_rate_list = np.dot(np.array(design_feature_df.iloc[:,:(n_design+1)]), impact_rates)
  # normalize to [0,1]
  # normalized_values = normalize_to_0_1(response_rate_list) * 0.115
  # inverse logit
  normalized_values = np.array(invlogit(response_rate_list)) * 0.11
  design_feature_df['plan_response_rate'] = normalized_values

  design_feature_df.drop(columns=['interaction'], axis = 1, inplace=True)

  # Apply plan_true_rate to each row
  # result_column = design_feature_df.iloc[:, :n_design].apply(lambda row: plan_true_rate(row, impact_rates), axis=1)

  # Add the result_column to the design_feature_df
  # design_feature_df['plan_response_rate'] = result_column

  # each combination repeat for n_patient_per_plan_round
  design_feature_df = pd.DataFrame(np.repeat(design_feature_df.values, n_patient_per_plan_round, axis=0), columns=design_feature_df.columns)

  # add response outcome column:
  grouped_df = design_feature_df.groupby(list(design_feature_df.columns)).size().reset_index(name='group_size')

  random.seed(seed)
  random_seeds_for_resp = [random.randint(1, 100000) for _ in range(len(grouped_df))]

  def generate_responses(row):
      rate = row['plan_response_rate']
      num = row['group_size']
      # random.seed(42)
      seed1 = random_seeds_for_resp[row.name]
      np.random.seed(seed1)
      return np.random.binomial(n=1, p=rate, size=int(num))

  grouped_df['response'] = grouped_df.apply(generate_responses, axis=1)

  # Explode the 'response' arrays to expand the DataFrame
  expanded_df = grouped_df.explode('response').reset_index(drop=True)
  expanded_df['response'] = expanded_df['response'].astype(float)

#   end_time = time.time()

#   elapsed_time = end_time - start_time

#   print(f"Elapsed time: {elapsed_time} seconds")

  return expanded_df

## Logistic regression

In [7]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
#                     ('RF', ensemble.RandomForestClassifier(n_estimators=200, criterion='gini', random_state=0))
                    # ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        'LR__C': [0.01, 0.1, 1.0] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50] # [10, 30, 50]
        # 'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print(best_params)

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

In [8]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split

# def ensemble_model_fit(data, data_pred):
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the Logistic Regression model
#     logistic_regression = LogisticRegression(C=0.01, max_iter=200,random_state=0)

#     # Fit the Logistic Regression model on the training data
#     logistic_regression.fit(X_train, y_train)

#     # Print out the coefficients of the logistic regression model
#     print("Coefficients:", logistic_regression.coef_)
#     print("Intercept:", logistic_regression.intercept_)
#     print("Coefficients shape:", logistic_regression.coef_.shape)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = logistic_regression.predict_proba(X_test)
#     y_pred_test = logistic_regression.predict(X_test)

#     # You can also predict probabilities for the prediction data (data_pred)
#     X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1)
#     y_pred = logistic_regression.predict_proba(X_dt)

#     return y_pred

# # Call the function
# # ensemble_model_fit(data, data_pred)


### Simulation starts

In [9]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [10]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []
highest_true_rr = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    highest_true_rr.append(np.max(dt_design_5['plan_response_rate']))
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.05655306495882891
{'LR__C': 0.01}
remaining size for Round 2: 140000
Calculated size for Round 2: 4240.799240494033
{'LR__C': 0.01}
orr termination 0.03145624959349912 0.06349294395987579 0.03203669436637667
remaining size for Round 3: 135760
Calculated size for Round 3: 7054.0537551469515
{'LR__C': 0.01}
orr termination 0.03274560725004911 0.06487887661321948 0.03213326936317037
remaining size for Round 4: 128706
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.054637465690759376
{'LR__C': 0.01}
remaining size for Round 2: 140000
Calculated size for Round 2: 4113.519228021547
{'LR__C': 0.01}
orr termination 0.04284867883572635 0.06421230125649269 0.021363622420766337
remaining size for Round 3: 135887
Calculated size for Round 3: 6199.105937709977
{'LR__C': 0.01}
orr termination 0.03603485531522797 0.07057620432667427 0.03454134

### Results summary

In [11]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.055,0.058348,0.061799,0.068471,
1,0.055,0.061768,0.065446,0.068471,0.068471
2,0.055,0.061710,0.068471,,
3,0.055,0.055000,,,
4,0.055,0.061721,0.063410,0.068471,0.068471
...,...,...,...,...,...
95,0.055,0.063445,0.068471,0.068471,0.068471
96,0.055,0.066576,0.068471,0.068471,
97,0.055,0.060130,0.068471,,
98,0.055,0.062704,0.065873,0.068471,0.068471


In [12]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

4.0 (0.8)


In [13]:
# orr for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.000)
step2_rr: 0.061 (0.004)
step3_rr: 0.066 (0.005)
step4_rr: 0.068 (0.002)
step5_rr: 0.068 (0.003)


In [14]:
# overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.064, StD: 0.004


In [17]:
# highest true rr:
mean_result=(np.mean(highest_true_rr))
std_result=(np.std(highest_true_rr))
print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.068 (0.000)


In [18]:
# adaptive learning orr:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.066, StD: 0.005


In [19]:
# average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 7.1 (3.9)
step3_plan_number: 2.9 (2.0)
step4_plan_number: 2.0 (1.4)
step5_plan_number: 1.4 (0.8)


In [20]:
# early stopping probabilities:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.67,
 'early_stopping_plan': 0.67,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [21]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

129828.75 5130.973592555315


In [None]:
# probabilities for better performance compared to the benchmark:
np.mean(better_chance)

In [22]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 6855.844 (19020.756)
step3_sample_size: 47113.405 (60380.932)
step4_sample_size: 68859.140 (61678.788)
step5_sample_size: 123752.727 (868.797)


## Random Forest

In [23]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            # ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
                    ('RF', ensemble.RandomForestClassifier(criterion='gini', random_state=0))
                    # ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        # 'LR__C': [0.01] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50, 100, 200]
        'RF__n_estimators': [50]
#         'RF__max_depth': [10, 20, None]
        # 'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print(best_params)

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

### Simulation starts

In [25]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [26]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []
highest_true_rr = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    highest_true_rr.append(np.max(dt_design_5['plan_response_rate']))
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.05655306495882891
{'RF__n_estimators': 50}
remaining size for Round 2: 140000
Calculated size for Round 2: 4240.799240494033
{'RF__n_estimators': 50}
orr termination 0.05479087988334956 0.06836783503271052 0.013576955149360956
remaining size for Round 3: 135760
Calculated size for Round 3: 7250.5978185355325
{'RF__n_estimators': 50}
orr termination 0.03649780748875311 0.07259724332691517 0.03609943583816206
remaining size for Round 4: 128510
Calculated size for Round 4: 5970.162675655956
{'RF__n_estimators': 50}
orr termination 0.034455066447021364 0.06981692622864044 0.03536185978161908
remaining size for last Round 5: 122540
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.054637465690759376
{'RF__n_estimators': 50}
remaining size for Round 2: 140000
Calculated size for Round 2: 4113.519228021547
{'RF__n_estimators': 50}
orr te

### Results summary

In [27]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.055,0.063155,0.065021,0.068471,0.068471
1,0.055,0.063836,0.068471,0.068471,
2,0.055,0.064696,0.068471,0.068471,0.068471
3,0.055,0.060285,0.068471,0.068471,
4,0.055,0.064135,0.065973,0.068471,0.068471
...,...,...,...,...,...
95,0.055,0.061347,0.068471,0.068471,0.068471
96,0.055,0.068471,,,
97,0.055,0.068471,,,
98,0.055,0.068471,0.068471,0.068471,


In [28]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

3.7 (1.0)


In [29]:
# orr for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.000)
step2_rr: 0.066 (0.003)
step3_rr: 0.068 (0.002)
step4_rr: 0.068 (0.001)
step5_rr: 0.068 (0.001)


In [30]:
# overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.066 (0.001)


In [31]:
# adaptive learning orr:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.068 (0.001)


In [32]:
# highest true rr:
mean_result = (np.mean(highest_true_rr))
std_result = (np.std(highest_true_rr))
print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.068 (0.000)


In [33]:
# average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 5.1 (4.0)
step3_plan_number: 2.9 (2.3)
step4_plan_number: 2.2 (1.6)
step5_plan_number: 1.5 (1.0)


In [34]:
# early stopping probabilities:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.72,
 'early_stopping_plan': 0.72,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [35]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

130706.03 6259.832643857182


In [36]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 21802.249 (45690.069)
step3_sample_size: 56065.754 (62391.363)
step4_sample_size: 64649.525 (60619.740)
step5_sample_size: 123252.536 (1149.734)


In [37]:
# probabilities for better performance compared to the benchmark:
np.mean(better_chance)

0.99

## XGBoost

In [146]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            # ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
#                     ('RF', ensemble.RandomForestClassifier(n_estimators=200, criterion='gini', random_state=0))
                    ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        # 'LR__C': [0.01] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50] # [10, 30, 50]
        'XGB__learning_rate': [0.01, 0.1, 1],
        'XGB__n_estimators': [50, 100, 200]
#         'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print("Best fit parameters:", best_params)

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

### Simulation starts

In [147]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [148]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.05655306495882891
Best fit parameters: {'XGB__learning_rate': 0.01, 'XGB__n_estimators': 100}
remaining size for Round 2: 140000
Calculated size for Round 2: 4240.799240494033
Best fit parameters: {'XGB__learning_rate': 0.01, 'XGB__n_estimators': 50}
orr termination 0.4767242670059204 0.2383621335029602 -0.2383621335029602
1 0.4767242670059204 0.2383621335029602 early stop at Round 2
early stop, remaining size for Round3: 135760
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.054637465690759376
Best fit parameters: {'XGB__learning_rate': 0.01, 'XGB__n_estimators': 100}
remaining size for Round 2: 140000
Calculated size for Round 2: 4113.519228021547
Best fit parameters: {'XGB__learning_rate': 0.01, 'XGB__n_estimators': 50}
orr termination 0.1538586428115833 0.20905391458355962 0.055195271771976334
remaining size for Round 3: 135

### Results summary

In [149]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.055,0.068471,0.068471,,
1,0.055,0.058487,0.059121,0.068471,
2,0.055,0.060294,0.068471,,
3,0.055,0.062540,0.068471,0.068471,
4,0.055,0.064038,0.068471,0.068471,
...,...,...,...,...,...
95,0.055,0.068471,0.068471,,
96,0.055,0.065806,0.068471,,
97,0.055,0.068471,,,
98,0.055,0.062062,0.068471,0.068471,0.068471


In [150]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

3.5 (0.8)


In [151]:
# orr for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.000)
step2_rr: 0.065 (0.004)
step3_rr: 0.067 (0.003)
step4_rr: 0.068 (0.001)
step5_rr: 0.068 (0.000)


In [152]:
# overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.065, StD: 0.002


In [153]:
# highest true rr:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.068470526432204
1.3877787807814457e-17


In [154]:
# adaptive learning orr:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.068, StD: 0.003


In [155]:
# average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 17.0 (21.4)
step3_plan_number: 5.2 (7.3)
step4_plan_number: 2.4 (2.1)
step5_plan_number: 1.8 (0.7)


In [156]:
# early stopping probabilities:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.88,
 'early_stopping_plan': 0.52,
 'early_stopping_orr': 0.36,
 'early_stopping_size': 0.0}

In [157]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

132328.45 4911.178645447546


In [158]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 34573.711 (55774.721)
step3_sample_size: 65610.153 (63545.937)
step4_sample_size: 79584.757 (59983.292)
step5_sample_size: 123780.583 (1380.933)


In [159]:
# probabilities for better performance compared to the benchmark:
np.mean(better_chance)

0.97

## Ensemble learning - 3 methods

In [198]:
# def ensemble_model_fit(data, data_pred):
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the VotingClassifier with the individual classifiers
#     voting_classifier = ensemble.VotingClassifier(
#         estimators=[
#             ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0)),
# #             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
#                     # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
#                     ('RF', ensemble.RandomForestClassifier(criterion='gini', random_state=0)),
#                     ('XGB', XGBClassifier(learning_rate=0.1, random_state=0))
#                    ],
#         voting='soft'
#     )

#     # Define the hyperparameter grid to search
#     # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
#     param_grid = {
#         # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
#         'LR__C': [0.01, 0.05, 0.1],
#         # 'Ridge__C': [0.01]
#         # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50, 100, 200],
#         'XGB__n_estimators': [50, 100, 200]
#     }

#     # Create a GridSearchCV object
#     # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
#     grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

#     # Perform the grid search on the training data
#     grid_search.fit(X_train, y_train)

#     # Get the best hyperparameters
#     best_params = grid_search.best_params_
#     # Print the best fitted parameters
#     print("Best fitted parameters:")
#     print(best_params)

#     # Train the final VotingClassifier with the best hyperparameters on the full training set
#     final_voting_classifier = grid_search.best_estimator_
#     final_voting_classifier.fit(X_train, y_train)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
#     y_pred_test = final_voting_classifier.predict(X_test)
#     X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
#     y_pred = final_voting_classifier.predict_proba(X_dt)

#     return y_pred

In [199]:
# ensemble_model_fit(dt_design_5, dt_design_5)

In [200]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0)),
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
                    ('RF', ensemble.RandomForestClassifier(criterion='gini', random_state=0)),
                    ('XGB', XGBClassifier(learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        'LR__C': [0.05],
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.05]
        'RF__n_estimators': [100],
        'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

### Simulation starts

In [202]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [203]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.05655306495882891
remaining size for Round 2: 140000
Calculated size for Round 2: 4240.799240494033
orr termination 0.023692156640350504 0.07230555879718663 0.04861340215683613
remaining size for Round 3: 135760
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.054637465690759376
remaining size for Round 2: 140000
Calculated size for Round 2: 4113.519228021547
orr termination 0.04739092921486517 0.07127662623484328 0.023885697019978107
remaining size for Round 3: 135887
Calculated size for Round 3: 9827.715869701693
orr termination 0.03824708261820013 0.07222230506546733 0.033975222447267195
remaining size for Round 4: 126060
3
['step1_rr: 2', 'step2_rr: 2', 'step3_rr: 2', 'step4_rr: 2', 'step5_rr: 2']
empirical response rate 0.05303636779505947
remaining size for Round 2: 140000
Calculated size for Round 2: 4007.355182472531
orr 

### Results summary

In [204]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.055,0.068471,0.068471,,
1,0.055,0.063950,0.068471,0.068471,
2,0.055,0.068471,0.068471,0.068471,
3,0.055,0.061833,0.063991,0.068471,0.068471
4,0.055,0.065161,0.068471,,
...,...,...,...,...,...
95,0.055,0.068471,0.068471,0.068471,
96,0.055,0.068471,,,
97,0.055,0.068471,,,
98,0.055,0.064735,0.066419,0.068471,0.068471


In [205]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

3.4 (0.9)


In [206]:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.000)
step2_rr: 0.066 (0.003)
step3_rr: 0.068 (0.001)
step4_rr: 0.068 (0.001)
step5_rr: 0.068 (0.000)


In [207]:
# Overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.066, StD: 0.002


In [208]:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.068470526432204
1.3877787807814457e-17


In [209]:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.068, StD: 0.002


In [210]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 4.3 (3.2)
step3_plan_number: 2.3 (2.1)
step4_plan_number: 1.8 (1.5)
step5_plan_number: 1.6 (1.5)


In [211]:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.84,
 'early_stopping_plan': 0.84,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [212]:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

132461.36 5840.641710497229


In [213]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 24523.433 (48509.914)
step3_sample_size: 75461.071 (64064.865)
step4_sample_size: 79143.303 (59483.378)
step5_sample_size: 122987.688 (987.106)


## Ensemble learning - 7 methods

In [214]:
# def ensemble_model_fit(data, data_pred):
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the VotingClassifier with the individual classifiers
#     voting_classifier = ensemble.VotingClassifier(
#         estimators=[
#             ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
# #             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
#                     # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
# #                     ('RF', ensemble.RandomForestClassifier(n_estimators=200, criterion='gini', random_state=0))
#                     # ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
#                    ],
#         voting='soft'
#     )

#     # Define the hyperparameter grid to search
#     # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
#     param_grid = {
#         # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
#         'LR__C': [0.01] # [0.01, 0.05, 0.1]
#         # 'Ridge__C': [0.01]
#         # 'SVM__C': [0.01, 0.05, 0.1]
# #         'RF__n_estimators': [50] # [10, 30, 50]
#         # 'XGB__n_estimators': [50]
#     }

#     # Create a GridSearchCV object
#     # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
#     grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

#     # Perform the grid search on the training data
#     grid_search.fit(X_train, y_train)

#     # Get the best hyperparameters
#     best_params = grid_search.best_params_

#     # Train the final VotingClassifier with the best hyperparameters on the full training set
#     final_voting_classifier = grid_search.best_estimator_
#     final_voting_classifier.fit(X_train, y_train)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
#     y_pred_test = final_voting_classifier.predict(X_test)
#     X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
#     y_pred = final_voting_classifier.predict_proba(X_dt)

#     return y_pred

In [215]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import ensemble, linear_model
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan', 'plan_response_rate', 'group_size', 'response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(penalty = 'none', max_iter=200, random_state=0)),
            ('Lasso', linear_model.LogisticRegression(penalty = "l1", max_iter=200, random_state=0,
                                                     solver="liblinear")),
            ('Ridge', linear_model.LogisticRegression(penalty = "l2", max_iter=200, random_state=0)),
            ('GBM', ensemble.GradientBoostingClassifier(random_state=0)),
            ('RF', ensemble.RandomForestClassifier(random_state=0)),
            ('XGB', XGBClassifier(random_state=0)),
            ('NN', MLPClassifier(random_state=0))
        ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
#     param_grid = {
# #         'LR__C': [0.01, 0.1, 1.0],  # Regularization parameter for logistic regression
#         'Lasso__C': [0.01, 0.1, 1.0],  # Regularization parameter for lasso regression
#         'Ridge__C': [0.01, 0.1, 1.0],  # Regularization parameter for ridge regression
#         'GBM__learning_rate': [0.01, 0.1, 0.5],  # Learning rate for gradient boosting machine
#         'GBM__n_estimators': [50, 100, 200],  # Number of trees for gradient boosting machine
#         'RF__n_estimators': [50, 100, 200],  # Number of trees for random forest
# #         'RF__max_depth': [10, 20],  # Maximum depth of trees for random forest
#         'XGB__learning_rate': [0.01, 0.1, 0.5],  # Learning rate for XGBoost
#         'XGB__n_estimators': [50, 100, 200],  # Number of trees for XGBoost
#         'NN__hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Size of hidden layers for neural networks
#         'NN__alpha': [0.0001, 0.001, 0.01]  # Regularization parameter for neural networks
#     }
    param_grid = {
#         'LR__C': [0.01, 0.1, 1.0],  # Regularization parameter for logistic regression
        'Lasso__C': [0.1],  # Regularization parameter for lasso regression
        'Ridge__C': [0.01],  # Regularization parameter for ridge regression
        'GBM__learning_rate': [0.01],  # Learning rate for gradient boosting machine
        'GBM__n_estimators': [50],  # Number of trees for gradient boosting machine
        'RF__n_estimators': [50],  # Number of trees for random forest
#         'RF__max_depth': [10, 20],  # Maximum depth of trees for random forest
        'XGB__learning_rate': [0.01],  # Learning rate for XGBoost
        'XGB__n_estimators': [50],  # Number of trees for XGBoost
        'NN__hidden_layer_sizes': [(50,)],  # Size of hidden layers for neural networks
        'NN__alpha': [0.01]  # Regularization parameter for neural networks
    }

    # Create a GridSearchCV object
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
#     print("Best fit parameters:", best_params)

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size', 'response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred


In [216]:
# ensemble_model_fit(dt_design_5, dt_design_5)

### Simulation starts

In [217]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [218]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []
highest_true_rr = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    highest_true_rr.append(np.max(dt_design_5['plan_response_rate']))
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,round=round,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, round=round,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.05655306495882891
remaining size for Round 2: 140000
Calculated size for Round 2: 4240.799240494033
orr termination 0.028050578004677983 0.07747640179726747 0.04942582379258949
remaining size for Round 3: 135760
Calculated size for Round 3: 5262.438447913083
orr termination 0.06434239544326537 0.08008295682627871 0.015740561383013343
remaining size for Round 4: 130498
Calculated size for Round 4: 6733.849778617701
orr termination 0.05940425964963364 0.07882647321925763 0.019422213569623994
remaining size for last Round 5: 123765
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.054637465690759376
remaining size for Round 2: 140000
Calculated size for Round 2: 4113.519228021547
orr termination 0.05258566545057533 0.07778894936772027 0.025203283917144936
remaining size for Round 3: 135887
Calculated size for Round 3: 9863.9128231646

### Results summary

In [219]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.055,0.061816,0.068471,0.068471,0.068471
1,0.055,0.064082,0.062143,0.068471,
2,0.055,0.065190,0.068471,0.068471,0.068471
3,0.055,0.063879,0.065887,0.068471,0.068471
4,0.055,0.065803,0.068471,0.068471,
...,...,...,...,...,...
95,0.055,0.068471,0.068471,0.068471,
96,0.055,0.068471,0.068471,0.068471,
97,0.055,0.068471,0.068471,,
98,0.055,0.063531,0.068471,0.068471,0.068471


In [220]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

3.8 (0.9)


In [221]:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.000)
step2_rr: 0.066 (0.002)
step3_rr: 0.068 (0.002)
step4_rr: 0.068 (0.001)
step5_rr: 0.068 (0.000)


In [222]:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.066, StD: 0.001


In [223]:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.068470526432204
1.3877787807814457e-17


In [224]:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.068, StD: 0.001


In [225]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 5.1 (3.4)
step3_plan_number: 2.8 (1.9)
step4_plan_number: 1.9 (1.3)
step5_plan_number: 1.7 (0.8)


In [226]:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.74,
 'early_stopping_plan': 0.74,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [227]:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

129809.8 5716.781251718488


In [228]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 13653.204 (34663.534)
step3_sample_size: 48926.921 (60003.920)
step4_sample_size: 77445.742 (59804.188)
step5_sample_size: 123288.077 (968.081)


In [229]:
np.mean(better_chance)

0.99