In [48]:
import numpy as np
import pandas as pd
import random
import time
from itertools import product
from scipy.stats import norm, binomtest
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Lasso, Ridge

from sklearn import linear_model, svm, naive_bayes, ensemble
from sklearn.model_selection import cross_validate, train_test_split, RepeatedStratifiedKFold
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, roc_auc_score, matthews_corrcoef

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


## Read functions

In [49]:
%run functions_scenario1.ipynb

## Scenario 1

## Logistic regression

In [3]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
#                     ('RF', ensemble.RandomForestClassifier(n_estimators=200, criterion='gini', random_state=0))
                    # ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        'LR__C': [0.01] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50] # [10, 30, 50]
        # 'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

In [4]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split

# def ensemble_model_fit(data, data_pred):
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the Logistic Regression model
#     logistic_regression = LogisticRegression(C=0.01, max_iter=200,random_state=0)

#     # Fit the Logistic Regression model on the training data
#     logistic_regression.fit(X_train, y_train)

#     # Print out the coefficients of the logistic regression model
#     print("Coefficients:", logistic_regression.coef_)
#     print("Intercept:", logistic_regression.intercept_)
#     print("Coefficients shape:", logistic_regression.coef_.shape)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = logistic_regression.predict_proba(X_test)
#     y_pred_test = logistic_regression.predict(X_test)

#     # You can also predict probabilities for the prediction data (data_pred)
#     X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1)
#     y_pred = logistic_regression.predict_proba(X_dt)

#     return y_pred

# # Call the function
# # ensemble_model_fit(data, data_pred)


### Simulation starts

In [5]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [6]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.06653133577310155
remaining size for Round 2: 140000
Calculated size for Round 2: 4908.31412781265
orr termination 0.03002990703375628 0.09348194294352946 0.06345203590977318
remaining size for Round 3: 135092
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.056867566331198535
remaining size for Round 2: 140000
Calculated size for Round 2: 4261.723102741266
orr termination 0.017144733492426314 0.0734268878037351 0.056282154311308785
remaining size for Round 3: 135739
Calculated size for Round 3: 4829.855095347762
orr termination 0.04892416537409498 0.08147975130276232 0.032555585928667335
remaining size for Round 4: 130910
Calculated size for Round 4: 7441.151492043256
orr termination 0.06704796180200037 0.09751731458697818 0.03046935278497781
remaining size for last Round 5: 123469
3
['step1_rr: 2', 'step2_rr: 2', 'step3_rr: 2',

### Results summary

In [7]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.054501,0.070594,0.098086,,
1,0.057829,0.061878,0.071296,0.086111,0.095943
2,0.062228,0.071151,0.092669,,
3,0.054011,0.061966,0.067244,0.084578,0.090330
4,0.056766,0.065395,0.083580,,
...,...,...,...,...,...
95,0.052518,0.068806,0.097057,0.099877,
96,0.042686,0.053083,0.080764,,
97,0.053834,0.050656,0.048803,,
98,0.045935,0.056210,0.075262,0.085008,


In [8]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

3.8 (0.9)


In [9]:
# ORR for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.005)
step2_rr: 0.067 (0.011)
step3_rr: 0.080 (0.013)
step4_rr: 0.088 (0.008)
step5_rr: 0.092 (0.006)


In [10]:
# Overall ORR:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.080 (0.010)


In [11]:
# Highest ORR:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.08815302832887745
0.012615898320377


In [12]:
# Adaptive ORR:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.087 (0.012)


In [13]:
# Average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 9.5 (8.0)
step3_plan_number: 3.2 (3.0)
step4_plan_number: 2.0 (1.4)
step5_plan_number: 1.5 (0.7)


In [14]:
# Early stopping probabilities for different reasons:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.73,
 'early_stopping_plan': 0.73,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [15]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

130411.68 5760.969513684307


In [16]:
# Probability of better performance compared to the benchmark:
np.mean(better_chance)

0.99

In [17]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 10911.066 (29618.684)
step3_sample_size: 52635.256 (62178.642)
step4_sample_size: 75146.049 (60789.764)
step5_sample_size: 123137.926 (2717.028)


## Random Forest

In [18]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            # ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
                    ('RF', ensemble.RandomForestClassifier(criterion='gini', random_state=0))
                    # ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        # 'LR__C': [0.01] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
        'RF__n_estimators': [50] # [10, 30, 50]
        # 'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

In [19]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split

# def ensemble_model_fit(data, data_pred):
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the Logistic Regression model
#     logistic_regression = LogisticRegression(C=0.01, max_iter=200,random_state=0)

#     # Fit the Logistic Regression model on the training data
#     logistic_regression.fit(X_train, y_train)

#     # Print out the coefficients of the logistic regression model
#     print("Coefficients:", logistic_regression.coef_)
#     print("Intercept:", logistic_regression.intercept_)
#     print("Coefficients shape:", logistic_regression.coef_.shape)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = logistic_regression.predict_proba(X_test)
#     y_pred_test = logistic_regression.predict(X_test)

#     # You can also predict probabilities for the prediction data (data_pred)
#     X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1)
#     y_pred = logistic_regression.predict_proba(X_dt)

#     return y_pred

# # Call the function
# # ensemble_model_fit(data, data_pred)


### Simulation starts

In [20]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [21]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.06653133577310155
remaining size for Round 2: 140000
Calculated size for Round 2: 4908.31412781265
orr termination 0.06161194435349385 0.10654822447419941 0.044936280120705566
remaining size for Round 3: 135092
Calculated size for Round 3: 10868.98280016696
orr termination 0.07889259114750125 0.10391424940745084 0.025021658259949592
remaining size for Round 4: 124224
Calculated size for Round 4: 8982.011028803528
orr termination 0.07086543871953453 0.10572039693561597 0.034854958216081436
remaining size for last Round 5: 115242
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.056867566331198535
remaining size for Round 2: 140000
Calculated size for Round 2: 4261.723102741266
orr termination 0.0761481374196176 0.0955666035323601 0.019418466112742505
remaining size for Round 3: 135739
Calculated size for Round 3: 12529.817535220078

### Results summary

In [22]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.054501,0.089405,0.092572,0.092241,0.094387
1,0.057829,0.096128,0.096308,0.097453,0.096156
2,0.062228,0.080196,0.095165,0.097710,0.099401
3,0.054011,0.082146,0.084481,0.089669,0.093916
4,0.056766,0.093382,,,
...,...,...,...,...,...
95,0.052518,0.091321,0.094260,0.099877,
96,0.042686,0.074721,0.083974,0.084863,
97,0.053834,0.082200,0.090042,0.091222,0.097313
98,0.045935,0.071878,0.079848,0.087978,0.090896


In [23]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

4.2 (0.9)


In [24]:
# Average ORR for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.005)
step2_rr: 0.086 (0.008)
step3_rr: 0.093 (0.005)
step4_rr: 0.095 (0.004)
step5_rr: 0.096 (0.003)


In [25]:
# overall ORR:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.087 (0.003)


In [26]:
# Adaptive learning ORR:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.095 (0.003)


In [27]:
# Highest True RR:
mean_result = (np.mean(highest_true_rr))
std_result = (np.std(highest_true_rr))
print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.096 (0.004)


In [28]:
# Average plan number at each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 9.8 (6.6)
step3_plan_number: 4.0 (2.9)
step4_plan_number: 2.4 (1.8)
step5_plan_number: 1.8 (1.2)


In [29]:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.55,
 'early_stopping_plan': 0.55,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [30]:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

125088.03 7340.812579619507


In [31]:
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 9543.121 (26633.474)
step3_sample_size: 30068.010 (47512.805)
step4_sample_size: 59827.490 (58183.097)
step5_sample_size: 119401.667 (2697.713)


In [32]:
# Prob. of better performance compared to the benchmark 
np.mean(better_chance)

1.0

## XGBoost

In [33]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            # ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0))
#             ('Ridge', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=200, random_state=0))
                    # ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True, class_weight='balanced'))
#                     ('RF', ensemble.RandomForestClassifier(n_estimators=200, criterion='gini', random_state=0))
                    ('XGB', XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        # 'NB__alpha': [0.01, 0.05, 0.1],  # '__' is used to specify hyperparameters for individual classifiers
        # 'LR__C': [0.01] # [0.01, 0.05, 0.1]
        # 'Ridge__C': [0.01]
        # 'SVM__C': [0.01, 0.05, 0.1]
#         'RF__n_estimators': [50] # [10, 30, 50]
        'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

### Simulation starts

In [35]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [36]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.06653133577310155
remaining size for Round 2: 140000
Calculated size for Round 2: 4908.31412781265
orr termination 0.06881553393147219 0.10311214769232589 0.0342966137608537
remaining size for Round 3: 135092
Calculated size for Round 3: 18485.24183129777
orr termination 0.057444137186662594 0.10928504168987274 0.05184090450321015
remaining size for Round 4: 116607
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.056867566331198535
remaining size for Round 2: 140000
Calculated size for Round 2: 4261.723102741266
orr termination 0.05737133575520622 0.09515731229608002 0.037785976540873796
remaining size for Round 3: 135739
Calculated size for Round 3: 12479.026943269855
orr termination 0.06649053664517672 0.10064429987420616 0.03415376322902944
remaining size for Round 4: 123260
Calculated size for Round 4: 8603.953433747745
orr t

### Results summary

In [37]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.054501,0.092153,0.094426,0.098086,
1,0.057829,0.096130,0.096309,0.096894,0.098775
2,0.062228,0.080152,0.093251,0.093906,
3,0.054011,0.081445,0.092203,0.093915,0.093398
4,0.056766,0.093382,,,
...,...,...,...,...,...
95,0.052518,0.093172,0.095070,0.099877,
96,0.042686,0.063888,0.083896,0.085995,
97,0.053834,0.087099,0.091190,0.093792,0.097313
98,0.045935,0.071839,0.080000,0.084611,0.090896


In [38]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

4.0 (0.9)


In [39]:
# ORR for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.005)
step2_rr: 0.086 (0.008)
step3_rr: 0.093 (0.005)
step4_rr: 0.095 (0.005)
step5_rr: 0.096 (0.003)


In [40]:
# Overall ORR:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.087, StD: 0.003


In [41]:
# Highest True ORR:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.09637294263286936
0.0035224757023158263


In [42]:
# Adaptive learning ORR:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.096, StD: 0.003


In [43]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 9.3 (6.2)
step3_plan_number: 3.7 (2.9)
step4_plan_number: 2.4 (2.0)
step5_plan_number: 1.7 (1.1)


In [44]:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.63,
 'early_stopping_plan': 0.63,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [45]:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

126751.5 7443.797859291989


In [46]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 10911.094 (29618.551)
step3_sample_size: 40901.285 (55369.390)
step4_sample_size: 64527.638 (58601.277)
step5_sample_size: 120051.595 (2825.500)


In [47]:
np.mean(better_chance)

1.0

## Ensemble learning - 3 methods

In [50]:
def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(max_iter=200, random_state=0)),
                    ('RF', ensemble.RandomForestClassifier(criterion='gini', random_state=0)),
                    ('XGB', XGBClassifier(learning_rate=0.1, random_state=0))
                   ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
    # Best Hyperparameters: {'LR__C': 0.01, 'RF__n_estimators': 50, 'XGB__n_estimators': 50}
    param_grid = {
        'LR__C': [0.05],
        'RF__n_estimators': [100],
        'XGB__n_estimators': [50]
    }

    # Create a GridSearchCV object
    # custom_scorer_auc = make_scorer(roc_auc_score, needs_proba=True)
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan','plan_response_rate','group_size','response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred

### Simulation starts

In [51]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [52]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.06653133577310155
remaining size for Round 2: 140000
Calculated size for Round 2: 4908.31412781265
orr termination 0.03895676704720295 0.10257938368964734 0.06362261664244438
remaining size for Round 3: 135092
Calculated size for Round 3: 9850.56733540076
orr termination 0.07092448455767018 0.10713998041296541 0.036215495855295224
remaining size for Round 4: 125242
Calculated size for Round 4: 10021.544687168987
orr termination 0.05845345753472988 0.11124273056845918 0.0527892730337293
remaining size for last Round 5: 115221
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.056867566331198535
remaining size for Round 2: 140000
Calculated size for Round 2: 4261.723102741266
orr termination 0.07460288906472581 0.09403652205512442 0.019433632990398603
remaining size for Round 3: 135739
Calculated size for Round 3: 12606.396181121103


### Results summary

In [53]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.054501,0.086022,0.090516,0.094509,0.098086
1,0.057829,0.096148,0.096278,0.096657,0.098775
2,0.062228,0.080180,0.092400,0.093620,0.094612
3,0.054011,0.093791,0.093398,,
4,0.056766,0.091205,0.098844,,
...,...,...,...,...,...
95,0.052518,0.079866,0.094072,0.099877,
96,0.042686,0.063392,0.083913,0.083357,0.085995
97,0.053834,0.088423,0.091198,0.095783,0.097313
98,0.045935,0.090896,,,


In [54]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

4.2 (0.8)


In [55]:
# orr for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.005)
step2_rr: 0.085 (0.009)
step3_rr: 0.092 (0.005)
step4_rr: 0.095 (0.004)
step5_rr: 0.096 (0.004)


In [56]:
# overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.087 (0.003)


In [57]:
# adaptive learning orr:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.095 (0.003)


In [58]:
# highest true rr:
mean_result = (np.mean(highest_true_rr))
pristd_result = (np.std(highest_true_rr))

print(f"Mean: {mean_result:.3f} ({std_result:.3f})")

Mean: 0.096 (0.003)


In [59]:
# average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 10.2 (6.6)
step3_plan_number: 3.9 (3.0)
step4_plan_number: 2.3 (1.6)
step5_plan_number: 1.7 (1.3)


In [60]:
# early stopping probabilities:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.55,
 'early_stopping_plan': 0.55,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [61]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

125433.78 7209.832760307273


In [62]:
# probability of better performance compared to the benchmark:
np.mean(better_chance)

1.0

In [63]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 6824.589 (19030.802)
step3_sample_size: 34700.899 (51487.474)
step4_sample_size: 57955.227 (57970.277)
step5_sample_size: 119921.200 (3050.399)


## Ensemble learning - 7 methods

In [64]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import ensemble, linear_model
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

def ensemble_model_fit(data, data_pred):
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recruitment_plan', 'plan_response_rate', 'group_size', 'response'], axis=1),
        data['response'],
        test_size=0.2,
        random_state=0
    )

    # Define the VotingClassifier with the individual classifiers
    voting_classifier = ensemble.VotingClassifier(
        estimators=[
            ('LR', linear_model.LogisticRegression(penalty = 'none', max_iter=200, random_state=0)),
            ('Lasso', linear_model.LogisticRegression(penalty = "l1", max_iter=200, random_state=0,
                                                     solver="liblinear")),
            ('Ridge', linear_model.LogisticRegression(penalty = "l2", max_iter=200, random_state=0)),
            ('GBM', ensemble.GradientBoostingClassifier(random_state=0)),
            ('RF', ensemble.RandomForestClassifier(random_state=0)),
            ('XGB', XGBClassifier(random_state=0)),
            ('NN', MLPClassifier(random_state=0))
        ],
        voting='soft'
    )

    # Define the hyperparameter grid to search
#     param_grid = {
#         'Lasso__C': [0.01, 0.1, 1.0],  # Regularization parameter for lasso regression
#         'Ridge__C': [0.01, 0.1, 1.0],  # Regularization parameter for ridge regression
#         'GBM__learning_rate': [0.01, 0.1, 0.5],  # Learning rate for gradient boosting machine
#         'GBM__n_estimators': [50, 100, 200],  # Number of trees for gradient boosting machine
#         'RF__n_estimators': [50, 100, 200],  # Number of trees for random forest
#         'XGB__learning_rate': [0.01, 0.1, 0.5],  # Learning rate for XGBoost
#         'XGB__n_estimators': [50, 100, 200],  # Number of trees for XGBoost
#         'NN__hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Size of hidden layers for neural networks
#         'NN__alpha': [0.0001, 0.001, 0.01]  # Regularization parameter for neural networks
#     }
    param_grid = {
        'Lasso__C': [0.1],  # Regularization parameter for lasso regression
        'Ridge__C': [0.01],  # Regularization parameter for ridge regression
        'GBM__learning_rate': [0.01],  # Learning rate for gradient boosting machine
        'GBM__n_estimators': [50],  # Number of trees for gradient boosting machine
        'RF__n_estimators': [50],  # Number of trees for random forest
        'XGB__learning_rate': [0.01],  # Learning rate for XGBoost
        'XGB__n_estimators': [50],  # Number of trees for XGBoost
        'NN__hidden_layer_sizes': [(50,)],  # Size of hidden layers for neural networks
        'NN__alpha': [0.01]  # Regularization parameter for neural networks
    }

    # Create a GridSearchCV object
    grid_search = GridSearchCV(voting_classifier, param_grid, cv=10, scoring='roc_auc')

    # Perform the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
#     print("Best fit parameters:", best_params)

    # Train the final VotingClassifier with the best hyperparameters on the full training set
    final_voting_classifier = grid_search.best_estimator_
    final_voting_classifier.fit(X_train, y_train)

    # Predict probabilities instead of binary outcomes on the test set
    y_pred_proba_test = final_voting_classifier.predict_proba(X_test)
    y_pred_test = final_voting_classifier.predict(X_test)
    X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size', 'response'], axis=1)
    y_pred = final_voting_classifier.predict_proba(X_dt)

    return y_pred


In [65]:
# ensemble_model_fit(dt_design_5, dt_design_5)

In [66]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split

# def ensemble_model_fit(data, data_pred):
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         data.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1),
#         data['response'],
#         test_size=0.2,
#         random_state=0
#     )

#     # Define the Logistic Regression model
#     logistic_regression = LogisticRegression(C=0.01, max_iter=200,random_state=0)

#     # Fit the Logistic Regression model on the training data
#     logistic_regression.fit(X_train, y_train)

#     # Print out the coefficients of the logistic regression model
#     print("Coefficients:", logistic_regression.coef_)
#     print("Intercept:", logistic_regression.intercept_)
#     print("Coefficients shape:", logistic_regression.coef_.shape)

#     # Predict probabilities instead of binary outcomes on the test set
#     y_pred_proba_test = logistic_regression.predict_proba(X_test)
#     y_pred_test = logistic_regression.predict(X_test)

#     # You can also predict probabilities for the prediction data (data_pred)
#     X_dt = data_pred.drop(['recruitment_plan', 'plan_response_rate', 'group_size','response'], axis=1)
#     y_pred = logistic_regression.predict_proba(X_dt)

#     return y_pred

# # Call the function
# # ensemble_model_fit(data, data_pred)


### Simulation starts

In [67]:
n_sim = 100
# create a list of random seeds
random.seed(42)
random_seeds = [random.randint(1, 100000) for _ in range(n_sim)]

design_list = [5,8,10]
patient_n_list = [5468,680,170] # n_patient_per_plan

n_design = 5
n_patient_per_plan = 5468
n_rounds = 5
total_n = n_patient_per_plan * (2**n_design)

## sample size determination
beta = 0.2
power = 1 - beta
alpha = 0.05
delta = 0.01 # effect size

## early stopping
epsilon = 0.001

In [68]:
rr_dict = {"step{}_rr".format(r): [] for r in range(1, n_rounds + 1)}
plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
sample_size_dict = {"step{}_sample_size".format(r): [] for r in range(1, n_rounds + 1)}
last_round_sample_size = []
random_rr_dict = {"step{}_random_max_rr".format(r): [] for r in range(1, n_rounds + 1)}
random_rr_dict.update({
    "step{}_random_mean_rr".format(r): [] for r in range(1, n_rounds + 1)
})

stopping_dict = {"early_stopping": [],
                 "early_stopping_plan":[],
                 "early_stopping_orr":[],
                 "early_stopping_size":[]}
final_plan_number = []
highest_rr_overall = []
highest_true_rr = []

better_chance = []

orr_total = []
orr_total_adaptive = []



i = 0

for seed in random_seeds:
    i += 1
    print(i)

    print([f"{key}: {len(value)}" for key, value in rr_dict.items()])



    event_list = 0
    event_list_adaptive = 0
    max_rr = []

    # step 1:
    ## Generate dataset
    dt_design_5 = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed)
    highest_true_rr.append(np.max(dt_design_5['plan_response_rate']))
    print("empirical response rate", dt_design_5['response'].mean())
    ## save for benchmark results
    maxidx = np.argmax(dt_design_5['plan_response_rate'])
    random_rr_dict['step1_random_max_rr'].append(dt_design_5.iloc[maxidx,6])
    random_rr_dict['step1_random_mean_rr'].append(dt_design_5['plan_response_rate'].mean())

    ## Ensemble modelling:
    y_pred = ensemble_model_fit(data=dt_design_5, data_pred = dt_design_5)

    ## select recruitment plan
    pred_df = pd.DataFrame(np.hstack((dt_design_5,  y_pred[:, 1].reshape(-1, 1))),
                             columns=list(dt_design_5.columns) + ['predicted_response_rate'])
    pred_df_rr = pred_df.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

    x = pred_df_rr['predicted_response_rate'].values
    if len(x) <= 10:
        best_k = 2
    else:
        best_k = kmeans_fit(data = x)['best_k']
    kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

    merged_df = pd.merge(pred_df_rr, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
    cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
    highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
    highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)


    p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))
    highest_cluster['p_vec'] = p_vec_next
    highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

    # highest_cluster = pred_df_rr

    # highest_cluster = pd.merge(highest_cluster, dt_design_5[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')
    # highest_cluster['cluster_number'] = highest_cluster['recruitment_plan']

    ## prepare to chance of better performance:
    temp = dt_design_5[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
    event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print("step1", np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
    # print(temp)
    p0 = (temp['plan_response_rate'].mean())

    rr_dict["step1_rr"].append(dt_design_5['plan_response_rate'].mean())
    sample_size_dict["step1_sample_size"].append(len(dt_design_5))

    for round in range(2, n_rounds+1):

        rr_dict["step"+str(round)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))
        plan_number_dict["step"+str(round)+"_plan_number"].append(len(p_vec_next))

        ## when it comes to the last round:
        if round == n_rounds:

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            if round == 2: # if the last round is round 2
                remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
            else:
                remaining_size -= len(dt_design_5_step2up)

            print("remaining size for last Round " + str(round) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            ## data generation, combine previous data:
            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                          design_number = n_design, n_rounds = n_rounds,
                                                          n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(0)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(len(temp), temp['plan_response_rate'].mean())
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



        ## If haven't reached the last round:
        ## check remaining sample size:
        if round == 2:
            remaining_size = total_n - len(dt_design_5) # apply to the rest of the patients
        else:
            remaining_size -= len(dt_design_5_step2up)

        print("remaining size for Round " + str(round) + ": " + str(remaining_size))

        ## determine whether move on to step 2:
        if len(highest_cluster) == 1: # if there is only one plan left

            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size) # record the last round sample size
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(1)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        ## sample size determination:
        if round == 2:
            dt_design_5_step2up_overall = dt_design_5
        orr_1 = dt_design_5_step2up_overall['response'].mean() # observed overall response rates for previous rounds
        orr_2 = orr_1 + delta
        n_1 = len(dt_design_5_step2up_overall)

        size_step2up = sample_size_calc(orr_1, n_1, delta=delta, alpha=alpha, power=power) # total size for dataset
        if size_step2up > 0 and size_step2up < 1000:
            size_step2up = 1000 # if size in [0,1000], then it is 1000 for this round.
        elif size_step2up >= 1000:
            size_step2up = min(size_step2up, int(total_n/n_rounds)) # dataset size capped by the n_patient_per_plan
        else:
        # if size_step2up <= 0:
            # the process stops at this step
            print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up) + 'lt 0, break')
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)
            for r in range(round+1, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                      n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_step2up['recruitment_plan'].unique()
            if round == 2:
                supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)] # data from step 1
            else:
                # step from previous rounds
                supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]

            dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(0)
            stopping_dict['early_stopping_size'].append(1)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_step2up_overall['plan_response_rate']))

            for r in range(round+1, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare for chance of better performance:
            temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)
            break

        print('Calculated size for Round ' + str(round) + ': ' + str(size_step2up))

        sample_size_dict["step"+str(round)+"_sample_size"].append(size_step2up)

        ## save benchmark results for last round:
        dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                            n_patient_per_plan = n_patient_per_plan, seed=seed+round)
        maxidx = np.argmax(dt_benchmark['plan_response_rate'])
        random_rr_dict["step"+str(round)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
        random_rr_dict["step"+str(round)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

        ## data generation, combine previous data:
        dt_design_5_step2up = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next,
                                                  design_number = n_design, n_rounds = n_rounds,
                                                  n_patient_per_plan = n_patient_per_plan, size = int(size_step2up), seed=seed)

        plan_list = dt_design_5_step2up['recruitment_plan'].unique()
        if round == 2:
            supp = dt_design_5[dt_design_5['recruitment_plan'].isin(plan_list)]
        else:
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
        dt_design_5_step2up_overall = pd.concat([dt_design_5_step2up, supp.reset_index(drop=True)], axis = 0)
        dt_design_5_step2up_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

        ## Ensemble model fitting:
        dt_design_5_step2up.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)
        y_pred2up = ensemble_model_fit(data = dt_design_5_step2up_overall, data_pred = dt_design_5_step2up)
        pred_df_step2up = pd.DataFrame(np.hstack((dt_design_5_step2up,  y_pred2up[:, 1].reshape(-1, 1))),
                                    columns=list(dt_design_5_step2up.columns) + ['predicted_response_rate'])
        pred_df_rr_step2up = pred_df_step2up.groupby([f'Design_Feature_{i+1}' for i in range(n_design)]+['recruitment_plan'])['predicted_response_rate'].mean().reset_index(name='predicted_response_rate')

        ## select recruitment plans:
        x = pred_df_rr_step2up['predicted_response_rate'].values

        if len(x) <= 10:
            best_k = 2
        else:
            best_k = kmeans_fit(data = x)['best_k']
        kmeans_results = kmeans_bhattacharyya(data=x, k=best_k)

        ## match the cluster results back to the original data
        highest_cluster_previous = highest_cluster # save previous cluster results

        merged_df = pd.merge(pred_df_rr_step2up, kmeans_results['clusters'][['predicted_response_rate', 'cluster_number']], on='predicted_response_rate')
        cluster_with_highest_rate = kmeans_results['clusters'].groupby('cluster_number')['predicted_response_rate'].mean().idxmax()
        highest_cluster = merged_df[merged_df['cluster_number']==cluster_with_highest_rate].reset_index(drop=True)
        highest_cluster.sort_values(by='predicted_response_rate', ascending=False, inplace=True)

        # p_vec_previous = p_vec_next # save p_vec of previous round
        p_vec_next = np.array(highest_cluster['predicted_response_rate']/np.sum(highest_cluster['predicted_response_rate']))

        highest_cluster['p_vec'] = p_vec_next

        highest_cluster = pd.merge(highest_cluster, dt_design_5_step2up[['recruitment_plan','plan_response_rate']].drop_duplicates(), how='left', on='recruitment_plan')

        ## prepare to chance of better performance:
        temp = dt_design_5_step2up[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
        event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print("step"+str(round), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
        # print(temp)

        ## check early stopping predicted ORR:
        orr_df = pd.merge(highest_cluster_previous, highest_cluster[['recruitment_plan','predicted_response_rate','p_vec']], on='recruitment_plan', how='left')
        orr_df.fillna(0, inplace=True)
        p_orr_1 = np.dot(np.array(orr_df['p_vec_x']), np.array(orr_df['predicted_response_rate_y']))
        p_orr_2 = np.dot(np.array(orr_df['p_vec_y']), np.array(orr_df['predicted_response_rate_y']))
        print("orr termination", p_orr_1, p_orr_2, p_orr_2 - p_orr_1)

        if (p_orr_2 - p_orr_1 < epsilon):
            # step 3 use the same strategy of step2
            print(i, p_orr_1, p_orr_2, "early stop at Round " + str(round))
            rr_dict["step"+str(round+1)+"_rr"].append(np.dot(np.array(highest_cluster['p_vec']), np.array(highest_cluster['plan_response_rate'])))

            ## save benchmark results for last round:
            dt_benchmark = generate_data_step1(design_number = n_design, n_rounds = n_rounds,
                                                n_patient_per_plan = n_patient_per_plan, seed=seed+round)
            maxidx = np.argmax(dt_benchmark['plan_response_rate'])
            random_rr_dict["step"+str(round+1)+'_random_max_rr'].append(dt_benchmark.iloc[maxidx,6])
            random_rr_dict["step"+str(round+1)+'_random_mean_rr'].append(dt_benchmark['plan_response_rate'].mean())

            ### update remaining size:
            remaining_size -= len(dt_design_5_step2up)
            print("early stop, remaining size for Round" + str(round + 1) + ": " + str(remaining_size))
            sample_size_dict["step"+str(round)+"_sample_size"].append(remaining_size)
            last_round_sample_size.append(remaining_size)

            for r in range(round+2, n_rounds+1):
                sample_size_dict["step"+str(r)+"_sample_size"].append(np.nan)

            dt_design_5_rest = generate_data_step2up(highest_cluster=highest_cluster, p_vec = p_vec_next, 
                                                      design_number = n_design, n_rounds = n_rounds,
                                                      n_patient_per_plan = n_patient_per_plan, size = int(remaining_size), seed=seed)
            plan_list = dt_design_5_rest['recruitment_plan'].unique()
            supp = dt_design_5_step2up_overall[dt_design_5_step2up_overall['recruitment_plan'].isin(plan_list)]
            dt_design_5_rest_overall = pd.concat([dt_design_5_rest, supp.reset_index(drop=True)], axis = 0)
            dt_design_5_rest_overall.drop(['predicted_response_rate','cluster_number','p_vec'], axis=1, inplace=True)

            stopping_dict['early_stopping'].append(1)
            stopping_dict['early_stopping_plan'].append(0)
            stopping_dict['early_stopping_orr'].append(1)
            stopping_dict['early_stopping_size'].append(0)

            ## record final plan number
            final_plan_number.append(len(p_vec_next))

            ## record highest responserate overall
            highest_rr_overall.append(np.max(dt_design_5_rest_overall['plan_response_rate']))

            for r in range(round+2, n_rounds+1):
                rr_dict["step"+str(r)+"_rr"].append(np.nan)

            ## prepare to chance of better performance:
            temp = dt_design_5_rest[['recruitment_plan','plan_response_rate', 'group_size']].drop_duplicates()
            event_list += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            event_list_adaptive += (np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print("step"+str(round+1), np.dot(temp['plan_response_rate'].values, temp['group_size'].values))
            # print(temp)

            result = binomtest(int(event_list), n=total_n, p=p0)
            # print(int(event_list), total_n, p0)
            orr_total.append(event_list)
            orr_total_adaptive.append(event_list_adaptive)
            better_chance.append(1 if result.pvalue < 0.05 else 0)
            # print(result.pvalue)

            break



1
['step1_rr: 0', 'step2_rr: 0', 'step3_rr: 0', 'step4_rr: 0', 'step5_rr: 0']
empirical response rate 0.06653133577310155
remaining size for Round 2: 140000
Calculated size for Round 2: 4908.31412781265
orr termination 0.07239084048838401 0.108864950035636 0.036474109547251984
remaining size for Round 3: 135092
Calculated size for Round 3: 18789.28283219626
orr termination 0.056070479665173506 0.10859591843757599 0.05252543877240248
remaining size for Round 4: 116303
2
['step1_rr: 1', 'step2_rr: 1', 'step3_rr: 1', 'step4_rr: 1', 'step5_rr: 1']
empirical response rate 0.056867566331198535
remaining size for Round 2: 140000
Calculated size for Round 2: 4261.723102741266
orr termination 0.06990515775553588 0.0918522797419365 0.021947121986400614
remaining size for Round 3: 135739
Calculated size for Round 3: 6630.994202519784
orr termination 0.04900261103229436 0.09670133451491603 0.04769872348262166
remaining size for Round 4: 129109
Calculated size for Round 4: 6891.340307285635
orr ter



orr termination 0.05171217200559506 0.10486987576033742 0.05315770375474236
remaining size for Round 4: 121332
68
['step1_rr: 67', 'step2_rr: 67', 'step3_rr: 67', 'step4_rr: 67', 'step5_rr: 67']
empirical response rate 0.06229986276303751
remaining size for Round 2: 140000
Calculated size for Round 2: 4624.3281956557985
orr termination 0.08399250365101185 0.11589993426244644 0.031907430611434584
remaining size for Round 3: 135376
Calculated size for Round 3: 12291.595071104552
orr termination 0.024155777012510617 0.11789179829696794 0.09373602128445732
remaining size for Round 4: 123085
69
['step1_rr: 68', 'step2_rr: 68', 'step3_rr: 68', 'step4_rr: 68', 'step5_rr: 68']
empirical response rate 0.06352927721866422
remaining size for Round 2: 140000
Calculated size for Round 2: 4706.700674534622
orr termination 0.057528218637091455 0.09700799615946554 0.03947977752237408
remaining size for Round 3: 135294
Calculated size for Round 3: 7755.920343795331
orr termination 0.07062767933997714 0

### Results summary

In [69]:
rr_df = pd.DataFrame(rr_dict)
rr_df

Unnamed: 0,step1_rr,step2_rr,step3_rr,step4_rr,step5_rr
0,0.054501,0.092187,0.094354,0.098086,
1,0.057829,0.080840,0.086158,0.095127,0.095439
2,0.062228,0.091219,0.096603,0.097474,
3,0.054011,0.081631,0.088647,0.093782,0.093964
4,0.056766,0.083785,0.089598,0.091335,0.098844
...,...,...,...,...,...
95,0.052518,0.084990,0.094254,,
96,0.042686,0.062228,0.085422,0.084863,
97,0.053834,0.087139,0.089993,0.097313,
98,0.045935,0.059973,0.077932,0.082792,0.088458


In [70]:
# Calculate average rounds:
row_non_nan_counts = rr_df.count(axis=1)

print(f"{np.mean(row_non_nan_counts):.1f} ({np.std(row_non_nan_counts):.1f})")

4.0 (0.9)


In [71]:
# orr for each round:
result_dict = {}
for key, values in rr_df.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"

for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_rr: 0.055 (0.005)
step2_rr: 0.084 (0.008)
step3_rr: 0.092 (0.005)
step4_rr: 0.095 (0.004)
step5_rr: 0.096 (0.004)


In [72]:
# overall orr:
result_dict = np.array(orr_total) / total_n
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.087, StD: 0.003


In [73]:
# highest true rr:
print(np.mean(highest_true_rr))
print(np.std(highest_true_rr))

0.09698231905749155
0.003161117477580081


In [74]:
# adaptive learning orr:
result_dict = np.array(orr_total_adaptive) / (total_n-34976)
mean_result = np.mean(result_dict)
std_result = np.std(result_dict)

print(f"Mean: {mean_result:.3f}, StD: {std_result:.3f}")

Mean: 0.095, StD: 0.004


In [75]:
# average plan number for each round:
result_dict = {}
for key, values in plan_number_dict.items():
    mean_value = np.mean(values)
    std_value = np.std(values)
    result_dict[key] = f"{mean_value:.1f} ({std_value:.1f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_plan_number: nan (nan)
step2_plan_number: 8.6 (5.8)
step3_plan_number: 3.4 (2.5)
step4_plan_number: 2.1 (1.5)
step5_plan_number: 1.5 (1.0)


In [76]:
# early stopping probabilities:
means = {key: np.mean(values) for key, values in stopping_dict.items()}
means

{'early_stopping': 0.63,
 'early_stopping_plan': 0.63,
 'early_stopping_orr': 0.0,
 'early_stopping_size': 0.0}

In [77]:
# sample size for the last round:
print(np.mean(last_round_sample_size), np.std(last_round_sample_size))

126510.88 7489.491129950018


In [78]:
# plan_number_dict = {"step{}_plan_number".format(r): [] for r in range(1, n_rounds + 1)}
result_dict = {}
for key, values in sample_size_dict.items():
    mean_value = np.nanmean(values)
    std_value = np.nanstd(values)
    result_dict[key] = f"{mean_value:.3f} ({std_value:.3f})"
#     result_dict[key + "_std"] = std_value
for key, value in list(result_dict.items())[:12]:
    print(f"{key}: {value}")

step1_sample_size: 34976.000 (0.000)
step2_sample_size: 9549.985 (26631.908)
step3_sample_size: 42045.446 (55815.379)
step4_sample_size: 64586.162 (58503.240)
step5_sample_size: 119543.865 (2966.416)


In [79]:
# probability of better performance compared to the benchmark:
np.mean(better_chance)

1.0