In [None]:
import math
import operator
import os
import statistics

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 1. Load Classification Results

In [None]:
# Select the dataset variant and the outer fold whose results are analysed.
# `fold` is stored as a string because it is concatenated into file paths
# here and in later cells.
dataset = 'i'
fold = str(4)

# Shared stem of the three result pickles written for this outer fold.
results_stem = "./X" + dataset + "_results/results/fold" + fold + "/outerfold" + fold

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_results_of_each_param = pd.read_pickle(results_stem + "_results_for_each_param_setting.pkl")
df_results_of_each_fold = pd.read_pickle(results_stem + "_results_for_each_fold.pkl")
good_results = pd.read_pickle(results_stem + "_good_results.pkl")

In [None]:
# Sanity check: show the 'precision_score' entry stored for row 0 of the good results.
print(good_results.at[0,'precision_score'])

In [None]:
# Preview the first rows of the per-parameter-setting results.
df_results_of_each_param.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the per-parameter-setting results.
df_results_of_each_param.info()

In [None]:
# Take the 'test_index' entry of row 0 so it can be inspected.
tes = df_results_of_each_param.at[0, "test_index"]

In [None]:
# Show the 'test_index' entry held in `tes`.
print(tes)

In [None]:
# Preview the first rows of the per-fold results.
df_results_of_each_fold.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the good results.
good_results.info()

In [None]:
# Dump the good-results frame as plain text.
# NOTE(review): this prints the whole frame; `good_results.head()` would give a shorter, richer preview.
print(good_results)

# 2. Helper Functions

### 2.1 Calculate how many instances of a particular subtype are correctly classified

In [None]:
def calculateCorrectlyClassifiedSubjects(y_pred, y_act, test_indexes, subtype, num_P, num_S):
    """Count the instances predicted as ``subtype`` whose actual label agrees with it.

    Subtypes ``0 .. num_P - 1`` are treated as Progressor subtypes and any
    other subtype as a Stable one. An instance is counted when its predicted
    subtype equals ``subtype`` and its actual label is 2 (progressor) for a
    Progressor subtype, or 1 (stable) otherwise.

    Arguments
    ---------
        y_pred: numpy.array
            Predicted subtype of every test instance.
        y_act: numpy.array
            Actual label of every test instance (1=stable, 2=progressor).
        test_indexes: list
            Index of every test instance, aligned with ``y_pred``.
        subtype: int
            The subtype to evaluate.
        num_P: int
            Number of Progressor subtypes.
        num_S: int
            Number of Stable subtypes (not needed for the computation).

    Returns
    -------
        count: int
            Number of correctly classified instances.
        index_of_intances_correctly_classified: list
            Entries of ``test_indexes`` for the correctly classified instances.
    """
    progressor_subtypes = np.arange(0, num_P)
    # Actual-label encoding: 1 = stable, 2 = progressor.
    expected_label = 2 if subtype in progressor_subtypes else 1

    matched_indexes = [
        test_indexes[position]
        for position in range(len(y_pred))
        if y_pred[position] == subtype and y_act[position] == expected_label
    ]
    return len(matched_indexes), matched_indexes

In [None]:
# test_c =pd.read_pickle("./Xs_results/results/fold0/arranged_good.pkl")
# print(test_c.iloc[8])

In [None]:
# # print('test_indexes')
# indexes_t = test_c.iloc[8]['test_index']
# # print(test_c.iloc[0]['test_index'])

# # print('pred_y')
# pred_y_t = test_c.iloc[8]['pred_y']
# # print(test_c.iloc[0]['pred_y'])
# # print('act_y')
# act_y_t = test_c.iloc[8]['act_y']
# # print(test_c.iloc[0]['act_y'])

# results = calculateCorrectlyClassifiedSubjects(pred_y_t, act_y_t, indexes_t, 2, 4, 4)

### 2.2 Calculate percentage of mutual features selected across all cross validation folds for a particular parameter setting

In [None]:
def featureSelectionValidation(df_results_of_each_fold, num_P, num_S, classifier, weighting, num_Features):
    """Measure how many features were selected in every fold of one parameter setting.

    The rows of ``df_results_of_each_fold`` matching the given parameter
    setting are collected and their ``columns_selected`` entries intersected.

    Arguments
    ---------
        df_results_of_each_fold: pandas.DataFrame
            Dataframe that stores results for each fold. Must provide the
            columns ``num_P``, ``num_S``, ``classifier``, ``num_Features``,
            ``weighting`` and ``columns_selected``.
        num_P: int
            Number of Progressor subtypes.
        num_S: int
            Number of Stable subtypes.
        classifier: int
            Integer code of the classifier used.
            0: linear SVM
            1: RandomForest
            2: Logistic Regression
        weighting: int
            Weighting value for the classifier.
        num_Features: int
            Number of features to be selected during the feature selection step.

    Returns
    -------
        percentage_mutual_features: float
            Number of mutually selected features divided by ``num_Features``
            (0.0 when ``num_Features`` is 0 or no row matches).
        list_mf: list
            List of the mutually selected features (in no particular order).
        num_mf: int
            Number of the mutually selected features.
    """
    matches = (
        (df_results_of_each_fold['num_P'] == num_P) &
        (df_results_of_each_fold['num_S'] == num_S) &
        (df_results_of_each_fold['classifier'] == classifier) &
        (df_results_of_each_fold['num_Features'] == num_Features) &
        (df_results_of_each_fold['weighting'] == weighting)
    )
    per_fold_features = list(df_results_of_each_fold[matches]['columns_selected'])

    # No fold recorded for this setting: there is nothing to intersect.
    if not per_fold_features:
        return 0.0, [], 0

    mutual_features = set(per_fold_features[0]).intersection(*per_fold_features)
    num_mf = len(mutual_features)
    # Guard the ratio against a zero feature budget.
    percentage_mutual_features = float(num_mf / num_Features) if num_Features else 0.0
    return percentage_mutual_features, list(mutual_features), num_mf

### 2.3 Calculate the frequencies of features selected

In [None]:
def computeFeatureFrequencies(df_results_of_each_fold, num_P, num_S, classifier, num_Features, weighting):
    """Count how often each feature was selected across the folds of one parameter setting.

    Arguments
    ---------
        df_results_of_each_fold: pandas.DataFrame
            Dataframe that stores results for each fold. Must provide the
            columns ``num_P``, ``num_S``, ``num_Features``, ``classifier``,
            ``weighting`` and ``columns_selected``.
        num_P: int
            Number of Progressor subtypes.
        num_S: int
            Number of Stable subtypes.
        classifier: int
            Integer code of the classifier used.
            0: linear SVM
            1: RandomForest
            2: Logistic Regression
        num_Features: int
            Number of features to be selected during the feature selection step.
        weighting: int
            Weighting value for the classifier.

    Returns
    -------
        dic: dictionary
            Maps every selected feature to the number of matching rows that
            selected it, in order of first appearance.
        features_list: list
            All selected features of the matching rows, concatenated row by
            row (repeats included).
    """
    frame = df_results_of_each_fold
    selector = (
        (frame['num_P'] == num_P) &
        (frame['num_S'] == num_S) &
        (frame['num_Features'] == num_Features) &
        (frame['classifier'] == classifier) &
        (frame['weighting'] == weighting)
    )
    selections = frame[selector]['columns_selected']

    dic = {}
    features_list = []
    for selected in selections:
        for feature in selected:
            dic[feature] = dic.get(feature, 0) + 1
            features_list.append(feature)

    return dic, features_list

In [None]:
import math
def featuresThreshold(features_frequencies, features_selected_list, threshold, max_frequency=12):
    """Check that the leading features were all selected often enough.

    The first ``ceil(len(features_selected_list) * threshold)`` entries are
    inspected; the check passes when each of their frequencies is at least
    ``floor(threshold * max_frequency)``.

    Arguments
    ---------
        features_frequencies: list
            Frequency of every feature, aligned with ``features_selected_list``
            (callers pass them sorted by decreasing frequency).
        features_selected_list: list
            The selected features; only its length is used.
        threshold: float
            Fraction used both for the share of features inspected and for the
            minimum frequency required.
        max_frequency: int
            Frequency that corresponds to a threshold of 1.0. Defaults to 12,
            the value previously hard-coded here.
            NOTE(review): presumably the number of folds/runs a feature can be
            selected in - confirm against the training pipeline.

    Returns
    -------
        bool
            False as soon as one inspected feature falls below the minimum
            frequency, True otherwise (including for empty input).
    """
    cut_off = math.ceil(len(features_selected_list) * threshold)
    min_frequency = math.floor(threshold * max_frequency)

    # Slicing clamps the cut-off to the number of features actually available.
    inspected = len(features_selected_list[:cut_off])
    return not any(features_frequencies[position] < min_frequency
                   for position in range(inspected))

### 2.4 Plot histogram for frequencies of the features selected

In [None]:
def plotFigures(df_results_of_each_param, feature_frequencies, PPV, num_c, num_P, num_S, classifier, num_Features, subtype, stability, weighting, indexes, analysis, path):
    """Plot feature-selection frequencies and per-instance predictions, and save them as one png.

    The upper panel is a bar chart of ``feature_frequencies`` sorted by
    decreasing frequency (the first ``num_Features`` bars are red, the rest
    blue). The lower panel plots the predicted subtype of every test instance
    against its test index; instances listed in ``indexes`` are drawn in red.
    The figure is written to ``path + 'analysis<analysis>.png'``, shown, and
    then closed.

    Arguments
    ---------
        df_results_of_each_param: pandas.DataFrame
            Dataframe that stores results for each parameter setting. The
            first row matching the given setting supplies ``test_index`` and
            ``pred_y``.
        feature_frequencies: dictionary
            Dictionary containing all features as keys and the frequencies as values.
        PPV: float
            PPV value of the parameter setting (shown in the title).
        num_c: int
            Number of subjects correctly classified (shown in the title).
        num_P: int
            Number of Progressor subtypes.
        num_S: int
            Number of Stable subtypes.
        classifier: int
            Integer code of the classifier used.
            0: linear SVM
            1: RandomForest
            2: Logistic Regression
        num_Features: int
            Number of features to be selected during the feature selection step.
        subtype: int
            Subtype with highest PPV value (shown in the title).
        stability: int
            Stability of the parameter setting (shown in the title).
        weighting: int
            Weighting value for the classifier.
        indexes: list
            Test indexes of the instances to highlight in red.
        analysis: int
            Analysis number, used in the output file name.
        path: string
            Parent path to store the images; must end with a path separator
            because the file name is appended directly.

    Returns
    -------
        None
    """
    df = df_results_of_each_param[
        (df_results_of_each_param['num_P'] == num_P) &
        (df_results_of_each_param['num_S'] == num_S) &
        (df_results_of_each_param['num_Features'] == num_Features) &
        (df_results_of_each_param['classifier'] == classifier) &
        (df_results_of_each_param['weighting'] == weighting)
    ]

    # Only the first matching row is plotted.
    test_index = df['test_index'].values.tolist()[0]
    pred_y = df['pred_y'].values.tolist()[0]

    # Order the predictions by test index so the x axis is increasing.
    sorted_test_index = sorted(test_index)
    sorted_pred_y = [y for x, y in sorted(zip(test_index, pred_y))]

    # Features ranked by decreasing selection frequency.
    ranked = sorted(feature_frequencies.items(), key=lambda item: item[1], reverse=True)
    label = [feature for feature, _ in ranked]
    frequencies = [frequency for _, frequency in ranked]
    index = np.arange(len(label))

    fig = plt.figure(figsize=(20, 10), dpi=80)

    # Upper panel: feature-frequency bar chart
    ######################################################################################
    ax1 = plt.subplot(211)

    title = f"PPV: {PPV}\n Number of Subjects Correctly Classified: {num_c}\n Number of Progressor Subtypes: {num_P} \n Number of Stable Subtypes: {num_S} \n Classifier: {classifier} \n Number of Features: {num_Features}\n Class with High PPV:{subtype}\n Stability: {stability}\n Weighting: {weighting}"
    plt.title(title)
    plt.xticks(index, label, rotation=50)

    for i in index:
        # The num_Features most frequent features are highlighted in red.
        bar_color = 'red' if i < num_Features else 'blue'
        plt.bar(i, frequencies[i], color=bar_color)
        ax1.annotate(frequencies[i],
                     (i, frequencies[i]),
                     xytext=(0, 0),
                     textcoords='offset points')

    ax1.set_xlabel('Feature Index')
    ax1.set_ylabel('Frequency')

    # Lower panel: predicted subtype per test instance
    ######################################################################################
    ax2 = plt.subplot(212)
    ax2.title.set_text('Confusion Matrix')

    for i, v in enumerate(sorted_pred_y):
        marker_color = 'red' if sorted_test_index[i] in indexes else 'black'
        plt.plot(sorted_test_index[i], v, 'x', color=marker_color)

    # NOTE(review): 250.5 is hard-coded; judging by the two text labels below it
    # presumably separates progressor from stable scan indexes - confirm per dataset.
    plt.axvline(x=250.5, color='r', linestyle='-')
    # Horizontal separator between Progressor subtypes (below) and Stable subtypes (above).
    plt.axhline(y=num_P-0.5, color='k', linestyle='-')

    plt.yticks(np.arange(0, num_P+num_S, 1.0))
    ax2.set_xlabel('Instance indexes')

    plt.text(0.25, 0.06, "Progressor Scans", fontsize=16, transform=plt.gcf().transFigure)
    plt.text(0.75, 0.06, "Stables Scans", fontsize=16, transform=plt.gcf().transFigure)

    plt.tight_layout(pad=3.0)

    name = 'analysis{}.png'.format(str(analysis))
    file_path = path + name

    plt.savefig(file_path)

    plt.show()
    # Release the figure: this function is called once per result row, and
    # unclosed figures accumulate in memory.
    plt.close(fig)

# 3. Main Entry Point of the Program

In [None]:
# Pair every row of `good_results` with its parameter tuple
# (num_P, num_S, classifier, weighting) and its positional row number.
num_P_column = np.array(good_results['num_P'])
num_S_column = np.array(good_results['num_S'])
classifier_column = np.array(good_results['classifier'])
weighting_column = np.array(good_results['weighting'])

param_set = []
# `param` is deliberately a plain loop variable: a later cell reads the value
# it holds after the loop finishes.
for i, key in enumerate(zip(num_P_column, num_S_column, classifier_column, weighting_column)):
    param = [key, i]
    print(param)
    param_set.append(param)

In [None]:
# Number of (parameter tuple, row) pairs collected from the good results.
print(len(param_set))
# 'precision_score' entry of row 0, for reference.
print(good_results.at[0,'precision_score'])

In [None]:
# Distinct (num_P, num_S, classifier, weighting) tuples present in `param_set`.
unique_param_set = list(set(entry[0] for entry in param_set))
print(unique_param_set)
print(len(unique_param_set))

In [None]:
# Column layout of the filtered-results table filled in by the main loop.
result_df_col = [
    'PPV',
    'Subjects Correctly Classified',
    'Number of Progressor Subtypes',
    'Number of Stable Subtypes',
    'Number of Features',
    'Classifier',
    'Weighting',
    'Class with High PPV',
    'Stability of High PPV',
    'Number of features mutually selected',
    'Percentage of Mutual Features Selected across all folds',
    'Features selected with highest frequencies across all folds',
    'Frequencies of the features',
    'Indexes of Instances Correctly Classified',
]

# Start from an empty table that already carries the columns above.
result_df = pd.DataFrame(columns=result_df_col)


In [None]:
# Show one parameter tuple as an example of the (num_P, num_S, classifier, weighting) layout.
print(unique_param_set[0])

In [None]:
# NOTE: `param` is the variable left behind by the loop that built `param_set`
# (its last entry), so this cell only works after that loop has run.
print(good_results.at[param[1], 'precision_score'])

In [None]:
# Regroup the good results so that rows sharing a parameter tuple are
# contiguous, then persist the rearranged table as csv and pickle.
print(good_results.columns)

dfs = []
for u in unique_param_set:
    print(u)
    u_num_P, u_num_S, u_classifier, u_weighting = u
    same_setting = (
        (good_results['num_P'] == u_num_P) &
        (good_results['num_S'] == u_num_S) &
        (good_results['classifier'] == u_classifier) &
        (good_results['weighting'] == u_weighting)
    )
    dfs.append(good_results[same_setting])

# A fresh 0..n-1 index lets later cells address rows by position via `.at`.
demo_output = pd.concat(dfs, ignore_index=True)

arranged_stem = "./X"+dataset + "_results/results/fold" + fold+ "/arranged_good"
demo_output.to_csv(arranged_stem + ".csv")
demo_output.to_pickle(arranged_stem + ".pkl")

In [None]:
# Rebuild `param_set` against the rearranged table so that the stored row
# numbers refer to `demo_output` instead of `good_results`.
num_P_column = np.array(demo_output['num_P'])
num_S_column = np.array(demo_output['num_S'])
classifier_column = np.array(demo_output['classifier'])
weighting_column = np.array(demo_output['weighting'])
n_f = np.array(demo_output['num_Features'])

param_set = []
for i, key in enumerate(zip(num_P_column, num_S_column, classifier_column, weighting_column)):
    param = [key, i]
    print(param)
    print('num_Features', n_f[i])
    param_set.append(param)

In [None]:
# Filter the good results down to stable, high-PPV parameter settings.
#
# For every parameter tuple (num_P, num_S, classifier, weighting):
#   1. stack the per-subtype precision scores of all its rows;
#   2. for each subtype whose best score exceeds PPV_THRESHOLD, find runs
#      ("streaks") of rows with consecutive 'num_Features' values that all
#      exceed the threshold;
#   3. keep streaks of at least MIN_STREAK_LENGTH rows whose peak PPV exceeds
#      PEAK_PPV_THRESHOLD;
#   4. within each kept streak, pick the peak-PPV row that classifies the
#      most subjects correctly and whose leading features pass
#      `featuresThreshold`, and record it.
#
# NOTE(review): step 1 assumes the values of the 'precision_score' dict are
# ordered by subtype id, and step 2 assumes the rows of one parameter tuple
# are ordered by increasing 'num_Features' - confirm against the producer.

CLASSIFIER_NAMES = {0: "SVM", 1: "RF", 2: "LR"}
PPV_THRESHOLD = 0.8              # a row/subtype is "good" above this PPV
MIN_STREAK_LENGTH = 3            # minimum number of consecutive good rows
PEAK_PPV_THRESHOLD = 0.94        # a streak must peak above this PPV
MIN_CORRECT_SUBJECTS = 1         # more than this many subjects must be correct
FEATURE_FREQUENCY_THRESHOLD = 0.5

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
result_rows = []

for unique_param in unique_param_set:
    # Unknown classifier codes now fail loudly instead of silently reusing
    # the name left over from a previous iteration.
    classifier = CLASSIFIER_NAMES[unique_param[2]]

    # Precision scores (one list per row) and the demo_output row numbers of
    # every row that belongs to this parameter tuple.
    score_list = []
    original_index = []
    for param in param_set:
        if param[0] == unique_param:
            score_list.append(list(demo_output.at[param[1], 'precision_score'].values()))
            original_index.append(param[1])

    score_arr = np.array(score_list)
    print(score_arr.shape)

    # Best score reached by each subtype over all rows of this tuple.
    subtype_max_list = np.max(score_arr, axis=0)
    good_ppv_subtypes = [i for i, v in enumerate(subtype_max_list) if v > PPV_THRESHOLD]

    for subtype in good_ppv_subtypes:
        # (row position within this tuple, PPV) of every row above the threshold.
        queue = [(index, score_arr[index][subtype])
                 for index in range(score_arr.shape[0])
                 if score_arr[index][subtype] > PPV_THRESHOLD]
        if len(queue) < MIN_STREAK_LENGTH:
            continue

        # Split the queue into streaks of consecutive 'num_Features' values.
        # Each streak entry is ((position, ppv), position).
        streaks, streak = [], []
        count = demo_output.at[original_index[0], 'num_Features']
        for j, ppv in queue:
            n_fea = demo_output.at[original_index[j], 'num_Features']
            if count + 1 == n_fea:
                streak.append(((j, ppv), j))
                count += 1
            else:
                if len(streak) >= MIN_STREAK_LENGTH:
                    streaks.append(streak)
                streak = [((j, ppv), j)]
                count = n_fea

        # The trailing streak is held to the same minimum length as the
        # others (it was previously tested with a stricter `> 3`).
        if len(streak) >= MIN_STREAK_LENGTH:
            streaks.append(streak)

        if len(streaks) == 0:
            continue

        final_streaks = [stk for stk in streaks
                         if np.max([entry[0][1] for entry in stk]) > PEAK_PPV_THRESHOLD]

        for fstk in final_streaks:
            stability = len(fstk)
            max_ppv = np.max([entry[0][1] for entry in fstk])

            # Rows of the streak that reach the peak PPV.
            indexes = [entry[1] for entry in fstk if entry[0][1] == max_ppv]
            init_indexes = [original_index[i] for i in indexes]
            num_Features = [demo_output.at[i, 'num_Features'] for i in init_indexes]

            correctly_classified = [
                calculateCorrectlyClassifiedSubjects(
                    demo_output.at[i, 'pred_y'],
                    demo_output.at[i, 'act_y'],
                    demo_output.at[i, 'test_index'],
                    subtype,
                    demo_output.at[i, 'num_P'],
                    demo_output.at[i, 'num_S'])
                for i in init_indexes
            ]

            correct_counts = [n_correct for n_correct, _ in correctly_classified]
            max_subject = max(correct_counts)
            if max_subject <= MIN_CORRECT_SUBJECTS:
                continue

            best_indexes = [i for i, n_correct in enumerate(correct_counts) if n_correct == max_subject]
            if len(best_indexes) > 1:
                # Tie-break: take the candidate whose feature count is closest
                # to the mean feature count of the tied candidates.
                nfeats = [num_Features[i] for i in best_indexes]
                nfeas_mid = statistics.mean(nfeats)
                bi = np.argmin([abs(d - nfeas_mid) for d in nfeats], axis=0)
                best_fs_index = best_indexes[bi]
            else:
                best_fs_index = best_indexes[0]

            n_selected = num_Features[best_fs_index]

            dic, _ = computeFeatureFrequencies(df_results_of_each_fold,
                                               unique_param[0],
                                               unique_param[1],
                                               unique_param[2],
                                               n_selected,
                                               unique_param[3])

            # Keep the n_selected most frequently chosen features.
            ranked = sorted(dic.items(), key=lambda item: item[1], reverse=True)[:n_selected]
            features_selected_list = [feature for feature, _ in ranked]
            features_frequencies = [frequency for _, frequency in ranked]

            if not featuresThreshold(features_frequencies, features_selected_list, FEATURE_FREQUENCY_THRESHOLD):
                continue

            # Mutual-feature statistics are only needed for the chosen candidate.
            percentage_mf, _, num_mf = featureSelectionValidation(df_results_of_each_fold,
                                                                  unique_param[0],
                                                                  unique_param[1],
                                                                  unique_param[2],
                                                                  unique_param[3],
                                                                  n_selected)

            result_rows.append({
                'PPV': max_ppv,
                'Subjects Correctly Classified': correctly_classified[best_fs_index][0],
                'Number of Progressor Subtypes': unique_param[0],
                'Number of Stable Subtypes': unique_param[1],
                'Number of Features': n_selected,
                'Classifier': classifier,
                'Weighting': unique_param[3],
                'Class with High PPV': subtype,
                'Stability of High PPV': stability,
                'Number of features mutually selected': num_mf,
                'Percentage of Mutual Features Selected across all folds': percentage_mf,
                'Features selected with highest frequencies across all folds': features_selected_list,
                'Frequencies of the features': features_frequencies,
                'Indexes of Instances Correctly Classified': correctly_classified[best_fs_index][1]
            })

# Building the frame here (instead of appending to the previous `result_df`)
# also makes the cell idempotent when it is re-run.
result_df = pd.DataFrame(result_rows, columns=result_df_col)

In [None]:
# Persist the filtered results next to the other outputs of this fold,
# once as csv (for inspection) and once as pickle (re-read by the plotting cell).
filtered_stem = "./X"+dataset+"_results/results/fold" + fold + "/filtered_good_results_fold" + fold
result_df.to_csv(filtered_stem + ".csv")
result_df.to_pickle(filtered_stem + ".pkl")

In [None]:
# Draw and save one figure per filtered result row.
filtered_results_filtered = pd.read_pickle("./X"+dataset+"_results/results/fold" + fold + "/filtered_good_results_fold" + fold + ".pkl")
path = './X'+dataset+'_results/results/fold' + fold + '/inner_cv_analysis/'

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)

# Classifier name -> integer code used in the result frames; any name other
# than "SVM" or "RF" maps to 2, as before.
CLASSIFIER_CODES = {"SVM": 0, "RF": 1}

for index, row in filtered_results_filtered.iterrows():
    ppv = row['PPV']
    num_c = row['Subjects Correctly Classified']
    num_P = row['Number of Progressor Subtypes']
    num_S = row['Number of Stable Subtypes']
    num_Features = row['Number of Features']
    classifier = CLASSIFIER_CODES.get(row['Classifier'], 2)
    index_of_correct_instances = row['Indexes of Instances Correctly Classified']
    stability = row['Stability of High PPV']
    subtype = row['Class with High PPV']
    weighting = row['Weighting']

    # Selection frequency of every feature for this parameter setting.
    dic, _ = computeFeatureFrequencies(df_results_of_each_fold, num_P, num_S, classifier, num_Features, weighting)

    # The row label doubles as the analysis number in the output file name.
    plotFigures(df_results_of_each_param,
                dic,
                ppv,
                num_c,
                num_P,
                num_S,
                classifier,
                num_Features,
                subtype,
                stability,
                weighting,
                index_of_correct_instances,
                index,
                path)
    