In [None]:
import copy
import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, label_binarize

In [2]:
# Execute the helper notebooks so that the names they define become available
# in this kernel.
# NOTE(review): the helper functions called in the cells below (feature
# selection, preprocessing, grid-search builder, ...) are presumably defined in
# these notebooks — confirm which notebook provides which name.
%run feature_selection_methods.ipynb
%run data_normalization.ipynb
%run gridsearch.ipynb

In [None]:
# Resolve the classifier for this run from `request`. The call yields three
# values; `gridsearch_para` is later handed to `gridsearch_bulid` in the
# grid-search cell.
# NOTE(review): the cells below refer to `svc` and `clf_name`, which are not
# assigned anywhere in this notebook — presumably they are meant to be `model`
# and `model_name`; confirm and unify the names.
model, model_name, gridsearch_para = select_class_model(request)

## classification

In [None]:
# --- Preprocessing -----------------------------------------------------------
# Split the input into train / test / blind partitions, then turn the training
# partition into a feature table (`data`) and labels (`label3`).
train_set, test_set, blind_set = split_train_test(inputdata)
data, label = classification_process(train_set)
label3, classes = label_pre(label)

# Pre-define BOTH validation containers. Previously only `validation_data` was
# initialised, so an empty `test_set` left `validation_label` undefined (or
# stale from an earlier run) for the validation cell further down.
validation_data, validation_label = [], []
if len(test_set) > 0:
    validation_data, validation_label = classification_process(test_set)
    validation_label, ll = label_pre(validation_label)

# --- Feature selection -------------------------------------------------------
if fsm == 'A':
    features = selectkbest_top20(data, label3, k=50)
    Fsm = 'ANOVA'
elif fsm == 'M':
    # form_action: classification or regression
    features = mrmr_fs(data, label3, form_action)
    Fsm = 'MRMR'
else:
    # Fail here with a clear message instead of a NameError on `features` below.
    raise ValueError(f"Unsupported feature-selection code fsm={fsm!r}; expected 'A' or 'M'")

data3 = data.loc[:, features]
# Cross-validation fold indices (original note: ten repetitions of 5-fold CV).
train_index, test_index = RSKFold(data3, label3)

In [None]:
# Pick the final feature subset with the requested strategy ('TopK', 'FSS' or
# 'BSS'), train one estimator per CV fold on that subset, and build the
# line-chart / report payloads.
if feature_select_method == 'TopK':
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=10)
    clf_num, ms = pre_screening(data3, label3, svc, features, cv=cv)  # svc: machine-learning model
    tests, estimators, mean_accs, preds, f_names = train_top3(
        svc, data3, label3, clf_num, train_index, test_index, features)
    max_features = f_names

    line_trace = {
        'mode': 'lines+markers',
        'name': clf_name,
        'type': 'scatter',
        'x': list(range(1, len(ms) + 1)),
        'y': ms
    }
    line_chart_data = [line_trace]

elif feature_select_method == 'FSS' or feature_select_method == 'BSS':
    print("run FSS or BSS")
    cv2 = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=10)
    start = time.perf_counter()
    if feature_select_method == 'FSS':
        selected_feature, max_scores = FSS_fun(features, svc, data3, label3, cv2, n_jobs=5)
    else:
        selected_feature, max_scores = BSS_fun(features, svc, data3, label3, cv2, n_jobs=5)

    # Best score and its position, both NaN-aware. The original mixed
    # np.nanmax (index) with the builtin max (score), which disagree as soon
    # as one score is NaN.
    max_index = int(np.nanargmax(max_scores))
    max_score = np.nanmax(max_scores)
    max_features = selected_feature[:max_index + 1]

    # Refit on every CV fold using only the selected features.
    preds, tests, estimators = [], [], []
    for i in range(len(train_index)):
        xtrain, ytrain = data3.iloc[train_index[i], :], label3[train_index[i]]
        xtest, ytest = data3.iloc[test_index[i], :], label3[test_index[i]]
        xtrain, xtest = xtrain[max_features], xtest[max_features]
        estimator, test_acc, predict = train_estimator(svc, xtrain, ytrain, xtest, ytest)
        tests.append(test_acc)
        estimators.append(estimator)
        preds.append(predict)
    end = time.perf_counter()
    print(round(end - start, 2))

    trace = {
        'mode': 'lines+markers',
        'name': clf_name,
        'type': 'scatter',
        'x': list(range(1, len(max_scores) + 1)),
        'y': max_scores
    }
    line_chart_data = [trace]

else:
    # Previously an unknown method fell through silently and the next cell
    # failed with a NameError on `max_features`.
    raise ValueError(
        f"Unsupported feature_select_method={feature_select_method!r}; "
        "expected 'TopK', 'FSS' or 'BSS'")

# Shared by every strategy (in the TopK branch `max_features` is `f_names`).
final_reports, f_describe = customized_report(clf_name, estimators, data3, label3, preds,
                                              test_index, max_features, tests)

# Report payloads. These lines used to sit inside the FSS/BSS branch, so the
# TopK path never produced them. `f_describe` itself is left untouched because
# the grid-search cell looks it up with `.loc['mean', ...]`; the flattened copy
# goes into `f_describe_`, mirroring the `*_describe_` names used in the
# regression and survival sections.
final_reports_dict = df2bp(final_reports)
f_describe_ = (
    np.round(f_describe.loc[['mean', 'min', 'max', 'std'], :], 3)
    .reset_index()
    .rename(columns={'index': 'Method'})
)  # test-set accuracy statistics
f_describe_dict = f_describe_.to_dict('records')


In [None]:
# --- Grid search -------------------------------------------------------------
# Tune the model on the selected features, re-evaluate the tuned estimator on
# the same CV folds, and keep whichever configuration has the higher mean test
# accuracy.
start = time.perf_counter()
grid_search = gridsearch_bulid(svc, gridsearch_para, clf_name)
grid_search.fit(data3[max_features], label3)
end = time.perf_counter()
print('gridserach time: ', round(end - start, 2))

# Start from empty accumulators: they were never initialised in this notebook,
# so the cell either raised a NameError or kept appending to the results of a
# previous run.
grid_preds, grid_tests, grid_estimators = [], [], []
for i in range(len(train_index)):
    xtrain, ytrain = data3.iloc[train_index[i], :], label3[train_index[i]]
    xtest, ytest = data3.iloc[test_index[i], :], label3[test_index[i]]
    xtrain, xtest = xtrain[max_features], xtest[max_features]
    grid_estimator, test_acc, predict = train_estimator(grid_search.best_estimator_, xtrain, ytrain, xtest, ytest)
    grid_tests.append(test_acc)
    grid_estimators.append(grid_estimator)
    grid_preds.append(predict)
grid_reports, grid_describe = customized_report(clf_name, grid_estimators, data3, label3,
                                                grid_preds, test_index, max_features, grid_tests)

# The baseline summary may already have been flattened into a 'Method' column
# by the feature-selection cell; restore the statistic index in that case so
# the 'mean' lookup works either way.
baseline_describe = f_describe.set_index('Method') if 'Method' in f_describe.columns else f_describe

if grid_describe.loc['mean', 'test_accuracy'] > baseline_describe.loc['mean', 'test_accuracy']:
    print('using gridsearch para')
    preds, tests, estimators = grid_preds, grid_tests, grid_estimators
    final_reports, f_describe = grid_reports, grid_describe
    # Record the winning parameters, as the regression section does.
    best_para = grid_search.best_params_
else:
    print('raw')
    best_para = "using raw parameters"

In [None]:
# Refit a copy of the first fold's estimator on the full training data.
tmodels = copy.deepcopy(estimators[0])
tmodels.fit(data3[max_features], label3)

# best_esti: the optimal model. Rebuild the list instead of appending:
# `best_esti` is never initialised in this notebook, and after an append
# `best_esti[0]` could still be a model left over from an earlier run.
best_esti = [tmodels]

# Validation — only when a held-out test split exists; otherwise there is no
# validation data/label to score against.
if len(test_set) > 0:
    validate_predict, validate_report = pre_valid(best_esti[0], validation_data,
                                                  validation_label, max_features)
else:
    validate_predict, validate_report = None, None

## regression 

In [None]:
# --- Preprocessing -----------------------------------------------------------
train_set, test_set, blind_set = split_train_test(inputdata)
nordata4, nor_age4 = regression_preprocess(train_set)

# Reset the validation containers so an empty `test_set` cannot leave them
# undefined or pointing at data from a previously executed section.
validation_data, validation_label = [], []
if len(test_set) > 0:
    validation_data, validation_label = regression_preprocess(test_set)

# --- Feature selection -------------------------------------------------------
if fsm == 'A':
    features = selectkbest_top20(nordata4, nor_age4, k=50)
    Fsm = 'ANOVA'
elif fsm == 'M':
    features = mrmr_fs(nordata4, nor_age4, form_action, k=50)
    Fsm = 'MRMR'
else:
    # Fail here with a clear message instead of a NameError on `features` below.
    raise ValueError(f"Unsupported feature-selection code fsm={fsm!r}; expected 'A' or 'M'")

# Keep only the selected feature columns, then prepare the CV fold indices and
# the splitter handed to FSS/BSS in the next cell.
nordata4 = nordata4[features]
train_index, test_index = RegressionKFold(nordata4, nor_age4)
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=10)

In [None]:
# Pick the final feature subset ('FSS' / 'BSS', anything else is treated as
# TopK), train one regressor per CV fold, and build the line-chart / report
# payloads.
if feature_select_method == 'FSS' or feature_select_method == 'BSS':
    if feature_select_method == 'BSS':
        sf, ms = BSS_fun(features, reg_cust_model, nordata4, nor_age4, cv, n_jobs=5)
    else:
        sf, ms = FSS_fun(features, reg_cust_model, nordata4, nor_age4, cv, n_jobs=5)

    # NaN-aware best score: plain argmax/max treat a NaN score as the maximum
    # and would truncate the feature list at the NaN position.
    max_index = int(np.nanargmax(ms))
    max_score = np.nanmax(ms)
    max_features = sf[:max_index + 1]

    # Refit on every CV fold using only the selected features.
    preds, tests, estimators = [], [], []
    for i in range(len(train_index)):
        xtrain, ytrain = nordata4.iloc[train_index[i], :], nor_age4[train_index[i]]
        xtest, ytest = nordata4.iloc[test_index[i], :], nor_age4[test_index[i]]
        xtrain, xtest = xtrain[max_features], xtest[max_features]
        estimator, test_acc, predict = train_estimator_reg(reg_cust_model, xtrain, ytrain, xtest, ytest)
        tests.append(test_acc)
        estimators.append(estimator)
        preds.append(predict)
else:  # TopK method
    # NOTE(review): unlike the classification section, no `cv` is passed to
    # pre_screening here — confirm its default splitter suits regression.
    clf_num, ms = pre_screening(nordata4, nor_age4, reg_cust_model, features)
    tests, estimators, mean_accs, preds, res = train_top3_reg(
        reg_cust_model, nordata4, nor_age4, clf_num, train_index, test_index, features)
    max_features = res

# Score-vs-step curve for the front-end line chart.
line_trace = {
    'mode': 'lines+markers',
    'name': reg_model_name,
    'type': 'scatter',
    'x': list(range(1, len(ms) + 1)),
    'y': ms
}
line_chart_data = [line_trace]

cust_reports, cust_reports_describe = cust_cv_reports(preds, test_index, nor_age4, tests)



In [None]:
# --- Grid search -------------------------------------------------------------
# Tune the regressor on the selected features, re-evaluate it on the same CV
# folds, and keep whichever configuration has the higher mean R-square.
start = time.perf_counter()
grid_search = gridsearch_bulid(reg_cust_model, gridsearch_para, reg_model_name)
grid_search.fit(nordata4[max_features], nor_age4)
end = time.perf_counter()
print('gridserach time: ', round(end - start, 2))

# Start from empty accumulators: they were never initialised in this section,
# so the cell either raised a NameError or kept appending to earlier results.
grid_preds, grid_tests, grid_estimators = [], [], []
for i in range(len(train_index)):
    xtrain, ytrain = nordata4.iloc[train_index[i], :], nor_age4[train_index[i]]
    xtest, ytest = nordata4.iloc[test_index[i], :], nor_age4[test_index[i]]
    xtrain, xtest = xtrain[max_features], xtest[max_features]
    grid_estimator, test_acc, predict = train_estimator_reg(grid_search.best_estimator_, xtrain, ytrain, xtest, ytest)
    grid_tests.append(test_acc)
    grid_estimators.append(grid_estimator)
    grid_preds.append(predict)
grid_reports, grid_describe = cust_cv_reports(grid_preds, test_index, nor_age4, grid_tests)

if grid_describe.loc['mean', 'R-square'] > cust_reports_describe.loc['mean', 'R-square']:
    print('using gridsearch para')
    preds, tests, estimators = grid_preds, grid_tests, grid_estimators
    final_reports, f_describe = grid_reports, grid_describe
    # The report cell below reads `cust_reports*`, so the winning results must
    # be stored under those names too; otherwise the reports keep describing
    # the untuned model while `estimators` holds the tuned one.
    cust_reports, cust_reports_describe = grid_reports, grid_describe
    best_para = grid_search.best_params_
else:
    print('raw')
    best_para = "using raw parameters"

In [None]:
# Payloads for the report views: the per-fold report converted by `df2bp`, and
# the summary table flattened into a list of row records.
cust_reports_dict = df2bp(cust_reports)

# Move the statistic labels out of the index into a 'Method' column
# (test-set accuracy statistics).
cust_reports_describe_ = cust_reports_describe.reset_index()
cust_reports_describe_ = cust_reports_describe_.rename(columns={'index': 'Method'})
cust_reports_describe_dict = cust_reports_describe_.to_dict(orient='records')

In [None]:
# Refit a copy of the first fold's estimator on the full training data.
tmodels = copy.deepcopy(estimators[0])
tmodels.fit(nordata4[max_features], nor_age4)

# Rebuild the list instead of appending: `best_esti` is never initialised in
# this notebook, and after an append `best_esti[0]` could still be a model left
# over from a previously executed section.
best_esti = [tmodels]

# Validation — only when a held-out test split exists; otherwise there is no
# validation data/label to score against.
if len(test_set) > 0:
    val_report = reg_cust_val(best_esti, validation_data, validation_label, max_features, reg_model_name)
    validate_predict = best_esti[0].predict(validation_data[max_features])
    val_report = np.round(val_report, 3)
    val_report = val_report.reset_index().rename(columns={'index': 'Method'})
    val_report_dict = val_report.to_dict('records')
else:
    validate_predict, val_report, val_report_dict = None, None, []

## survival analysis 

In [None]:
# --- Preprocessing -----------------------------------------------------------
train_set, test_set, blind_set = split_train_test(inputdata, datatype='survival')
sur, sur_label = sur_data_process(train_set)

# `ifval` records whether a held-out validation split exists. The validation
# containers are reset first so they can never be undefined or stale.
validation_data, validation_label = [], []
ifval = len(test_set) > 0
if ifval:
    validation_data, validation_label = sur_data_process(test_set)

cv = KFold(n_splits=5, shuffle=True, random_state=10)

# Feature selection and CV fold indices. These three statements carried stray
# indentation (an IndentationError at cell level); they belong at top level.
features = cox_selection(sur, sur_label)
sur_d = sur[features]
train_index, test_index = sur_RSKFold(sur_d, sur_label)

In [None]:
# Pick the final feature subset ('FSS', otherwise 'BSS', or 'TopK'), train one
# survival model per CV fold, and summarise the per-fold test scores.
if feature_select_method != 'TopK':
    if feature_select_method == 'FSS':
        sf, ms = FSS_fun(features, sur_model, sur_d, sur_label, cv, n_jobs=4)
    else:
        sf, ms = BSS_fun(features, sur_model, sur_d, sur_label, cv, n_jobs=4)

    # NaN-aware best score: plain argmax/max treat a NaN score as the maximum
    # and would truncate the feature list at the NaN position.
    max_index = int(np.nanargmax(ms))
    max_score = np.nanmax(ms)
    max_features = sf[:max_index + 1]

    # Refit on every CV fold using only the selected features.
    preds, tests, estimators = [], [], []
    for i in range(len(train_index)):
        xtrain, ytrain = sur_d.iloc[train_index[i], :], sur_label[train_index[i]]
        xtest, ytest = sur_d.iloc[test_index[i], :], sur_label[test_index[i]]
        xtrain, xtest = xtrain[max_features], xtest[max_features]
        estimator, test_acc, predict = train_estimator(sur_model, xtrain, ytrain, xtest, ytest)
        tests.append(test_acc)
        estimators.append(estimator)
        preds.append(predict)
else:
    clf_num, ms = pre_screening(sur_d, sur_label, sur_model, features)
    tests, estimators, mean_accs, preds, res = train_top3(
        sur_model, sur_d, sur_label, clf_num, train_index, test_index, features)
    max_features = res

# One column of per-fold test scores, named after the model, plus its
# mean / min / max / std rounded to three decimals.
test_acc_reports = pd.DataFrame(data=tests)
test_acc_reports.columns = [sur_model_name]
test_acc_describe = np.round(test_acc_reports.describe().loc[['mean', 'min', 'max', 'std'], :], 3)

In [None]:
# --- Grid search (only when a parameter grid was supplied) --------------------
if len(gridsearch_para) > 0:
    start = time.perf_counter()
    # Build the tau truncation time grid: every integer step from the smallest
    # to the largest observed time (0th and 100th percentile).
    lower, upper = np.percentile(sur_label['time'], [0, 100])
    sur_times = np.arange(lower, upper + 1)
    grid_search = gridsearch_bulid(sur_model, gridsearch_para, sur_model_name, sur_times)
    grid_search.fit(sur_d[max_features], sur_label)
    end = time.perf_counter()
    print('gridserach time: ', round(end - start, 2))

    # Compare: re-evaluate the tuned model on the same CV folds. This loop and
    # the summary below were over-indented (an IndentationError); they belong
    # directly inside the `if`.
    grid_preds, grid_tests, grid_estimators = [], [], []
    for i in range(len(train_index)):
        xtrain, ytrain = sur_d.iloc[train_index[i], :], sur_label[train_index[i]]
        xtest, ytest = sur_d.iloc[test_index[i], :], sur_label[test_index[i]]
        xtrain, xtest = xtrain[max_features], xtest[max_features]
        # `.estimator` is read off the best estimator, presumably to unwrap a
        # scoring wrapper added by gridsearch_bulid — TODO confirm.
        grid_estimator, test_acc, predict = train_estimator(
            grid_search.best_estimator_.estimator, xtrain, ytrain, xtest, ytest)
        grid_tests.append(test_acc)
        grid_estimators.append(grid_estimator)
        grid_preds.append(predict)

    grid_acc_reports = pd.DataFrame(data=grid_tests)
    grid_acc_reports.columns = [sur_model_name]
    grid_describe = np.round(grid_acc_reports.describe().loc[['mean', 'min', 'max', 'std'], :], 3)

    if grid_describe.loc['mean', sur_model_name] > test_acc_describe.loc['mean', sur_model_name]:
        print('using gridsearch para')
        # The original assigned `grid_preds` to `test_acc_describe`, replacing
        # the summary table with a list of predictions and breaking the report
        # cell. Store each result under its matching name instead.
        preds, tests, estimators = grid_preds, grid_tests, grid_estimators
        test_acc_reports, test_acc_describe = grid_acc_reports, grid_describe
        f_describe = grid_describe
        best_para = grid_search.best_params_
    else:
        print('raw')
        best_para = "using raw parameters"
else:
    # No grid supplied: the untuned model is kept.
    best_para = "using raw parameters"

In [None]:
# Rebuild the score table and its summary from `tests`, which always holds the
# per-fold scores of the estimators that were finally selected. This keeps the
# report consistent with `estimators` even when the grid-search cell replaced
# them, and does not rely on that cell leaving `test_acc_describe` intact.
test_acc_reports = pd.DataFrame(data=tests)
test_acc_reports.columns = [sur_model_name]
test_acc_describe = np.round(test_acc_reports.describe().loc[['mean', 'min', 'max', 'std'], :], 3)

# Payloads for the report views.
test_acc_reports_dict = df2bp(test_acc_reports)
test_acc_describe_ = test_acc_describe.reset_index().rename(columns={'index': 'Method'})  # test-set accuracy statistics
test_acc_describe_dict = test_acc_describe_.to_dict('records')

# Refit a copy of the first fold's estimator on the full training data.
tmodels = copy.deepcopy(estimators[0])
tmodels.fit(sur_d[max_features], sur_label)

# Rebuild the list instead of appending: `best_esti` is never initialised in
# this notebook, and an append would keep models from earlier runs in front.
best_esti = [tmodels]