In [None]:
import os
import random

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [None]:
# Seed shared by every RandomForestClassifier in this notebook.
# A new value in [2000, 100000] is drawn on each run and echoed so that a run
# can be repeated later by pinning the printed value (see the line below).
rs = random.randrange(2000, 100000 + 1)
print(rs)
# rs = 17

In [None]:
# Base folders used to build file paths; both keep their trailing slash
# because paths are assembled by plain string concatenation.
testandvali_dir = './P10/'
data_dir = './P10/plan2/'

In [None]:
# Reduced configuration: three feature-set keys and two split names.
# NOTE: when the notebook is run top to bottom, the following cell reassigns
# both names, so these values only apply if that cell is skipped.
types, datasets = ['cl', 'ra', 'me'], ['test', 'vali']

In [None]:
# Full configuration: every feature-set key (used in file names such as
# train_{key}_RFE.csv) and every split name.
types = [
    'cl', 'ra', 'me',
    'clra', 'clme', 'rame',
    'clrame',
]
datasets = [
    'train',
    'test',
    'vali',
]

In [None]:
from sklearn.metrics import precision_recall_curve,average_precision_score
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [None]:
# Train one random forest per feature set.
#
# For every key in `types` this cell:
#   1. loads the training table `train_{key}_RFE.csv` from `data_dir`,
#   2. standardises the feature columns,
#   3. grid-searches the forest hyper-parameters with 10-fold CV (accuracy),
#   4. refits a forest with the winning parameters on the whole training table,
#   5. saves the winning parameters and the fitted model under `model1/`,
#   6. records metrics computed on the training data itself.
#
# Outputs: `results_df` (best parameters and CV score per key) and `metrics_df`
# (training metrics per key). Both are rebuilt from scratch on every run, so
# the cell works on a fresh kernel and can be re-run without duplicating rows.

# class_weight = 'balanced_subsample'
class_weight = 'balanced'

result_columns = ['dataset', 'Key', 'max_depth', 'max_features', 'n_estimators', 'Best Score']
metric_columns = ['dataset', 'Key', 'acc', 'auc', 'forest_preci', 'sensitivity',
                  'specificity', 'ppv', 'npv', 'forest_f1']

# joblib.dump does not create missing directories; make sure the target exists
# before any expensive grid search is started.
os.makedirs(data_dir+'model1/', exist_ok=True)

# Rows are collected as plain dicts and converted to DataFrames once after the
# loop instead of concatenating onto a growing frame on every iteration.
result_rows = []
metric_rows = []

for key in types:
    data = pd.read_csv(data_dir+f'train_{key}_RFE.csv')
    # Every column after the first two is used as a feature
    # (presumably the first two hold an identifier and the label - TODO confirm).
    X = data[data.columns[2:]]
    y = data['label']
    # errors='ignore' is deprecated in pandas; the default (raise) is used
    # instead. A column that cannot be parsed would have failed in the
    # astype(np.float64) below anyway, so only the point of failure moves.
    X = X.apply(pd.to_numeric)
    colNames = X.columns
    X = X.fillna(0).astype(np.float64)
    X = pd.DataFrame(StandardScaler().fit_transform(X), columns=colNames)
    print(X.shape)

    # NOTE(review): range(1, 3) yields only the values 1 and 2 for each
    # hyper-parameter - confirm that such a small grid is intended.
    param_grid = {
        'n_estimators': range(1, 3),
        'max_depth': range(1, 3),
        'max_features': range(1, 3)
    }

    rfc = RandomForestClassifier(random_state=rs, n_jobs=-1,class_weight=class_weight)

    GS = GridSearchCV(rfc, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
    GS.fit(X, y)

    best_params = GS.best_params_
    best_score = GS.best_score_

    result_rows.append({'dataset': 'train',
                        'Key': key,
                        'max_depth': best_params['max_depth'],
                        'max_features': best_params['max_features'],
                        'n_estimators': best_params['n_estimators'],
                        'Best Score': best_score})

    joblib.dump(best_params, data_dir+f'model1/rf_train_{key}_best_params.joblib')

    # Refit on the full training table with the selected hyper-parameters.
    forest = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        max_features=best_params['max_features'],
        random_state=rs,
        n_jobs=-1,
        class_weight=class_weight
    )

    forest.fit(X, y)
    joblib.dump(forest, data_dir+f'model1/rf_train_{key}.pkl')

    # Metrics below are computed on the same rows the forest was fitted on.
    forest_y_pre = forest.predict(X)
    forest_y_proba = forest.predict_proba(X)

    forest_acc = accuracy_score(y,forest_y_pre)
    forest_preci = precision_score(y,forest_y_pre)
    # Not stored in the table; `sensitivity` below is the same ratio.
    forest_recall = recall_score(y,forest_y_pre)
    forest_f1 = f1_score(y,forest_y_pre)
    # Column 1 of predict_proba is used as the score of the positive class.
    forest_auc = roc_auc_score(y,forest_y_proba[:,1])

    # Rows of the confusion matrix are true labels, columns are predictions.
    forest_cm = confusion_matrix(y, forest_y_pre)

    true_negative = forest_cm[0, 0]
    false_positive = forest_cm[0, 1]
    false_negative = forest_cm[1, 0]
    true_positive = forest_cm[1, 1]

    sensitivity = true_positive / (true_positive + false_negative)
    specificity = true_negative / (true_negative + false_positive)
    ppv = true_positive / (true_positive + false_positive)
    npv = true_negative / (true_negative + false_negative)

    metric_rows.append({'dataset': 'train',
                        'Key': key,
                        'acc': forest_acc,
                        'auc': forest_auc,
                        'forest_preci': forest_preci,
                        'sensitivity': sensitivity,
                        'specificity': specificity,
                        'ppv': ppv,
                        'npv': npv,
                        'forest_f1': forest_f1})

# Explicit column lists keep the frames well-formed even when `types` is empty.
results_df = pd.DataFrame(result_rows, columns=result_columns)
metrics_df = pd.DataFrame(metric_rows, columns=metric_columns)

print(results_df)

    


In [None]:
# Keys of the saved forests to evaluate; each one selects the files
# {split}_{key}_RFE.csv and model1/rf_train_{key}.pkl.
models = [
    'cl', 'ra', 'me',
    'clra', 'clme', 'rame',
    'clrame',
]

In [None]:
# Evaluate the saved forests on the test split.
#
# For every key in `models` the test table is loaded, preprocessed the same
# way as the training table, scored with the forest stored in `model1/`, and
# one row of metrics (dataset == 'test') is appended to `metrics_df`.

# Drop rows left by an earlier run of this cell so re-running does not
# duplicate them. Requires `metrics_df` to exist (created by the training cell).
metrics_df = metrics_df[metrics_df['dataset'] != 'test']

test_rows = []
for key in models:
    print(key)
    data = pd.read_csv(data_dir+f'test_{key}_RFE.csv')
    # Every column after the first two is used as a feature
    # (presumably the first two hold an identifier and the label - TODO confirm).
    X = data[data.columns[2:]]
    y = data['label']
    # errors='ignore' is deprecated in pandas; a column that cannot be parsed
    # would have failed in astype(np.float64) anyway.
    X = X.apply(pd.to_numeric)
    colNames = X.columns
    X = X.fillna(0).astype(np.float64)

    # The forest was fitted on features standardised with the TRAINING mean and
    # variance. Fitting a new scaler on the test split would put the features
    # on a different scale than the one the tree thresholds were learned on,
    # so the scaler is refitted on the training table (same preprocessing as
    # the training cell) and only applied here.
    train_data = pd.read_csv(data_dir+f'train_{key}_RFE.csv')
    train_X = train_data[train_data.columns[2:]].apply(pd.to_numeric)
    train_X = train_X.fillna(0).astype(np.float64)
    scaler = StandardScaler().fit(train_X)

    X = pd.DataFrame(scaler.transform(X), columns=colNames)
    print(X.shape)

    # NOTE: joblib.load unpickles the file; only load models written by this notebook.
    forest = joblib.load(data_dir+f'model1/rf_train_{key}.pkl')

    forest_y_pre = forest.predict(X)
    forest_y_proba = forest.predict_proba(X)

    forest_acc = accuracy_score(y,forest_y_pre)
    forest_preci = precision_score(y,forest_y_pre)
    # Not stored in the table; `sensitivity` below is the same ratio.
    forest_recall = recall_score(y,forest_y_pre)
    forest_f1 = f1_score(y,forest_y_pre)
    # Column 1 of predict_proba is used as the score of the positive class.
    forest_auc = roc_auc_score(y,forest_y_proba[:,1])

    # Rows of the confusion matrix are true labels, columns are predictions.
    forest_cm = confusion_matrix(y, forest_y_pre)

    true_negative = forest_cm[0, 0]
    false_positive = forest_cm[0, 1]
    false_negative = forest_cm[1, 0]
    true_positive = forest_cm[1, 1]

    sensitivity = true_positive / (true_positive + false_negative)
    specificity = true_negative / (true_negative + false_positive)
    ppv = true_positive / (true_positive + false_positive)
    npv = true_negative / (true_negative + false_negative)

    test_rows.append({'dataset': 'test',
                      'Key': key,
                      'acc': forest_acc,
                      'auc': forest_auc,
                      'forest_preci': forest_preci,
                      'sensitivity': sensitivity,
                      'specificity': specificity,
                      'ppv': ppv,
                      'npv': npv,
                      'forest_f1': forest_f1})

# Append all new rows in one step instead of concatenating inside the loop.
if test_rows:
    metrics_df = pd.concat([metrics_df, pd.DataFrame(test_rows)], ignore_index=True)


In [None]:
# Evaluate the saved forests on the validation split.
#
# For every key in `models` the validation table is loaded, preprocessed the
# same way as the training table, scored with the forest stored in `model1/`,
# and one row of metrics (dataset == 'vali') is appended to `metrics_df`.

# Drop rows left by an earlier run of this cell so re-running does not
# duplicate them. Requires `metrics_df` to exist (created by the training cell).
metrics_df = metrics_df[metrics_df['dataset'] != 'vali']

vali_rows = []
for key in models:
    data = pd.read_csv(data_dir+f'vali_{key}_RFE.csv')
    # Every column after the first two is used as a feature
    # (presumably the first two hold an identifier and the label - TODO confirm).
    X = data[data.columns[2:]]
    y = data['label']
    # errors='ignore' is deprecated in pandas; a column that cannot be parsed
    # would have failed in astype(np.float64) anyway.
    X = X.apply(pd.to_numeric)
    colNames = X.columns
    X = X.fillna(0).astype(np.float64)

    # The forest was fitted on features standardised with the TRAINING mean and
    # variance. Fitting a new scaler on the validation split would put the
    # features on a different scale than the one the tree thresholds were
    # learned on, so the scaler is refitted on the training table (same
    # preprocessing as the training cell) and only applied here.
    train_data = pd.read_csv(data_dir+f'train_{key}_RFE.csv')
    train_X = train_data[train_data.columns[2:]].apply(pd.to_numeric)
    train_X = train_X.fillna(0).astype(np.float64)
    scaler = StandardScaler().fit(train_X)

    X = pd.DataFrame(scaler.transform(X), columns=colNames)
    print(X.shape)

    # NOTE: joblib.load unpickles the file; only load models written by this notebook.
    forest = joblib.load(data_dir+f'model1/rf_train_{key}.pkl')

    forest_y_pre = forest.predict(X)
    forest_y_proba = forest.predict_proba(X)

    forest_acc = accuracy_score(y,forest_y_pre)
    forest_preci = precision_score(y,forest_y_pre)
    # Not stored in the table; `sensitivity` below is the same ratio.
    forest_recall = recall_score(y,forest_y_pre)
    forest_f1 = f1_score(y,forest_y_pre)
    # Column 1 of predict_proba is used as the score of the positive class.
    forest_auc = roc_auc_score(y,forest_y_proba[:,1])

    # Rows of the confusion matrix are true labels, columns are predictions.
    forest_cm = confusion_matrix(y, forest_y_pre)

    true_negative = forest_cm[0, 0]
    false_positive = forest_cm[0, 1]
    false_negative = forest_cm[1, 0]
    true_positive = forest_cm[1, 1]

    sensitivity = true_positive / (true_positive + false_negative)
    specificity = true_negative / (true_negative + false_positive)
    ppv = true_positive / (true_positive + false_positive)
    npv = true_negative / (true_negative + false_negative)

    vali_rows.append({'dataset': 'vali',
                      'Key': key,
                      'acc': forest_acc,
                      'auc': forest_auc,
                      'forest_preci': forest_preci,
                      'sensitivity': sensitivity,
                      'specificity': specificity,
                      'ppv': ppv,
                      'npv': npv,
                      'forest_f1': forest_f1})

# Append all new rows in one step instead of concatenating inside the loop.
if vali_rows:
    metrics_df = pd.concat([metrics_df, pd.DataFrame(vali_rows)], ignore_index=True)