In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Both CSV files are read without a header row, so columns get integer labels.
# The first two columns are dropped; downstream code treats the first remaining
# column as the label and the rest as features.
df_protein = (
    pd.read_csv('merge.train.csv', header=None)
    .iloc[:, 2:]
)
test_df_protein = (
    pd.read_csv('merge.test.csv', header=None)
    .iloc[:, 2:]
)

In [50]:
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import joblib

def _safe_div(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is zero."""
    if not denominator:
        return float('nan')
    return float(numerator) / float(denominator)


def _binary_metrics(y_true, y_pred, y_score):
    """Compute confusion-matrix counts and the derived scores for one prediction set.

    Parameters:
    - y_true: true labels; compared against the literals 0 and 1
      (assumes binary 0/1 labels -- TODO confirm against the data files)
    - y_pred: hard class predictions, same length as y_true
    - y_score: predicted probability of class 1, same length as y_true

    Returns a dict keyed by 'TP', 'TN', 'FP', 'FN', 'Precision', 'Recall',
    'Specificity', 'Accuracy', 'MCC', 'F1 score', 'AUROC' and 'AUPRC'.
    Ratios whose denominator is zero are reported as NaN.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Plain Python ints so the four-way product in the MCC denominator cannot
    # wrap around the way fixed-width numpy integers can.
    tp = int(np.sum((y_pred == 1) & (y_true == 1)))
    tn = int(np.sum((y_pred == 0) & (y_true == 0)))
    fp = int(np.sum((y_pred == 1) & (y_true == 0)))
    fn = int(np.sum((y_pred == 0) & (y_true == 1)))

    precision = _safe_div(tp, tp + fp)
    recall = _safe_div(tp, tp + fn)
    mcc_denominator = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))

    fpr, tpr, _ = roc_curve(y_true, y_score)
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_score)

    return {
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Precision': precision,
        'Recall': recall,
        'Specificity': _safe_div(tn, tn + fp),
        'Accuracy': _safe_div(tp + tn, tp + tn + fp + fn),
        'MCC': _safe_div(tp * tn - fp * fn, mcc_denominator),
        'F1 score': _safe_div(2 * precision * recall, precision + recall),
        'AUROC': auc(fpr, tpr),
        'AUPRC': auc(pr_recall, pr_precision),
    }


def train_and_evaluate_lr(data, test_data, encode, save_model=True, return_predictions=False):
    """Tune logistic regression with 5-fold cross-validation and score it on an independent test set.

    Column 0 of both frames is used as the label and every remaining column as
    a feature. For each outer fold a GridSearchCV (inner 5-fold, default
    scoring) searches penalty, C and class_weight on the training part; the
    refit best estimator is then evaluated twice:

    * on the held-out validation fold (cross-validation performance), and
    * on ``test_data`` (independent test performance).

    The model with the highest validation-fold AUROC is kept. Selection uses
    the validation fold rather than the independent test set so that the test
    metrics are not used to choose the model.

    Parameters:
    - data: training DataFrame (label in column 0, features after it)
    - test_data: independent test DataFrame with the same layout
    - encode: prefix of the saved model file, written as '<encode>.lr.best.pkl'
    - save_model: when true, dump the selected model with joblib
    - return_predictions: when true, return a copy of ``test_data`` with an
      extra 'predictions' column holding the class-1 probability averaged over
      the five fold models; when false (default) return None

    Prints a mean/std summary of the validation-fold metrics followed by the
    same summary for the independent test set.
    """
    # Separate features and labels
    X = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    X_new_test = test_data.iloc[:, 1:]
    y_new_test = test_data.iloc[:, 0]

    # The search grid does not depend on the fold, so build it once.
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1.0, 10.0],
        'class_weight': [None, 'balanced'],
    }

    # 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=22)

    cv_metrics = []           # one metrics dict per validation fold
    test_metrics = []         # one metrics dict per fold model on the independent test set
    test_probabilities = []   # class-1 probabilities on the independent test set, per fold model
    best_score = 0
    best_model = None

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        gs = GridSearchCV(
            LogisticRegression(solver='liblinear'),
            param_grid=param_grid,
            cv=5,
            n_jobs=-1,
        )
        gs.fit(X_train, y_train)
        # GridSearchCV refits the winning configuration on all of X_train by
        # default, so a second manual fit is unnecessary.
        fold_model = gs.best_estimator_

        # Performance on the held-out validation fold.
        val_scores = _binary_metrics(
            y_val,
            fold_model.predict(X_val),
            fold_model.predict_proba(X_val)[:, 1],
        )
        cv_metrics.append(val_scores)

        # Performance on the independent test set. Every fold model scores the
        # same rows, which is what makes the per-row average below meaningful.
        test_proba = fold_model.predict_proba(X_new_test)[:, 1]
        test_probabilities.append(test_proba)
        test_metrics.append(
            _binary_metrics(y_new_test, fold_model.predict(X_new_test), test_proba)
        )

        if best_score < val_scores['AUROC']:
            best_score = val_scores['AUROC']
            best_model = fold_model

    if save_model and best_model is not None:
        joblib.dump(best_model, f'{encode}.lr.best.pkl')

    # Order matches the positional parameters of print_performance_summary.
    metric_keys = ('TP', 'TN', 'FP', 'FN', 'Precision', 'Recall', 'Specificity',
                   'Accuracy', 'MCC', 'F1 score', 'AUROC', 'AUPRC')

    print("Cross-validation performance (held-out folds):")
    print_performance_summary(*([fold[key] for fold in cv_metrics] for key in metric_keys))
    print("Independent test performance (one model per fold):")
    print_performance_summary(*([fold[key] for fold in test_metrics] for key in metric_keys))

    if return_predictions:
        # Positional assignment on a copy avoids index-alignment surprises and
        # leaves the caller's frame untouched.
        merged = test_data.copy()
        merged['predictions'] = np.mean(np.vstack(test_probabilities), axis=0)
        return merged
    return None

def print_performance_summary(TPs, TNs, FPs, FNs, precisions, recalls, specificities, accuracies, MCCs, f1_scores, AUROCs, AUPRCs):
    """Print one line per metric with the mean and standard deviation of its values.

    Each argument is a sequence of per-fold values for the metric it is named
    after. Lines are printed in the order of the parameters, formatted as
    '<name>: mean = <mean>, std = <std>'.
    """
    labels = ('TP', 'TN', 'FP', 'FN',
              'Precision', 'Recall', 'Specificity',
              'Accuracy', 'MCC', 'F1 score',
              'AUROC', 'AUPRC')
    per_fold_values = (TPs, TNs, FPs, FNs,
                       precisions, recalls, specificities,
                       accuracies, MCCs, f1_scores,
                       AUROCs, AUPRCs)

    for label, fold_values in zip(labels, per_fold_values):
        average = np.mean(fold_values)
        spread = np.std(fold_values)
        print(f"{label}: mean = {average}, std = {spread}")


In [51]:
# Run the cross-validated training on the loaded frames; the selected model is
# written to '<encode>.lr.best.pkl' because save_model is enabled.
train_and_evaluate_lr(
    data=df_protein,
    test_data=test_df_protein,
    encode="Intergration",
    save_model=True,
)

In [56]:
#predict
import joblib
import numpy as np
from zzd.utils.assess import multi_scores as scores

def load_and_predict(model_path, train_data, test_data, train_file, test_file, output_train, output_test):
    """
    Score the training and test frames with a saved model and write the results as tab-separated text.

    The model's class-1 probabilities are appended as a new last column to the
    tables read from ``train_file`` and ``test_file``. For the test table, the
    second-to-last column (the last column of ``test_file``, presumably the
    label -- confirm) and the appended probabilities are passed to ``scores``
    for evaluation.

    Any exception raised along the way is caught and reported with a printed
    message; nothing is re-raised.

    Parameters:
    - model_path: Path to the joblib-serialised model file
    - train_data: DataFrame for training data (features are all columns after the first)
    - test_data: DataFrame for test data (features are all columns after the first)
    - train_file: Text file read as strings to build the training prediction table
    - test_file: Text file read as strings to build the test prediction table
    - output_train: Output filename for training set predictions
    - output_test: Output filename for test set predictions
    """
    def _write_rows(destination, table):
        # One output line per table row, cells separated by tabs.
        with open(destination, 'w') as sink:
            for record in table:
                sink.write('\t'.join(str(cell) for cell in record) + '\n')

    try:
        # NOTE(review): joblib.load unpickles the file, so only load models from a trusted source.
        with open(model_path, 'rb') as handle:
            classifier = joblib.load(handle)

        print("Model hyperparameters:", classifier.get_params())

        # The label columns are extracted here but not used further in this function.
        train_features, train_labels = train_data.iloc[:, 1:], train_data.iloc[:, 0]
        test_features, test_labels = test_data.iloc[:, 1:], test_data.iloc[:, 0]

        train_scores = classifier.predict_proba(train_features)[:, 1]
        test_scores = classifier.predict_proba(test_features)[:, 1]

        # Stacking a string table with a float column yields an all-string table.
        train_rows = np.genfromtxt(train_file, str)
        train_table = np.hstack((train_rows, train_scores.reshape(-1, 1)))
        test_rows = np.genfromtxt(test_file, str)
        test_table = np.hstack((test_rows, test_scores.reshape(-1, 1)))

        print("Test set evaluation results:")
        scores(test_table[:, -2], test_table[:, -1], show=True)

        # Save prediction results
        _write_rows(output_train, train_table)
        _write_rows(output_test, test_table)

        print(f"Prediction results have been saved to {output_train} and {output_test}")

    except Exception as e:
        print("Error occurred during model loading or prediction:", str(e))


In [64]:
# Inputs for the prediction run: the feature frames loaded earlier, the pickled
# model, and the text tables that the predicted probabilities are appended to.
# NOTE(review): the training cell writes 'Intergration.lr.best.pkl', while this
# cell loads 'DeepISO.LR.pkl' -- presumably the file was renamed by hand; confirm.
train_data, test_data = df_protein, test_df_protein
model_path = 'DeepISO.LR.pkl'
train_file, test_file = 'train.txt', 'test.txt'

load_and_predict(
    model_path,
    train_data,
    test_data,
    train_file,
    test_file,
    output_train='DeepISO.train.txt',
    output_test='DeepISO.test.txt',
)


Model hyperparameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Test set evaluation results:
TP	TN	FP	FN	precision	recall	specificity	Acc	MCC	f1	AUROC	AUPRC
121	114	37	28	0.7658	0.812	0.755	0.783	0.568	0.788	0.851	0.846
Prediction results have been saved to DeepISO.train.txt and DeepISO.test.txt
