In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier,\
      GradientBoostingClassifier, StackingClassifier, VotingClassifier
from xgboost import XGBClassifier
import json


## 数据处理部分

In [3]:
data = pd.read_pickle("data/data_processed_augmented_missing_threshold_2_outlier_std_threshold_3.pkl")
X_orig = data.drop(columns=['Number of nights in CITY', 'Purpose of visit to CITY']).to_numpy()
y = data['Purpose of visit to CITY'].to_numpy()

In [4]:
# ss = np.load('data/outliers_unscaled.npz')
# outliers = ss['outliers'].astype(int)
# rows_to_remove = outliers
# X_orig = np.delete(X_orig, rows_to_remove, axis=0)
# y = np.delete(y, rows_to_remove)

In [5]:
pca_50 = PCA(n_components= 50)  
X_pca_50 = pca_50.fit_transform(X_orig)

pca_100 = PCA(n_components= 100)  
X_pca_100 = pca_100.fit_transform(X_orig)

pca_200 = PCA(n_components= 200)  
X_pca_200 = pca_200.fit_transform(X_orig)

In [6]:
lda = LDA(n_components=min(len(np.unique(y)) - 1, X_orig.shape[1]))
X_lda = lda.fit_transform(X_orig, y)

In [None]:
select_50 = SelectKBest(k=50)
X_selected_50 = select_50.fit_transform(X_orig, y)

select_100 = SelectKBest(k=100)
X_selected_100 = select_100.fit_transform(X_orig, y)

select_200 = SelectKBest(k=200)
X_selected_200 = select_200.fit_transform(X_orig, y)

In [7]:
tscv = TimeSeriesSplit(n_splits=5)

In [8]:
print(X_orig.shape)

(71214, 226)


In [9]:
print(y)

[3. 2. 1. ... 1. 1. 1.]


In [10]:
y = y.astype(int)
print(y)

[3 2 1 ... 1 1 1]


## Logistic Regression

In [None]:
Logistic_Reg_params = {
    'C': [0.01, 0.1, 1.0, 5]
}
name = 'Logistic Regression'

feature_sets = {
    'Original': X_orig,
    'PCA_50': X_pca_50,
    'PCA_100': X_pca_100,
    'PCA_200': X_pca_200,
    'Select50Best': X_selected_50,
    'Select100Best': X_selected_100,
    'Select200Best': X_selected_200,
    'LDA': X_lda,
    'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'hyperparameters': Logistic_Reg_params,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = LogisticRegression(max_iter=1000, solver='saga', n_jobs=20)
        grid = GridSearchCV(model, Logistic_Reg_params, cv=2, n_jobs=20)
        grid.fit(X_train, y_clf_train)
        model = grid.best_estimator_
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

Processing Original: 1it [01:14, 74.75s/it]

Logistic Regression-Original - Fold 0 - Accuracy: 0.63, Precision: 0.52, Recall: 0.63, F1: 0.55, AUC: 0.54


Processing Original: 2it [03:49, 122.07s/it]

Logistic Regression-Original - Fold 1 - Accuracy: 0.70, Precision: 0.70, Recall: 0.70, F1: 0.58, AUC: 0.58


Processing Original: 3it [07:29, 166.73s/it]

Logistic Regression-Original - Fold 2 - Accuracy: 0.71, Precision: 0.68, Recall: 0.71, F1: 0.59, AUC: 0.54


Processing Original: 4it [12:34, 221.06s/it]

Logistic Regression-Original - Fold 3 - Accuracy: 0.72, Precision: 0.64, Recall: 0.72, F1: 0.60, AUC: 0.59


Processing Original: 5it [18:54, 226.92s/it]


Logistic Regression-Original - Fold 4 - Accuracy: 0.69, Precision: 0.65, Recall: 0.69, F1: 0.57, AUC: 0.57

Average Performance Across Folds for Original:
Logistic Regression-Original - Accuracy: 0.69, Precision: 0.64, Recall: 0.69, F1: 0.58, AUC: 0.56



Processing PCA_50: 1it [00:10, 10.41s/it]

Logistic Regression-PCA_50 - Fold 0 - Accuracy: 0.64, Precision: 0.57, Recall: 0.64, F1: 0.59, AUC: 0.57


Processing PCA_50: 2it [00:34, 18.30s/it]

Logistic Regression-PCA_50 - Fold 1 - Accuracy: 0.54, Precision: 0.59, Recall: 0.54, F1: 0.56, AUC: 0.59


Processing PCA_50: 3it [01:09, 25.91s/it]

Logistic Regression-PCA_50 - Fold 2 - Accuracy: 0.57, Precision: 0.59, Recall: 0.57, F1: 0.58, AUC: 0.57


Processing PCA_50: 4it [01:55, 33.80s/it]

Logistic Regression-PCA_50 - Fold 3 - Accuracy: 0.52, Precision: 0.60, Recall: 0.52, F1: 0.55, AUC: 0.59


Processing PCA_50: 5it [02:54, 34.90s/it]


Logistic Regression-PCA_50 - Fold 4 - Accuracy: 0.54, Precision: 0.57, Recall: 0.54, F1: 0.55, AUC: 0.57

Average Performance Across Folds for PCA_50:
Logistic Regression-PCA_50 - Accuracy: 0.56, Precision: 0.59, Recall: 0.56, F1: 0.56, AUC: 0.58



Processing PCA_100: 1it [00:20, 20.16s/it]

Logistic Regression-PCA_100 - Fold 0 - Accuracy: 0.64, Precision: 0.57, Recall: 0.64, F1: 0.59, AUC: 0.57


Processing PCA_100: 2it [00:59, 31.52s/it]

Logistic Regression-PCA_100 - Fold 1 - Accuracy: 0.54, Precision: 0.59, Recall: 0.54, F1: 0.56, AUC: 0.59


Processing PCA_100: 3it [02:03, 46.30s/it]

Logistic Regression-PCA_100 - Fold 2 - Accuracy: 0.57, Precision: 0.59, Recall: 0.57, F1: 0.58, AUC: 0.57


Processing PCA_100: 4it [03:27, 61.36s/it]

Logistic Regression-PCA_100 - Fold 3 - Accuracy: 0.52, Precision: 0.60, Recall: 0.52, F1: 0.55, AUC: 0.59


Processing PCA_100: 5it [05:15, 63.09s/it]


Logistic Regression-PCA_100 - Fold 4 - Accuracy: 0.54, Precision: 0.57, Recall: 0.54, F1: 0.55, AUC: 0.57

Average Performance Across Folds for PCA_100:
Logistic Regression-PCA_100 - Accuracy: 0.56, Precision: 0.59, Recall: 0.56, F1: 0.56, AUC: 0.58



Processing PCA_200: 1it [00:37, 37.38s/it]

Logistic Regression-PCA_200 - Fold 0 - Accuracy: 0.64, Precision: 0.57, Recall: 0.64, F1: 0.59, AUC: 0.57


Processing PCA_200: 2it [01:54, 61.03s/it]

Logistic Regression-PCA_200 - Fold 1 - Accuracy: 0.54, Precision: 0.59, Recall: 0.54, F1: 0.56, AUC: 0.59


Processing PCA_200: 3it [03:59, 89.88s/it]

Logistic Regression-PCA_200 - Fold 2 - Accuracy: 0.57, Precision: 0.59, Recall: 0.57, F1: 0.58, AUC: 0.57


Processing PCA_200: 4it [06:42, 118.83s/it]

Logistic Regression-PCA_200 - Fold 3 - Accuracy: 0.52, Precision: 0.60, Recall: 0.52, F1: 0.55, AUC: 0.59


Processing PCA_200: 5it [10:07, 121.59s/it]


Logistic Regression-PCA_200 - Fold 4 - Accuracy: 0.54, Precision: 0.57, Recall: 0.54, F1: 0.55, AUC: 0.57

Average Performance Across Folds for PCA_200:
Logistic Regression-PCA_200 - Accuracy: 0.56, Precision: 0.59, Recall: 0.56, F1: 0.56, AUC: 0.58



Processing Select50Best: 1it [00:10, 10.94s/it]

Logistic Regression-Select50Best - Fold 0 - Accuracy: 0.85, Precision: 0.85, Recall: 0.85, F1: 0.81, AUC: 0.92


Processing Select50Best: 2it [00:33, 17.49s/it]

Logistic Regression-Select50Best - Fold 1 - Accuracy: 0.86, Precision: 0.82, Recall: 0.86, F1: 0.82, AUC: 0.92


Processing Select50Best: 3it [01:07, 25.17s/it]

Logistic Regression-Select50Best - Fold 2 - Accuracy: 0.85, Precision: 0.83, Recall: 0.85, F1: 0.82, AUC: 0.92


Processing Select50Best: 4it [01:53, 33.46s/it]

Logistic Regression-Select50Best - Fold 3 - Accuracy: 0.87, Precision: 0.85, Recall: 0.87, F1: 0.84, AUC: 0.93


Processing Select50Best: 5it [02:53, 34.61s/it]


Logistic Regression-Select50Best - Fold 4 - Accuracy: 0.87, Precision: 0.84, Recall: 0.87, F1: 0.83, AUC: 0.93

Average Performance Across Folds for Select50Best:
Logistic Regression-Select50Best - Accuracy: 0.86, Precision: 0.84, Recall: 0.86, F1: 0.82, AUC: 0.92



Processing Select100Best: 1it [00:19, 19.18s/it]

Logistic Regression-Select100Best - Fold 0 - Accuracy: 0.67, Precision: 0.62, Recall: 0.67, F1: 0.58, AUC: 0.49


Processing Select100Best: 2it [00:58, 31.13s/it]

Logistic Regression-Select100Best - Fold 1 - Accuracy: 0.68, Precision: 0.64, Recall: 0.68, F1: 0.61, AUC: 0.50


Processing Select100Best: 3it [02:00, 45.19s/it]

Logistic Regression-Select100Best - Fold 2 - Accuracy: 0.69, Precision: 0.63, Recall: 0.69, F1: 0.61, AUC: 0.49


Processing Select100Best: 4it [03:24, 60.50s/it]

Logistic Regression-Select100Best - Fold 3 - Accuracy: 0.69, Precision: 0.63, Recall: 0.69, F1: 0.62, AUC: 0.49


Processing Select100Best: 5it [05:17, 63.50s/it]


Logistic Regression-Select100Best - Fold 4 - Accuracy: 0.68, Precision: 0.62, Recall: 0.68, F1: 0.58, AUC: 0.50

Average Performance Across Folds for Select100Best:
Logistic Regression-Select100Best - Accuracy: 0.68, Precision: 0.63, Recall: 0.68, F1: 0.60, AUC: 0.49



Processing Select200Best: 1it [00:37, 37.54s/it]

Logistic Regression-Select200Best - Fold 0 - Accuracy: 0.63, Precision: 0.52, Recall: 0.63, F1: 0.55, AUC: 0.54


Processing Select200Best: 2it [01:55, 61.08s/it]

Logistic Regression-Select200Best - Fold 1 - Accuracy: 0.70, Precision: 0.70, Recall: 0.70, F1: 0.58, AUC: 0.58


Processing Select200Best: 3it [03:56, 88.53s/it]

Logistic Regression-Select200Best - Fold 2 - Accuracy: 0.71, Precision: 0.68, Recall: 0.71, F1: 0.59, AUC: 0.54


Processing Select200Best: 4it [06:40, 118.47s/it]

Logistic Regression-Select200Best - Fold 3 - Accuracy: 0.72, Precision: 0.64, Recall: 0.72, F1: 0.60, AUC: 0.59


Processing Select200Best: 5it [10:05, 121.04s/it]


Logistic Regression-Select200Best - Fold 4 - Accuracy: 0.69, Precision: 0.65, Recall: 0.69, F1: 0.57, AUC: 0.57

Average Performance Across Folds for Select200Best:
Logistic Regression-Select200Best - Accuracy: 0.69, Precision: 0.64, Recall: 0.69, F1: 0.58, AUC: 0.56



Processing LDA: 2it [00:00,  2.45it/s]

Logistic Regression-LDA - Fold 0 - Accuracy: 0.87, Precision: 0.87, Recall: 0.87, F1: 0.87, AUC: 0.95
Logistic Regression-LDA - Fold 1 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.95


Processing LDA: 3it [00:01,  1.54it/s]

Logistic Regression-LDA - Fold 2 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing LDA: 4it [00:02,  1.93it/s]

Logistic Regression-LDA - Fold 3 - Accuracy: 0.88, Precision: 0.88, Recall: 0.88, F1: 0.88, AUC: 0.96


Processing LDA: 5it [00:02,  1.99it/s]


Logistic Regression-LDA - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Average Performance Across Folds for LDA:
Logistic Regression-LDA - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95



Processing Scaled_Original: 1it [01:11, 71.23s/it]

Logistic Regression-Scaled_Original - Fold 0 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.87, AUC: 0.95


Processing Scaled_Original: 2it [03:38, 115.67s/it]

Logistic Regression-Scaled_Original - Fold 1 - Accuracy: 0.86, Precision: 0.86, Recall: 0.86, F1: 0.86, AUC: 0.94


Processing Scaled_Original: 3it [07:25, 166.60s/it]

Logistic Regression-Scaled_Original - Fold 2 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing Scaled_Original: 4it [12:30, 221.21s/it]

Logistic Regression-Scaled_Original - Fold 3 - Accuracy: 0.89, Precision: 0.88, Recall: 0.89, F1: 0.88, AUC: 0.95


Processing Scaled_Original: 5it [17:17, 207.41s/it]


Logistic Regression-Scaled_Original - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Average Performance Across Folds for Scaled_Original:
Logistic Regression-Scaled_Original - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95



Processing Scaled_PCA_50: 1it [00:10, 10.72s/it]

Logistic Regression-Scaled_PCA_50 - Fold 0 - Accuracy: 0.76, Precision: 0.72, Recall: 0.76, F1: 0.73, AUC: 0.82


Processing Scaled_PCA_50: 2it [00:26, 13.88s/it]

Logistic Regression-Scaled_PCA_50 - Fold 1 - Accuracy: 0.79, Precision: 0.77, Recall: 0.79, F1: 0.75, AUC: 0.84


Processing Scaled_PCA_50: 3it [00:51, 18.80s/it]

Logistic Regression-Scaled_PCA_50 - Fold 2 - Accuracy: 0.80, Precision: 0.77, Recall: 0.80, F1: 0.76, AUC: 0.85


Processing Scaled_PCA_50: 4it [01:24, 24.32s/it]

Logistic Regression-Scaled_PCA_50 - Fold 3 - Accuracy: 0.80, Precision: 0.77, Recall: 0.80, F1: 0.77, AUC: 0.85


Processing Scaled_PCA_50: 5it [02:07, 25.54s/it]


Logistic Regression-Scaled_PCA_50 - Fold 4 - Accuracy: 0.80, Precision: 0.78, Recall: 0.80, F1: 0.76, AUC: 0.85

Average Performance Across Folds for Scaled_PCA_50:
Logistic Regression-Scaled_PCA_50 - Accuracy: 0.79, Precision: 0.76, Recall: 0.79, F1: 0.75, AUC: 0.84



Processing Scaled_PCA_100: 1it [00:08,  8.67s/it]

Logistic Regression-Scaled_PCA_100 - Fold 0 - Accuracy: 0.86, Precision: 0.85, Recall: 0.86, F1: 0.85, AUC: 0.93


Processing Scaled_PCA_100: 2it [00:41, 22.91s/it]

Logistic Regression-Scaled_PCA_100 - Fold 1 - Accuracy: 0.87, Precision: 0.85, Recall: 0.87, F1: 0.85, AUC: 0.94


Processing Scaled_PCA_100: 3it [01:19, 29.98s/it]

Logistic Regression-Scaled_PCA_100 - Fold 2 - Accuracy: 0.86, Precision: 0.85, Recall: 0.86, F1: 0.85, AUC: 0.94


Processing Scaled_PCA_100: 4it [02:06, 36.43s/it]

Logistic Regression-Scaled_PCA_100 - Fold 3 - Accuracy: 0.88, Precision: 0.86, Recall: 0.88, F1: 0.86, AUC: 0.94


Processing Scaled_PCA_100: 5it [03:08, 37.76s/it]


Logistic Regression-Scaled_PCA_100 - Fold 4 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.94

Average Performance Across Folds for Scaled_PCA_100:
Logistic Regression-Scaled_PCA_100 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.85, AUC: 0.94



Processing Scaled_PCA_200: 1it [00:17, 17.94s/it]

Logistic Regression-Scaled_PCA_200 - Fold 0 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.95


Processing Scaled_PCA_200: 2it [01:34, 52.69s/it]

Logistic Regression-Scaled_PCA_200 - Fold 1 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing Scaled_PCA_200: 3it [02:44, 60.55s/it]

Logistic Regression-Scaled_PCA_200 - Fold 2 - Accuracy: 0.85, Precision: 0.85, Recall: 0.85, F1: 0.84, AUC: 0.93


Processing Scaled_PCA_200: 4it [04:22, 75.06s/it]

Logistic Regression-Scaled_PCA_200 - Fold 3 - Accuracy: 0.88, Precision: 0.88, Recall: 0.88, F1: 0.88, AUC: 0.95


Processing Scaled_PCA_200: 5it [06:34, 78.98s/it]


Logistic Regression-Scaled_PCA_200 - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Average Performance Across Folds for Scaled_PCA_200:
Logistic Regression-Scaled_PCA_200 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.95



Processing Scaled_Select50Best: 1it [00:07,  7.47s/it]

Logistic Regression-Scaled_Select50Best - Fold 0 - Accuracy: 0.86, Precision: 0.84, Recall: 0.86, F1: 0.83, AUC: 0.93


Processing Scaled_Select50Best: 2it [00:27, 15.12s/it]

Logistic Regression-Scaled_Select50Best - Fold 1 - Accuracy: 0.86, Precision: 0.83, Recall: 0.86, F1: 0.84, AUC: 0.93


Processing Scaled_Select50Best: 3it [00:44, 15.87s/it]

Logistic Regression-Scaled_Select50Best - Fold 2 - Accuracy: 0.86, Precision: 0.84, Recall: 0.86, F1: 0.84, AUC: 0.93


Processing Scaled_Select50Best: 4it [01:02, 16.56s/it]

Logistic Regression-Scaled_Select50Best - Fold 3 - Accuracy: 0.87, Precision: 0.85, Recall: 0.87, F1: 0.85, AUC: 0.93


Processing Scaled_Select50Best: 5it [01:24, 16.90s/it]


Logistic Regression-Scaled_Select50Best - Fold 4 - Accuracy: 0.87, Precision: 0.84, Recall: 0.87, F1: 0.84, AUC: 0.93

Average Performance Across Folds for Scaled_Select50Best:
Logistic Regression-Scaled_Select50Best - Accuracy: 0.87, Precision: 0.84, Recall: 0.87, F1: 0.84, AUC: 0.93



Processing Scaled_Select100Best: 1it [00:08,  8.79s/it]

Logistic Regression-Scaled_Select100Best - Fold 0 - Accuracy: 0.87, Precision: 0.85, Recall: 0.87, F1: 0.85, AUC: 0.94


Processing Scaled_Select100Best: 2it [00:28, 15.48s/it]

Logistic Regression-Scaled_Select100Best - Fold 1 - Accuracy: 0.87, Precision: 0.85, Recall: 0.87, F1: 0.85, AUC: 0.94


Processing Scaled_Select100Best: 3it [01:07, 25.79s/it]

Logistic Regression-Scaled_Select100Best - Fold 2 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.94


Processing Scaled_Select100Best: 4it [02:29, 48.08s/it]

Logistic Regression-Scaled_Select100Best - Fold 3 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.94


Processing Scaled_Select100Best: 5it [04:01, 48.26s/it]


Logistic Regression-Scaled_Select100Best - Fold 4 - Accuracy: 0.88, Precision: 0.86, Recall: 0.88, F1: 0.86, AUC: 0.94

Average Performance Across Folds for Scaled_Select100Best:
Logistic Regression-Scaled_Select100Best - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.94



Processing Scaled_Select200Best: 1it [00:36, 36.97s/it]

Logistic Regression-Scaled_Select200Best - Fold 0 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing Scaled_Select200Best: 2it [01:55, 61.55s/it]

Logistic Regression-Scaled_Select200Best - Fold 1 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.87, AUC: 0.95


Processing Scaled_Select200Best: 3it [03:44, 83.28s/it]

Logistic Regression-Scaled_Select200Best - Fold 2 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing Scaled_Select200Best: 4it [05:41, 96.38s/it]

Logistic Regression-Scaled_Select200Best - Fold 3 - Accuracy: 0.89, Precision: 0.88, Recall: 0.89, F1: 0.88, AUC: 0.95


Processing Scaled_Select200Best: 5it [08:12, 98.48s/it] 


Logistic Regression-Scaled_Select200Best - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Average Performance Across Folds for Scaled_Select200Best:
Logistic Regression-Scaled_Select200Best - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95



Processing Scaled_LDA: 1it [00:00,  9.94it/s]

Logistic Regression-Scaled_LDA - Fold 0 - Accuracy: 0.87, Precision: 0.87, Recall: 0.87, F1: 0.87, AUC: 0.95


Processing Scaled_LDA: 2it [00:00,  9.56it/s]

Logistic Regression-Scaled_LDA - Fold 1 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.95


Processing Scaled_LDA: 3it [00:00,  7.69it/s]

Logistic Regression-Scaled_LDA - Fold 2 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95


Processing Scaled_LDA: 4it [00:00,  5.22it/s]

Logistic Regression-Scaled_LDA - Fold 3 - Accuracy: 0.88, Precision: 0.88, Recall: 0.88, F1: 0.88, AUC: 0.96


Processing Scaled_LDA: 5it [00:00,  5.13it/s]

Logistic Regression-Scaled_LDA - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Average Performance Across Folds for Scaled_LDA:
Logistic Regression-Scaled_LDA - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.95

Results and model parameters have been saved to 'logistic_regression_results.json'





## Decision Tree

In [None]:
Decision_Tree_params = {
    'max_depth': [50, 100, 200]
    }
name = 'Decision Tree'

feature_sets = {
    'Original': X_orig,
    'PCA_50': X_pca_50,
    'PCA_100': X_pca_100,
    'PCA_200': X_pca_200,
    'Select50Best': X_selected_50,
    'Select100Best': X_selected_100,
    'Select200Best': X_selected_200,
    'LDA': X_lda,
    'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'hyperparameters': Decision_Tree_params,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = DecisionTreeClassifier()
        grid = GridSearchCV(model, Decision_Tree_params, cv=2, n_jobs=20)
        grid.fit(X_train, y_clf_train)
        model = grid.best_estimator_
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

## SVC

In [None]:
name = 'SVC'

feature_sets = {
    # 'Original': X_orig,
    # 'PCA_50': X_pca_50,
    # 'PCA_100': X_pca_100,
    # 'PCA_200': X_pca_200,
    # 'Select50Best': X_selected_50,
    # 'Select100Best': X_selected_100,
    # 'Select200Best': X_selected_200,
    # 'LDA': X_lda,
    # 'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    # 'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    # 'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    # 'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    # 'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = SVC(verbose=1, probability=True, C=0.1, kernel='rbf')
        model.fit(X_train, y_clf_train)
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

Processing Scaled_PCA_50: 0it [00:00, ?it/s]

[LibSVM]

Processing Scaled_PCA_50: 1it [00:26, 26.01s/it]

SVC-Scaled_PCA_50 - Fold 0 - Accuracy: 0.76, Precision: 0.79, Recall: 0.76, F1: 0.70, AUC: 0.83
[LibSVM]

Processing Scaled_PCA_50: 2it [01:54, 62.55s/it]

SVC-Scaled_PCA_50 - Fold 1 - Accuracy: 0.78, Precision: 0.80, Recall: 0.78, F1: 0.74, AUC: 0.85
[LibSVM]

Processing Scaled_PCA_50: 3it [05:36, 135.67s/it]

SVC-Scaled_PCA_50 - Fold 2 - Accuracy: 0.79, Precision: 0.81, Recall: 0.79, F1: 0.75, AUC: 0.84
[LibSVM]

Processing Scaled_PCA_50: 4it [12:53, 254.35s/it]

SVC-Scaled_PCA_50 - Fold 3 - Accuracy: 0.80, Precision: 0.81, Recall: 0.80, F1: 0.75, AUC: 0.84
[LibSVM]

Processing Scaled_PCA_50: 5it [25:59, 311.90s/it]


SVC-Scaled_PCA_50 - Fold 4 - Accuracy: 0.78, Precision: 0.80, Recall: 0.78, F1: 0.73, AUC: 0.84

Average Performance Across Folds for Scaled_PCA_50:
SVC-Scaled_PCA_50 - Accuracy: 0.78, Precision: 0.80, Recall: 0.78, F1: 0.73, AUC: 0.84



Processing Scaled_Select50Best: 0it [00:00, ?it/s]

[LibSVM]

Processing Scaled_Select50Best: 1it [00:16, 16.44s/it]

SVC-Scaled_Select50Best - Fold 0 - Accuracy: 0.85, Precision: 0.87, Recall: 0.85, F1: 0.81, AUC: 0.92
[LibSVM]

Processing Scaled_Select50Best: 2it [01:08, 37.34s/it]

SVC-Scaled_Select50Best - Fold 1 - Accuracy: 0.85, Precision: 0.86, Recall: 0.85, F1: 0.81, AUC: 0.93
[LibSVM]

Processing Scaled_Select50Best: 3it [03:00, 71.55s/it]

SVC-Scaled_Select50Best - Fold 2 - Accuracy: 0.84, Precision: 0.85, Recall: 0.84, F1: 0.81, AUC: 0.92
[LibSVM]

Processing Scaled_Select50Best: 4it [06:44, 131.69s/it]

SVC-Scaled_Select50Best - Fold 3 - Accuracy: 0.87, Precision: 0.88, Recall: 0.87, F1: 0.84, AUC: 0.92
[LibSVM]

Processing Scaled_Select50Best: 5it [13:23, 160.71s/it]


SVC-Scaled_Select50Best - Fold 4 - Accuracy: 0.86, Precision: 0.87, Recall: 0.86, F1: 0.83, AUC: 0.91

Average Performance Across Folds for Scaled_Select50Best:
SVC-Scaled_Select50Best - Accuracy: 0.86, Precision: 0.86, Recall: 0.86, F1: 0.82, AUC: 0.92



Processing Scaled_LDA: 0it [00:00, ?it/s]

[LibSVM]

Processing Scaled_LDA: 1it [00:06,  6.56s/it]

SVC-Scaled_LDA - Fold 0 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.86, AUC: 0.93
[LibSVM]

Processing Scaled_LDA: 2it [00:27, 14.79s/it]

SVC-Scaled_LDA - Fold 1 - Accuracy: 0.87, Precision: 0.86, Recall: 0.87, F1: 0.86, AUC: 0.92
[LibSVM]

Processing Scaled_LDA: 3it [01:09, 27.56s/it]

SVC-Scaled_LDA - Fold 2 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.93
[LibSVM]

Processing Scaled_LDA: 4it [02:22, 45.34s/it]

SVC-Scaled_LDA - Fold 3 - Accuracy: 0.89, Precision: 0.88, Recall: 0.89, F1: 0.88, AUC: 0.91
[LibSVM]

Processing Scaled_LDA: 5it [04:13, 50.64s/it]

SVC-Scaled_LDA - Fold 4 - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.91

Average Performance Across Folds for Scaled_LDA:
SVC-Scaled_LDA - Accuracy: 0.88, Precision: 0.87, Recall: 0.88, F1: 0.87, AUC: 0.92

Results and model parameters have been saved to 'logistic_regression_results.json'





# 集成算法

## Random Forest

In [None]:
rf_params = {
    'n_estimators': [50, 100], 
    'max_depth': [100, 200]
    }
name = 'Random Forest'

feature_sets = {
    'Original': X_orig,
    'PCA_50': X_pca_50,
    # 'PCA_100': X_pca_100,
    # 'PCA_200': X_pca_200,
    'Select50Best': X_selected_50,
    # 'Select100Best': X_selected_100,
    # 'Select200Best': X_selected_200,
    'LDA': X_lda,
    'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    # 'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    # 'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    # 'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    # 'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'hyperparameters': rf_params,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = RandomForestClassifier()
        grid = GridSearchCV(model, rf_params, cv=2)
        grid.fit(X_train, y_clf_train)
        model = grid.best_estimator_
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

## Bagging Method

In [None]:
name = 'Bagging'

feature_sets = {
    'Original': X_orig,
    'PCA_50': X_pca_50,
    'PCA_100': X_pca_100,
    'PCA_200': X_pca_200,
    'Select50Best': X_selected_50,
    'Select100Best': X_selected_100,
    'Select200Best': X_selected_200,
    'LDA': X_lda,
    'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = BaggingClassifier(estimator=LogisticRegression(C=0.1), n_estimators=10, random_state=42, n_jobs=20)
        model.fit(X_train, y_clf_train)
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

## Voting

In [11]:
name = 'Voting'

estimators = [
    ('dt', DecisionTreeClassifier(max_depth=200)),
    ('svc', SVC(probability=True)), 
    ('LogisticRegression', LogisticRegression(C=0.5))
]

feature_sets = {
    # 'Original': X_orig,
    # 'PCA_50': X_pca_50,
    # 'PCA_100': X_pca_100,
    # 'PCA_200': X_pca_200,
    # 'Select50Best': X_selected_50,
    # 'Select100Best': X_selected_100,
    # 'Select200Best': X_selected_200,
    # 'LDA': X_lda,
    # 'Scaled_Original': StandardScaler().fit_transform(X_orig),
    'Scaled_PCA_50': StandardScaler().fit_transform(X_pca_50),
    # 'Scaled_PCA_100': StandardScaler().fit_transform(X_pca_100),
    # 'Scaled_PCA_200': StandardScaler().fit_transform(X_pca_200),
    'Scaled_Select50Best': StandardScaler().fit_transform(X_selected_50),
    # 'Scaled_Select100Best': StandardScaler().fit_transform(X_selected_100),
    # 'Scaled_Select200Best': StandardScaler().fit_transform(X_selected_200),
    'Scaled_LDA': StandardScaler().fit_transform(X_lda),
}

# Dictionary to store results for all feature sets
results = {
    'model_name': name,
    'feature_sets': {}
}

for feature_name, X in feature_sets.items():
    # Lists to store metrics and model parameters for each fold
    fold_metrics = []
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(tscv.split(X), desc=f'Processing {feature_name}')):
        X_train, X_test = X[train_idx], X[test_idx]
        y_clf_train, y_clf_test = y[train_idx], y[test_idx]
        
        # Define and train model with GridSearchCV
        model = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)  # 'hard' for majority voting
        model.fit(X_train, y_clf_train)
        
        # Make predictions and compute metrics
        y_pred = model.predict(X_test)
        precision, recall, f1, _ = precision_recall_fscore_support(y_clf_test, y_pred, average='weighted', zero_division=1)
        auc = roc_auc_score(y_clf_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')
        
        # Store fold metrics
        fold_result = {
            'fold': fold,
            'accuracy': float(accuracy_score(y_clf_test, y_pred)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(auc)
        }
        fold_metrics.append(fold_result)
                        
        # Print per-fold results
        print(f'{name}-{feature_name} - Fold {fold} - Accuracy: {fold_result["accuracy"]:.2f}, '
              f'Precision: {fold_result["precision"]:.2f}, Recall: {fold_result["recall"]:.2f}, '
              f'F1: {fold_result["f1"]:.2f}, AUC: {fold_result["auc"]:.2f}')
    
    # Compute average metrics across folds
    avg_metrics = {
        'avg_accuracy': float(np.mean([m['accuracy'] for m in fold_metrics])),
        'avg_precision': float(np.mean([m['precision'] for m in fold_metrics])),
        'avg_recall': float(np.mean([m['recall'] for m in fold_metrics])),
        'avg_f1': float(np.mean([m['f1'] for m in fold_metrics])),
        'avg_auc': float(np.mean([m['auc'] for m in fold_metrics]))
    }
    
    # Store results for this feature set
    results['feature_sets'][feature_name] = {
        'fold_metrics': fold_metrics,
        'average_metrics': avg_metrics
    }
    
    # Print average results
    print(f'\nAverage Performance Across Folds for {feature_name}:')
    print(f'{name}-{feature_name} - Accuracy: {avg_metrics["avg_accuracy"]:.2f}, '
          f'Precision: {avg_metrics["avg_precision"]:.2f}, Recall: {avg_metrics["avg_recall"]:.2f}, '
          f'F1: {avg_metrics["avg_f1"]:.2f}, AUC: {avg_metrics["avg_auc"]:.2f}\n')

# Save results to JSON file
with open(f'results/clf_aug/{name}_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Results and model parameters have been saved to 'logistic_regression_results.json'")

Processing Scaled_PCA_50: 1it [00:37, 37.01s/it]

Voting-Scaled_PCA_50 - Fold 0 - Accuracy: 0.72, Precision: 0.66, Recall: 0.72, F1: 0.68, AUC: 0.68


Processing Scaled_PCA_50: 2it [03:03, 101.19s/it]

Voting-Scaled_PCA_50 - Fold 1 - Accuracy: 0.73, Precision: 0.67, Recall: 0.73, F1: 0.69, AUC: 0.71


Processing Scaled_PCA_50: 3it [09:25, 229.71s/it]

Voting-Scaled_PCA_50 - Fold 2 - Accuracy: 0.73, Precision: 0.67, Recall: 0.73, F1: 0.69, AUC: 0.70


Processing Scaled_PCA_50: 4it [22:38, 451.83s/it]

Voting-Scaled_PCA_50 - Fold 3 - Accuracy: 0.73, Precision: 0.67, Recall: 0.73, F1: 0.69, AUC: 0.72


Processing Scaled_PCA_50: 5it [49:28, 593.72s/it]


Voting-Scaled_PCA_50 - Fold 4 - Accuracy: 0.90, Precision: 0.86, Recall: 0.90, F1: 0.88, AUC: 0.69

Average Performance Across Folds for Scaled_PCA_50:
Voting-Scaled_PCA_50 - Accuracy: 0.76, Precision: 0.71, Recall: 0.76, F1: 0.72, AUC: 0.70



Processing Scaled_Select50Best: 1it [00:32, 32.01s/it]

Voting-Scaled_Select50Best - Fold 0 - Accuracy: 0.70, Precision: 0.65, Recall: 0.70, F1: 0.67, AUC: 0.71


Processing Scaled_Select50Best: 2it [02:23, 78.64s/it]

Voting-Scaled_Select50Best - Fold 1 - Accuracy: 0.74, Precision: 0.69, Recall: 0.74, F1: 0.70, AUC: 0.74


Processing Scaled_Select50Best: 3it [07:24, 180.17s/it]

Voting-Scaled_Select50Best - Fold 2 - Accuracy: 0.74, Precision: 0.69, Recall: 0.74, F1: 0.70, AUC: 0.71


Processing Scaled_Select50Best: 4it [18:32, 372.79s/it]

Voting-Scaled_Select50Best - Fold 3 - Accuracy: 0.74, Precision: 0.67, Recall: 0.74, F1: 0.69, AUC: 0.72


Processing Scaled_Select50Best: 5it [39:02, 468.57s/it]


Voting-Scaled_Select50Best - Fold 4 - Accuracy: 0.90, Precision: 0.86, Recall: 0.90, F1: 0.88, AUC: 0.66

Average Performance Across Folds for Scaled_Select50Best:
Voting-Scaled_Select50Best - Accuracy: 0.76, Precision: 0.71, Recall: 0.76, F1: 0.73, AUC: 0.71



Processing Scaled_LDA: 1it [00:23, 23.10s/it]

Voting-Scaled_LDA - Fold 0 - Accuracy: 0.72, Precision: 0.65, Recall: 0.72, F1: 0.67, AUC: 0.67


Processing Scaled_LDA: 2it [01:38, 53.62s/it]

Voting-Scaled_LDA - Fold 1 - Accuracy: 0.72, Precision: 0.66, Recall: 0.72, F1: 0.67, AUC: 0.68


Processing Scaled_LDA: 3it [04:32, 108.67s/it]

Voting-Scaled_LDA - Fold 2 - Accuracy: 0.73, Precision: 0.66, Recall: 0.73, F1: 0.68, AUC: 0.67


Processing Scaled_LDA: 4it [09:25, 181.38s/it]

Voting-Scaled_LDA - Fold 3 - Accuracy: 0.72, Precision: 0.66, Recall: 0.72, F1: 0.68, AUC: 0.68


Processing Scaled_LDA: 5it [17:54, 214.94s/it]

Voting-Scaled_LDA - Fold 4 - Accuracy: 0.90, Precision: 0.86, Recall: 0.90, F1: 0.87, AUC: 0.75

Average Performance Across Folds for Scaled_LDA:
Voting-Scaled_LDA - Accuracy: 0.76, Precision: 0.70, Recall: 0.76, F1: 0.71, AUC: 0.69

Results and model parameters have been saved to 'logistic_regression_results.json'



