In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
import csv  # For saving hyperparameters, predictions, and metrics
import torch
import torch.nn as nn
import torch.optim as optim

# Load and preprocess data
def load_data(features_file, labels_file, labels_column):
    """Read the feature and label CSVs and return scaled features plus the target.

    Both files are read with their first column as the index.

    Args:
        features_file: Path to the CSV holding one row of features per sample.
        labels_file: Path to the CSV holding the target columns.
        labels_column: Name of the column in ``labels_file`` to predict.

    Returns:
        A ``(X_scaled, y)`` tuple: ``X_scaled`` is the array produced by
        ``StandardScaler.fit_transform`` on the whole feature table and ``y``
        is the selected label column as a pandas Series.

    Note:
        The two tables are not aligned on their index here, so rows are paired
        by position. The scaler is fitted on every row, i.e. before any
        train/test split is made by the caller.
    """
    feature_table = pd.read_csv(features_file, index_col=0)
    target = pd.read_csv(labels_file, index_col=0)[labels_column]
    standardized = StandardScaler().fit_transform(feature_table)
    return standardized, target

# Save the best hyperparameters to a CSV file
def save_hyperparameters(model_name, best_params):
    """Write the per-fold best hyperparameters of a model to a CSV file.

    The file is named ``<model_name>_best_params.csv`` and is created in the
    current working directory. It has a ``Parameter,Value`` header followed by
    one row per (parameter, value) pair, fold after fold.

    Args:
        model_name: Label used to build the output file name.
        best_params: Iterable of dictionaries, one per cross-validation fold.
    """
    filename = f"{model_name}_best_params.csv"
    with open(filename, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(["Parameter", "Value"])
        # Flatten every fold's dictionary into consecutive rows.
        out.writerows(
            [name, setting]
            for fold_params in best_params
            for name, setting in fold_params.items()
        )
    print(f"Best hyperparameters for {model_name} saved to {filename}")

# Save the predictions, true labels, and metrics to a CSV file
def save_predictions_and_metrics(model_name, y_true, y_pred, corr, rmse,
                                 results_dir="../Results"):
    """Save a model's predictions and summary metrics as two CSV files.

    Writes ``<results_dir>/<model_name>_predictions.csv`` (columns
    ``True Label``/``Predicted Label``) and
    ``<results_dir>/<model_name>_metrics.csv`` (rows ``Correlation`` and
    ``RMSE``).

    Args:
        model_name: Label used as the file-name prefix.
        y_true: Iterable of true target values.
        y_pred: Iterable of predicted values, paired with ``y_true`` by position.
        corr: Correlation value to record.
        rmse: RMSE value to record.
        results_dir: Directory receiving both files. It is created when it
            does not exist yet. Defaults to ``"../Results"``.
    """
    import os  # local import: this cell's import list does not include os

    # Opening a file inside a missing directory raises FileNotFoundError,
    # which would discard a finished cross-validation run; create it up front.
    os.makedirs(results_dir, exist_ok=True)

    # Save predictions and true labels
    predictions_filename = f"{results_dir}/{model_name}_predictions.csv"
    with open(predictions_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["True Label", "Predicted Label"])
        writer.writerows(zip(y_true, y_pred))

    # Save correlation and RMSE metrics
    metrics_filename = f"{results_dir}/{model_name}_metrics.csv"
    with open(metrics_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Metric", "Value"])
        writer.writerow(["Correlation", corr])
        writer.writerow(["RMSE", rmse])

    print(f"Predictions for {model_name} saved to {predictions_filename}")
    print(f"Metrics for {model_name} saved to {metrics_filename}")

# Perform Leave-One-Out Cross-Validation on a single model
def run_loo_cv(model, param_grid, model_name, X, y):
    """Evaluate ``model`` with Leave-One-Out cross-validation and save the results.

    Each sample is held out once; the model is trained on the remaining
    samples and predicts the held-out one. When ``param_grid`` is non-empty, a
    5-fold ``GridSearchCV`` (scored by negative MSE) is run on the training
    part of every fold and its best estimator makes the prediction. An empty
    or ``None`` grid fits ``model`` directly.

    The Pearson correlation and RMSE over all held-out predictions are
    printed and written out through ``save_predictions_and_metrics``; the
    per-fold best parameters are written through ``save_hyperparameters`` when
    a grid search was performed.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        param_grid: Grid passed to ``GridSearchCV``; falsy to skip tuning.
        model_name: Label used in printed messages and output file names.
        X: Feature array indexed with integer index arrays (``X[train_index]``).
        y: Target exposing ``.iloc`` (a pandas Series).

    Returns:
        List with one best-parameter dictionary per fold; empty when no grid
        search was performed.
    """
    loo = LeaveOneOut()
    y_true, y_pred, best_params = [], [], []

    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_true.append(y_test.values[0])

        if param_grid:
            # n_jobs=-1 runs the grid-search candidates in parallel.
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            pred = best_model.predict(X_test)
            best_params.append(grid_search.best_params_)
        else:
            model.fit(X_train, y_train)
            pred = model.predict(X_test)

        y_pred.append(pred[0])

    corr, _ = pearsonr(y_true, y_pred)
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # of mean_squared_error is deprecated and rejected by recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"{model_name}: Correlation (R) = {corr:.4f}, RMSE = {rmse:.4f}")

    # Save hyperparameters, predictions, and metrics
    if param_grid:
        print(f"Best hyperparameters for {model_name}: {best_params}")
        save_hyperparameters(model_name, best_params)

    save_predictions_and_metrics(model_name, y_true, y_pred, corr, rmse)

    return best_params

# Define a simple neural network architecture
class NeuralNet(nn.Module):
    """Fully connected regressor: input -> 64 -> 32 -> 1 with ReLU activations.

    Args:
        input_dim: Number of input features per sample. Defaults to 15, the
            width that used to be hard-coded.
    """

    def __init__(self, input_dim=15):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)  # Input features -> 64 hidden units
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Output: 1 target value
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for a batch ``x`` of input rows."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Run the neural network with Leave-One-Out Cross-Validation (LOO), on GPU
# when available, and save the predictions and metrics.
def run_neural_network(X, y):
    """Evaluate ``NeuralNet`` with Leave-One-Out cross-validation.

    A fresh network is trained for every held-out sample (200 full-batch Adam
    steps on MSE loss) and used to predict that sample. RMSE and Pearson
    correlation over all held-out predictions are printed and saved through
    ``save_predictions_and_metrics`` under the name ``NeuralNetwork_MLP``.

    Args:
        X: 2-D feature array indexed with integer index arrays; its second
            dimension sets the network's input width.
        y: Target exposing ``.iloc`` and ``.values`` (a pandas Series).
    """
    # Check if a GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Size the first layer from the data instead of assuming 15 features;
    # a mismatch would make the first forward pass fail on shapes.
    n_features = X.shape[1]

    # Initialize Leave-One-Out cross-validator
    loo = LeaveOneOut()
    y_true, y_pred = [], []

    for train_index, test_index in loo.split(X):
        # Split the data into training and test for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Convert to PyTorch tensors and move to the selected device
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)

        # A new model and optimizer per fold, so no weights carry over
        model = NeuralNet(input_dim=n_features).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Training loop: full-batch gradient steps
        num_epochs = 200
        model.train()
        for epoch in range(num_epochs):
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

        # Evaluate on the left-out test sample
        model.eval()
        with torch.no_grad():
            prediction = model(X_test_tensor).cpu().numpy()[0][0]  # scalar, back on CPU

        # Store the true value and predicted value
        y_true.append(y_test.values[0])
        y_pred.append(prediction)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated and rejected by recent scikit-learn.
    test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    corr = pearsonr(y_true, y_pred)

    # Print the metrics
    print(f"Neural Network MLP: Test RMSE: {test_rmse:.4f}")
    print(f"Neural Network MLP: Test R value (correlation): {corr[0]:.4f}")

    # Save predictions and metrics using the existing function
    model_name = 'NeuralNetwork_MLP'
    save_predictions_and_metrics(model_name, y_true, y_pred, corr[0], test_rmse)


# Choose and run individual models

def run_linear_regression(X, y):
    """Run LOO cross-validation for ordinary least squares (no grid search)."""
    label = 'Linear_Regression'
    run_loo_cv(LinearRegression(), None, label, X, y)

def run_ridge(X, y):
    """Run LOO cross-validation for Ridge, tuned over ``param_grids['Ridge']``."""
    label = 'Ridge'
    run_loo_cv(Ridge(), param_grids[label], label, X, y)

def run_lasso(X, y):
    """Run LOO cross-validation for Lasso, tuned over ``param_grids['Lasso']``."""
    label = 'Lasso'
    run_loo_cv(Lasso(), param_grids[label], label, X, y)

def run_svr(X, y):
    """Run LOO cross-validation for SVR using the grid in ``param_grids['SVR']``."""
    label = 'SVR'
    run_loo_cv(SVR(), param_grids[label], label, X, y)

def run_random_forest(X, y):
    """Run LOO cross-validation for a random forest using ``param_grids['Random Forest']``."""
    label = 'Random Forest'
    # n_jobs=-1 enables parallel processing inside the forest itself.
    forest = RandomForestRegressor(n_jobs=-1)
    run_loo_cv(forest, param_grids[label], label, X, y)

def run_xgboost(X, y):
    """Run LOO cross-validation for XGBoost, tuned over ``param_grids['XGBoost']``."""
    model_name = 'XGBoost'
    # `use_label_encoder` was dropped: it is a deprecated XGBClassifier option
    # for encoding class labels and has no role in a regressor; recent XGBoost
    # releases report it as an unused parameter.
    # n_jobs=-1 enables parallel processing for XGBoost.
    model = XGBRegressor(eval_metric='rmse', n_jobs=-1)
    run_loo_cv(model, param_grids[model_name], model_name, X, y)

# Hyperparameter search spaces, keyed by the model name each run_* helper
# passes to run_loo_cv. An empty grid is falsy, so run_loo_cv skips
# GridSearchCV for that model and fits it with its constructor settings.
param_grids = {}

param_grids['Ridge'] = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
param_grids['Lasso'] = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

# Search space currently disabled for SVR:
#   'C': [0.1, 1.0, 10.0, 100.0],
#   'epsilon': [0.001, 0.01, 0.1, 1.0],
#   'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
param_grids['SVR'] = {}

# Search space currently disabled for Random Forest:
#   'n_estimators': [100, 200],
#   'max_depth': [None, 10, 20],
#   'min_samples_split': [2, 5],
#   'min_samples_leaf': [1, 2]
param_grids['Random Forest'] = {}

param_grids['XGBoost'] = {
    'n_estimators': [100, 200],
    'learning_rate': [0.001, 0.01, 0.1, 0.3],
    'max_depth': [3, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
}
# Main workflow
if __name__ == '__main__':
    # Input CSVs; load_data reads both with their first column as the index.
    features_file = '../Processed Data/rest_101_participants_40_regions.csv'
    labels_file = '../Processed Data/101_participants_40_regions_target_variable.csv'
    # Column of the labels file used as the regression target.
    prediction_label = 'Aphasia quotient'
    
    # X is the standardized feature array, y the target Series.
    X, y = load_data(features_file, labels_file, prediction_label)
    
    # Choose a model to run (uncomment the model you want to run).
    # SVR and Random Forest have empty entries in param_grids, so run_loo_cv
    # fits them without a grid search.
    #run_linear_regression(X, y)
    # run_ridge(X, y)
    # run_lasso(X, y)
    run_svr(X, y)
    run_random_forest(X, y)
    # run_xgboost(X, y)
    # run_neural_network(X, y)




SVR: Correlation (R) = 0.2582, RMSE = 24.5690
Predictions for SVR saved to ../Results/SVR_predictions.csv
Metrics for SVR saved to ../Results/SVR_metrics.csv
Random Forest: Correlation (R) = 0.4760, RMSE = 21.7640
Predictions for Random Forest saved to ../Results/Random Forest_predictions.csv
Metrics for Random Forest saved to ../Results/Random Forest_metrics.csv




In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from PIL import Image

def merge_metrics(directory):
    """Collect every ``<model>_metrics.csv`` in ``directory`` into one table.

    Each input file is expected to have ``Metric`` and ``Value`` columns with
    a ``Correlation`` row and an ``RMSE`` row. The merged table (columns
    ``Model``, ``R corr``, ``RMSE``) is written to
    ``<directory>/merged_metrics.csv``.

    Args:
        directory: Directory that holds the per-model metrics files and
            receives the merged file.

    Returns:
        Path of the merged CSV file.
    """
    suffix = '_metrics.csv'
    output_name = 'merged_metrics.csv'
    metrics_list = []

    # Sorted so the row order does not depend on the file system's listing.
    for filename in sorted(os.listdir(directory)):
        # The merged file also ends with the metrics suffix; without this
        # guard a second run would read its own output, which has no
        # 'Metric' column, and fail with KeyError.
        if filename == output_name or not filename.endswith(suffix):
            continue

        df = pd.read_csv(os.path.join(directory, filename))

        # Model name is the file name without the metrics suffix
        model_name = filename[:-len(suffix)]

        # Extract correlation and RMSE values
        r_corr = df.loc[df['Metric'] == 'Correlation', 'Value'].values[0]
        rmse = df.loc[df['Metric'] == 'RMSE', 'Value'].values[0]

        metrics_list.append({'Model': model_name, 'R corr': r_corr, 'RMSE': rmse})

    # Explicit columns keep the header present even when no file was found.
    all_metrics = pd.DataFrame(metrics_list, columns=['Model', 'R corr', 'RMSE'])

    # Save the merged metrics to a new CSV file
    output_file = os.path.join(directory, output_name)
    all_metrics.to_csv(output_file, index=False)

    print(f"Merged metrics saved to {output_file}")
    return output_file

def plot_predictions(directory):
    """Draw a true-vs-predicted scatter plot for every predictions CSV.

    Every file in ``directory`` whose name ends with ``predictions.csv`` is
    read (columns ``True Label`` and ``Predicted Label``). RMSE and Pearson
    correlation are recomputed from the file and shown in the legend, and the
    figure is saved next to the data as ``<model>_plot.png``.

    Args:
        directory: Directory holding the predictions files; the plots are
            written to the same directory.
    """
    # Sorted so files are processed in a reproducible order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('predictions.csv'):
            continue

        # Read the predictions CSV file
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)

        # Extract true and predicted values
        true_values = df['True Label'].values
        predicted_values = df['Predicted Label'].values

        # RMSE as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error is deprecated and rejected by recent scikit-learn.
        rmse = mean_squared_error(true_values, predicted_values) ** 0.5
        r_corr, _ = pearsonr(true_values, predicted_values)

        # Extract model name from the filename
        model_name = filename.replace('_predictions.csv', '')

        # Create the plot
        plt.figure(figsize=(8, 6))
        plt.scatter(true_values, predicted_values, label=f'R = {r_corr:.4f}\nRMSE = {rmse:.4f}', alpha=0.6)
        # Diagonal reference line spanning the range of the true values
        lowest, highest = min(true_values), max(true_values)
        plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')

        # Set title and labels
        plt.title(f'{model_name}')
        plt.xlabel('True')
        plt.ylabel('Predicted')

        # Show the legend with R and RMSE
        plt.legend(loc='lower right')
        plt.grid(True)

        # Save the plot as a PNG file and release the figure
        plot_filename = f"{model_name}_plot.png"
        plot_path = os.path.join(directory, plot_filename)
        plt.savefig(plot_path)
        plt.close()

        print(f"Plot saved to {plot_path}")

def combine_plots_to_pdf(directory):
    """
    Searches the specified directory for PNG files and combines them into a single PDF file.

    The PDF is written to ``<directory>/all_plots.pdf`` with one page per
    image, in sorted file-name order. When the directory holds no PNG file, a
    message is printed and nothing is written.

    Args:
        directory (str): The path to the directory containing the PNG plot files.
    """
    # Sorted so the page order does not depend on the file system's listing.
    image_paths = sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith('.png')
    )

    if not image_paths:
        print("No PNG plot files found.")
        return

    images = []
    for image_path in image_paths:
        # The context manager closes the file handle; convert() returns a
        # separate in-memory RGB image, so it stays usable afterwards.
        with Image.open(image_path) as img:
            images.append(img.convert('RGB'))

    # Save the first image and append the rest into a PDF
    pdf_path = os.path.join(directory, 'all_plots.pdf')
    images[0].save(pdf_path, save_all=True, append_images=images[1:])
    print(f"All plots combined and saved to {pdf_path}")

# Main workflow
if __name__ == '__main__':
    # Directory holding the per-model *_metrics.csv and *_predictions.csv
    # files; all outputs of this cell are written back into it.
    directory_path = '../Results' 

    # Merge the metrics and save to CSV
    merge_metrics(directory_path)

    # Plot all prediction files and save them as PNG
    plot_predictions(directory_path)

    # Combine all the PNG plots into a single PDF; this must run after
    # plot_predictions so the PNG files exist.
    combine_plots_to_pdf(directory_path)

Merged metrics saved to ../Results\merged_metrics.csv




Plot saved to ../Results\Linear_Regression_plot.png
Plot saved to ../Results\Random Forest_plot.png




Plot saved to ../Results\SVR_plot.png
All plots combined and saved to ../Results\all_plots.pdf
