### Import Libraries and Define Paths

This section imports the necessary libraries for statistical analysis and file handling. It also defines the paths to the datasets that will be processed.

In [35]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import os

In [36]:
# Training CSVs, one per "normal control vs. condition" comparison (branch 2).
# All five files share a directory and naming scheme, so only the condition
# code is spelled out.
datasets = [
    f"/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/NC_vs_{condition}_train.csv"
    for condition in ("AD", "DLB", "MCI", "NPH", "VaD")
]

### Statistical Analysis: Shapiro-Wilk, Levene's, and Bartlett's Tests

This section performs statistical tests to evaluate the normality of residuals and the equality of variances across groups in each dataset. The results are collected and stored for further analysis.

In [37]:
# Accumulator for the per-dataset test results; the analysis loop appends one
# {'Dataset': ..., 'Results': [...]} entry per input file.
all_results = []

# p-value threshold: a test with p < significance_level counts as a rejection
significance_level = 0.05

In [38]:
# Start from an empty list every time this cell runs; appending to a list that
# was created in another cell would duplicate every dataset on a re-run.
all_results = []

for dataset_path in datasets:
    data_frame = pd.read_csv(dataset_path)
    # Exclude 'Age', 'Sex', 'APOE4', and 'ID_1' so that only 'Diagnosis' and
    # the candidate feature columns remain
    data_frame = data_frame.drop(columns=['Age', 'Sex', 'APOE4', 'ID_1'])

    # Every remaining column except the label is tested as a feature
    variables = data_frame.columns.difference(['Diagnosis'])
    dataset_results = []

    for var in variables:
        # Rows missing the feature or the label cannot be used by any of the
        # tests; dropping them here keeps NaN out of the variance tests, where
        # it would otherwise turn the p-value into NaN.
        observed = data_frame[['Diagnosis', var]].dropna()

        # One-way ANOVA model: the feature is the response and Diagnosis is
        # the grouping factor, matching the groups used for Levene/Bartlett.
        # Q() quotes column names that are not valid Python identifiers.
        formula = f'Q("{var}") ~ C(Diagnosis)'
        model = ols(formula, data=observed).fit()

        # Shapiro-Wilk Test on the residuals (feature value minus group mean)
        shapiro_p = stats.shapiro(model.resid)[1]

        # Collect the feature values of each diagnosis group for variance tests
        groups = [values for _, values in observed.groupby('Diagnosis', sort=False)[var]]

        # Levene's Test
        levene_p = stats.levene(*groups)[1]

        # Bartlett’s Test
        bartlett_p = stats.bartlett(*groups)[1]

        # Append the results for this variable to the dataset results list
        dataset_results.append({
            'Variable': var,
            'Shapiro-Wilk p-value': shapiro_p,
            'Levene’s Test p-value': levene_p,
            'Bartlett’s Test p-value': bartlett_p
        })

    # Append the results from this dataset to the all results list
    all_results.append({
        'Dataset': dataset_path,
        'Results': dataset_results
    })

In [39]:
# Define the directory to save the CSV files
save_directory = "/home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2"

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(save_directory, exist_ok=True)

# Process each dataset
for result in all_results:
    dataset_path = result['Dataset']
    df = pd.DataFrame(result['Results'])

    # A feature "fails" when Levene's or Bartlett's test rejects equal
    # variances at `significance_level`. NaN p-values compare as False and are
    # therefore not flagged.
    fails_levene = df["Levene’s Test p-value"] < significance_level
    fails_bartlett = df["Bartlett’s Test p-value"] < significance_level

    # OR-ing the two masks lists each feature once and keeps the row order of
    # the results table. Going through set() instead would make the saved
    # order (and the column order of anything built from it) change from one
    # interpreter session to the next.
    failed_features = df.loc[fails_levene | fails_bartlett, "Variable"].tolist()

    # Create a DataFrame with the failed features
    failed_features_df = pd.DataFrame(failed_features, columns=["Failed Features"])

    # Extract dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(dataset_path))[0]

    # Save the DataFrame to a CSV file in the specified directory
    csv_file_path = os.path.join(save_directory, f"{dataset_name}_failed_features.csv")
    failed_features_df.to_csv(csv_file_path, index=False)

    print(f"Saved failed features for dataset {dataset_name} to {csv_file_path}")

Saved failed features for dataset NC_vs_AD_train to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_AD_train_failed_features.csv
Saved failed features for dataset NC_vs_DLB_train to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_DLB_train_failed_features.csv
Saved failed features for dataset NC_vs_MCI_train to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_MCI_train_failed_features.csv
Saved failed features for dataset NC_vs_NPH_train to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_NPH_train_failed_features.csv
Saved failed features for dataset NC_vs_VaD_train to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_VaD_train_failed_features.csv


In [40]:
# Inputs for the filtering step: the same five branch-2 training files that
# the first paths cell lists, restated next to the code that consumes them.
datasets = list(map(
    "/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/NC_vs_{}_train.csv".format,
    ["AD", "DLB", "MCI", "NPH", "VaD"],
))

In [28]:
# Branch-1 inputs: the five comparisons split by sex, all male files first,
# then all female files.
# NOTE(review): this assignment replaces the `*_train.csv` list held in
# `datasets`. Its recorded execution count (28) is lower than that of the
# neighbouring cells, and the saved output of the filtering step lists only
# `*_train` files, so this cell was apparently not the last one to set
# `datasets` in the recorded run. Confirm the intended order before running
# the notebook top to bottom.
datasets = [
    f"/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch1/NC_vs_{condition}_{sex}.csv"
    for sex in ("male", "female")
    for condition in ("AD", "DLB", "MCI", "NPH", "VaD")
]

In [41]:
# Define the directory to save the filtered datasets
save_directory = "/home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2"

# Every dataset needs a companion "<name>_failed_features.csv" in
# `save_directory`. Check all of them before writing anything, so that a
# `datasets` list that does not match the saved lists fails immediately with a
# clear message instead of part-way through the loop after some outputs have
# already been written.
missing_feature_lists = [
    candidate
    for candidate in (
        os.path.join(
            save_directory,
            f"{os.path.splitext(os.path.basename(path))[0]}_failed_features.csv",
        )
        for path in datasets
    )
    if not os.path.isfile(candidate)
]
if missing_feature_lists:
    raise FileNotFoundError(
        "No failed-features list found for: "
        + ", ".join(missing_feature_lists)
        + ". Check that `datasets` holds the same paths that were used to create the lists."
    )

# Process each dataset
for dataset_path in datasets:
    # Read the dataset
    df = pd.read_csv(dataset_path)

    # Extract dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(dataset_path))[0]

    # Load the list of failed features
    failed_features_path = os.path.join(save_directory, f"{dataset_name}_failed_features.csv")
    failed_features_df = pd.read_csv(failed_features_path)
    failed_features = failed_features_df['Failed Features'].tolist()

    # Keep the identifier/covariate/label columns plus the features that
    # failed Levene's or Bartlett's tests
    features_to_keep = ['ID_1', 'Age', 'Sex', 'APOE4', 'Diagnosis']
    features_to_keep.extend(failed_features)
    filtered_df = df[features_to_keep]

    # Save the filtered dataset to a CSV file
    filtered_dataset_path = os.path.join(save_directory, f"{dataset_name}_ANOVA_failed_features.csv")
    filtered_df.to_csv(filtered_dataset_path, index=False)

    print(f"Filtered dataset saved to {filtered_dataset_path}")

    # Create the complementary dataset: all original columns except the
    # failed features
    df_without_failed_features = df.drop(columns=failed_features)
    dataset_without_failed_features_path = os.path.join(save_directory, f"{dataset_name}_ANOVA_not_failed_features.csv")
    df_without_failed_features.to_csv(dataset_without_failed_features_path, index=False)

    print(f"Dataset without failed features saved to {dataset_without_failed_features_path}")

Filtered dataset saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_AD_train_ANOVA_failed_features.csv
Dataset without failed features saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_AD_train_ANOVA_not_failed_features.csv
Filtered dataset saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_DLB_train_ANOVA_failed_features.csv
Dataset without failed features saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_DLB_train_ANOVA_not_failed_features.csv
Filtered dataset saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_MCI_train_ANOVA_failed_features.csv
Dataset without failed features saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_MCI_train_ANOVA_not_failed_features.csv
Filtered dataset saved to /home/aghasemi/CompBio481/feature_selection/ANOVA_datasets_b2/NC_vs_NPH_train_ANOVA_failed_features.csv
Dataset without failed features sav