### Load and Filter Datasets

This section of the code processes and filters datasets based on selected features:
1. **Setup**:
   - **Output Directory**: Creates the output directory if it does not already exist.
   - **Essential Columns**: List of essential columns to keep in all datasets (`"Age"`, `"Sex"`, `"APOE4"`, `"Diagnosis"`, `"ID_1"`).

2. **Processing Each Directory and Dataset**:
   - **Directory and Dataset Pairing**: Matches each directory with its corresponding dataset.
   - **Feature Extraction**:
     - **Load RFE Results**: Reads `rfe_results.csv` to get the list of selected features.
     - **Extract Features**: Assumes the `Feature` column in `rfe_results.csv` lists the features to keep.
   - **Dataset Loading**:
     - **Ensure 'ID_1'**: Moves the `'ID_1'` column to the first position in the dataset.
     - **Filter Columns**: Combines essential columns with selected features, ensuring `'ID_1'` is first.
   - **Save Filtered Dataset**: Saves the filtered dataset to the specified output directory with the condition's name.

3. **Results**:
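   - **Return**: Returns a dictionary keyed by condition name (the base name of each directory). Each value is a status message: either the path the filtered dataset was saved to, or a note that `rfe_results.csv` was not found.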

In [1]:
import pandas as pd
import os

In [2]:
# Feature-selection result folders, one per sex-specific comparison.
# The order here must match the order of `datasets`, since the two lists are
# paired positionally.
#
# Alternative (overall, non-sex-specific) configuration, kept for reference:
#   root:    /home/aghasemi/CompBio481/feature_selection/feat_select_res_branch1_overall_p2
#   folders: ad_vs_nc, dlb_vs_nc, mci_vs_nc, nph_vs_nc, vad_vs_nc
directories = [
    "/home/aghasemi/CompBio481/feature_selection/"
    "feat_select_res_branch1_sex_specific_p2_refinement/" + cohort
    for cohort in (
        "vad_nc_male",
        "vad_nc_female",
        "nph_nc_male",
        "nph_nc_female",
        "mci_nc_male",
        "mci_nc_female",
        "dlb_nc_male",
        "dlb_nc_female",
        "ad_nc_male",
        "ad_nc_female",
    )
]

In [5]:
# Input CSVs, one per sex-specific comparison.
# The order here must match the order of `directories`, since the two lists
# are paired positionally.
#
# Alternative (overall, non-sex-specific) configuration, kept for reference:
#   root:  /home/aghasemi/CompBio481/datasets/filtered_datasets_after_rank_feat_select_branch1_overall
#   files: ad_vs_nc.csv, dlb_vs_nc.csv, mci_vs_nc.csv, nph_vs_nc.csv, vad_vs_nc.csv
datasets = [
    "/home/aghasemi/CompBio481/datasets/"
    "filtered_datasets_after_rank_feat_select_branch1_sex_specific/" + cohort + ".csv"
    for cohort in (
        "vad_nc_male",
        "vad_nc_female",
        "nph_nc_male",
        "nph_nc_female",
        "mci_nc_male",
        "mci_nc_female",
        "dlb_nc_male",
        "dlb_nc_female",
        "ad_nc_male",
        "ad_nc_female",
    )
]

**Filter Datasets**

In [6]:
import pandas as pd
import os

In [7]:
def load_and_filter_datasets(directories, datasets, output_directory):
    """Reduce each dataset to its essential columns plus RFE-selected features.

    Directories and datasets are paired by position. For each pair, the
    ``Feature`` column of ``<directory>/rfe_results.csv`` gives the features to
    keep. The dataset is restricted to the essential columns it actually
    contains (``ID_1`` first, then ``Age``, ``Sex``, ``APOE4``, ``Diagnosis``)
    followed by the selected features, and written to
    ``<output_directory>/<condition>.csv``, where ``condition`` is the base
    name of the directory.

    Args:
        directories: Paths of folders that each contain an ``rfe_results.csv``.
        datasets: Paths of the CSV datasets, in the same order as
            ``directories``.
        output_directory: Folder the filtered CSVs are written to; created if
            it does not exist.

    Returns:
        dict: Maps each condition name to a status message, either the path
        the filtered dataset was saved to or ``"rfe_results.csv not found."``.

    Raises:
        ValueError: If ``directories`` and ``datasets`` differ in length.
        KeyError: If ``rfe_results.csv`` has no ``Feature`` column, or a
            selected feature is not a column of the dataset.
    """
    directories = list(directories)
    datasets = list(datasets)
    # zip() would silently drop the unpaired tail; fail loudly instead.
    if len(directories) != len(datasets):
        raise ValueError(
            f"directories ({len(directories)}) and datasets ({len(datasets)}) "
            "must have the same length"
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    results = {}
    # 'ID_1' is listed first so it becomes the first output column.
    essential_cols = ["ID_1", "Age", "Sex", "APOE4", "Diagnosis"]

    for dir_path, dataset_path in zip(directories, datasets):
        # normpath strips a trailing separator, which would otherwise make
        # basename() return an empty condition name.
        condition = os.path.basename(os.path.normpath(dir_path))

        rfe_path = os.path.join(dir_path, "rfe_results.csv")
        if not os.path.exists(rfe_path):
            results[condition] = "rfe_results.csv not found."
            continue

        selected_features = pd.read_csv(rfe_path)["Feature"].tolist()
        df = pd.read_csv(dataset_path)

        # Essential columns are kept only when the dataset has them; this
        # includes 'ID_1', so a dataset without it no longer fails.
        present_essentials = [col for col in essential_cols if col in df.columns]

        # dict.fromkeys de-duplicates while preserving order, so a selected
        # feature that is also an essential column is not emitted twice.
        columns_to_keep = list(dict.fromkeys(present_essentials + selected_features))

        filtered_df = df[columns_to_keep]

        new_file_path = os.path.join(output_directory, f"{condition}.csv")
        filtered_df.to_csv(new_file_path, index=False)
        results[condition] = f"Filtered dataset created and saved to {new_file_path}"

    return results

In [8]:
# Destination folder for the filtered sex-specific datasets.
output_directory = (
    "/home/aghasemi/CompBio481/datasets/"
    "filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement"
)

In [9]:
# Run the filtering step. As the cell's final expression, the returned
# status dictionary is what the notebook displays.
load_and_filter_datasets(
    directories=directories,
    datasets=datasets,
    output_directory=output_directory,
)

{'vad_nc_male': 'Filtered dataset created and saved to /home/aghasemi/CompBio481/datasets/filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement/vad_nc_male.csv',
 'vad_nc_female': 'Filtered dataset created and saved to /home/aghasemi/CompBio481/datasets/filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement/vad_nc_female.csv',
 'nph_nc_male': 'Filtered dataset created and saved to /home/aghasemi/CompBio481/datasets/filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement/nph_nc_male.csv',
 'nph_nc_female': 'Filtered dataset created and saved to /home/aghasemi/CompBio481/datasets/filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement/nph_nc_female.csv',
 'mci_nc_male': 'Filtered dataset created and saved to /home/aghasemi/CompBio481/datasets/filtered_datasets_after_subset_feat_select_branch1_sex_specific_refinement/mci_nc_male.csv',
 'mci_nc_female': 'Filtered dataset created and saved to /home/aghasemi/CompB