# Prepare Data for ALS Biomarker Analysis

This notebook processes clinical, transcriptomics, proteomics, and epigenomics data for ALS biomarker analysis. We'll run the script step-by-step to identify and fix issues interactively.

In [1]:
# Cell 1: Import Libraries and Define Paths
import pandas as pd
import numpy as np
import os
from utils import main, create_unified_mapping, load_and_merge, handle_missing_values, \
    perform_initial_analysis, visualize_results, save_integrated_data  # Import all necessary functions

# Define Data Paths (ABSOLUTE PATHS)
BASE_DIR = "/home/w2sg-arnav/nihwork"  # Top-level project directory
CLINICAL_DIR = os.path.join(BASE_DIR, "metadata/clinical")  # Corrected CLINICAL_DIR - assuming 'subjects.csv' is directly under 'metadata'
DATA_ROOT = os.path.join(BASE_DIR, "als_data")  # Data subdirectory for 'omics data
OUTPUT_DIR = os.path.join(BASE_DIR, "data/processed")  # Output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Clinical data path
CLINICAL_PATH = os.path.join(CLINICAL_DIR, "subjects.csv")

# Mapping files for each omics type
mapping_files = {
    "transcriptomics": os.path.join(DATA_ROOT, "transcriptomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv"),
    "proteomics": os.path.join(DATA_ROOT, "proteomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv"),
    "epigenomics": os.path.join(DATA_ROOT, "epigenomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv"),
}

# Data paths for each omics type
data_paths = {
    "transcriptomics": os.path.join(DATA_ROOT, "transcriptomics/4_matrix/AnswerALS-651-T-v1-release6_raw-counts.csv"),
    "proteomics": os.path.join(DATA_ROOT, "proteomics/4_matrix/AnswerALS-436-P_proteomics-protein-matrix_correctedImputed.txt"),
    "epigenomics": os.path.join(DATA_ROOT, "epigenomics/4_matrix/AnswerALS-620-E-v1-release6_DiffBind-raw-counts-minOverlap-0.1.csv"),
}

# Display paths for verification
print("Defined paths:")
print(f"Clinical: {CLINICAL_PATH}")
print(f"Transcriptomics Mapping: {mapping_files['transcriptomics']}")
print(f"Transcriptomics Data: {data_paths['transcriptomics']}")
print(f"Proteomics Mapping: {mapping_files['proteomics']}")
print(f"Proteomics Data: {data_paths['proteomics']}")
print(f"Epigenomics Mapping: {mapping_files['epigenomics']}")
print(f"Epigenomics Data: {data_paths['epigenomics']}")
print(f"Output Directory: {OUTPUT_DIR}")

Defined paths:
Clinical: /home/w2sg-arnav/nihwork/metadata/clinical/subjects.csv
Transcriptomics Mapping: /home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Transcriptomics Data: /home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix/AnswerALS-651-T-v1-release6_raw-counts.csv
Proteomics Mapping: /home/w2sg-arnav/nihwork/als_data/proteomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Proteomics Data: /home/w2sg-arnav/nihwork/als_data/proteomics/4_matrix/AnswerALS-436-P_proteomics-protein-matrix_correctedImputed.txt
Epigenomics Mapping: /home/w2sg-arnav/nihwork/als_data/epigenomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Epigenomics Data: /home/w2sg-arnav/nihwork/als_data/epigenomics/4_matrix/AnswerALS-620-E-v1-release6_DiffBind-raw-counts-minOverlap-0.1.csv
Output Directory: /home/w2sg-arnav/nihwork/data/processed


### Verify Paths
Run the cell above and check the printed paths. If any path is incorrect (e.g., `FileNotFoundError` in the next steps), update the corresponding variable. For example, `TRANSCRIPTOMICS_PATH` is still a placeholder.

In [2]:
# Cell 2: Load Clinical Data
clinical_df = pd.read_csv(CLINICAL_PATH)
print("Columns in clinical_df:", clinical_df.columns.tolist())

# Normalize column names and Participant ID
clinical_df.rename(columns={
    "Participant_ID": "Participant ID",
    "SubjectUID": "SubjectUID"
}, inplace=True, errors='ignore')
clinical_df['Participant ID'] = clinical_df['Participant ID'].str.strip().str.upper()

# Check if 'Participant ID' exists after renaming
if 'Participant ID' not in clinical_df.columns:
    print("Error: 'Participant ID' column not found after renaming. Using 'Participant_ID' instead.")
    if 'Participant_ID' in clinical_df.columns:
        clinical_ids = clinical_df["Participant_ID"].tolist()
        print("Using 'Participant_ID' column directly. Extracted IDs:", clinical_ids[:10])
    else:
        print("Error: Neither 'Participant ID' nor 'Participant_ID' found. Available columns:", clinical_df.columns.tolist())
        raise SystemExit("Exiting due to missing Participant ID column.")
else:
    # Check for and handle duplicate participant IDs
    if clinical_df['Participant ID'].duplicated().any():
        print("Warning: Duplicate participant IDs found in clinical data. Dropping duplicates.")
        clinical_df.drop_duplicates(subset='Participant ID', keep='first', inplace=True)
    clinical_ids = clinical_df["Participant ID"].tolist()
    print("Extracted clinical_ids from 'Participant ID':", clinical_ids[:10])

# Display the clinical DataFrame
clinical_df.head()

Columns in clinical_df: ['Participant_ID', 'SubjectUID', 'subject_group_id']
Extracted clinical_ids from 'Participant ID': ['CASE-NEUAA295HHE', 'CTRL-NEUAA485DZL', 'CASE-NEUAA599TMX', 'CASE-NEUAB000NKC', 'CASE-NEUAC617GR5', 'CASE-NEUAD142RRY', 'CASE-NEUAD542VT0', 'CASE-NEUAD952KAZ', 'CASE-NEUAE228FF6', 'CASE-NEUAE431CGV']


Unnamed: 0,Participant ID,SubjectUID,subject_group_id
0,CASE-NEUAA295HHE,NEUAA295HHE,1
1,CTRL-NEUAA485DZL,NEUAA485DZL,5
2,CASE-NEUAA599TMX,NEUAA599TMX,1
3,CASE-NEUAB000NKC,NEUAB000NKC,1
4,CASE-NEUAC617GR5,NEUAC617GR5,17


### Clinical Data Check
- The cell above loads the clinical data and extracts participant IDs.
- Verify the output shows the correct columns (`['Participant ID', 'SubjectUID', 'subject_group_id']`) and IDs.
- The `clinical_df.head()` output lets you inspect the data.

In [3]:
# Cell 3: Create Unified Sample Mapping
unified_mapping = create_unified_mapping(
    clinical_ids,
    mapping_files,
    participant_col="Participant ID",
    sample_col="Sample ID"
)

if unified_mapping is None:
    print("Error: Failed to create unified sample mapping. Exiting.")
    raise SystemExit("Exiting due to mapping failure.")
else:
    unified_mapping.to_csv(os.path.join(OUTPUT_DIR, "unified_sample_mapping.csv"), index=False)
    print("Unified sample mapping created successfully!")

# Display the unified mapping
unified_mapping.head()

Processing mapping file for transcriptomics: /home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Sample transcriptomics_Sample_ID after processing:
['CTRL-NEUEU392AE8-5234-T', 'CASE-NEUVM674HUA-5261-T', 'CASE-NEUHG791RV5-5268-T', 'CASE-NEUCE965ZGK-5275-T', 'CASE-NEUEM720BUU-5282-T']
Processing mapping file for proteomics: /home/w2sg-arnav/nihwork/als_data/proteomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Sample proteomics_Sample_ID after processing:
['CTRL-NEUEU392AE8-5234-P', 'CASE-NEUVM674HUA-5261-P', 'CASE-NEUHG791RV5-5268-P', 'CASE-NEUCE965ZGK-5275-P', 'CASE-NEUEM720BUU-5282-P']
Processing mapping file for epigenomics: /home/w2sg-arnav/nihwork/als_data/epigenomics/4_matrix/Sample Mapping Information/Sample Mapping File Feb 2024.csv
Sample epigenomics_Sample_ID after processing:
['CTRL-NEUEU392AE8-5234-E', 'CASE-NEUVM674HUA-5261-E', 'CASE-NEUHG791RV5-5268-E', 'CASE-NEUCE965ZGK-5275-E', '

Unnamed: 0,Participant ID,transcriptomics_Sample_ID,transcriptomics_participant_code,proteomics_Sample_ID,proteomics_participant_code,epigenomics_Sample_ID,epigenomics_participant_code
0,CASE-NEUAA295HHE,CASE-NEUAA295HHE-9010-T,NEUAA295HHE,CASE-NEUAA295HHE-9010-P,NEUAA295HHE,CASE-NEUAA295HHE-9010-E,NEUAA295HHE
1,CTRL-NEUAA485DZL,CTRL-NEUAA485DZL-7575-T,NEUAA485DZL,CTRL-NEUAA485DZL-7575-P,NEUAA485DZL,CTRL-NEUAA485DZL-7575-E,NEUAA485DZL
2,CASE-NEUAA599TMX,CASE-NEUAA599TMX-5314-T,NEUAA599TMX,CASE-NEUAA599TMX-5314-P,NEUAA599TMX,CASE-NEUAA599TMX-5314-E,NEUAA599TMX
3,CASE-NEUAB000NKC,CASE-NEUAB000NKC-5730-T,NEUAB000NKC,CASE-NEUAB000NKC-5730-P,NEUAB000NKC,CASE-NEUAB000NKC-5730-E,NEUAB000NKC
4,CASE-NEUAC617GR5,,,,,,


In [4]:
# Cell 3.5: Investigate Unmapped Clinical IDs
# Load the mapping files again for comparison
mapping_dfs = {}
for data_type, mapping_file in mapping_files.items():
    mapping_dfs[data_type] = pd.read_csv(mapping_file)

# Get all unique Participant IDs from mapping files
all_mapped_ids = set()
for df in mapping_dfs.values():
    all_mapped_ids.update(df['Participant ID'].str.strip().str.upper().tolist())

# Get clinical IDs
clinical_ids_set = set(clinical_ids)

# Find unmapped IDs and check for potential matches
unmapped_ids = clinical_ids_set - all_mapped_ids
print(f"Total unmapped clinical IDs: {len(unmapped_ids)}")
print(f"Sample unmapped IDs: {list(unmapped_ids)[:5]}")

# Check for formatting issues (e.g., extra spaces, case sensitivity)
for unmapped_id in list(unmapped_ids)[:5]:  # Check first 5 for brevity
    for data_type, df in mapping_dfs.items():
        if any(df['Participant ID'].str.strip().str.upper() == unmapped_id):
            print(f"Potential match for {unmapped_id} found in {data_type} with different formatting!")

# Display a sample of mapped and unmapped IDs for comparison
print("\nSample of mapped IDs from clinical data:")
print(list(clinical_ids_set - unmapped_ids)[:5])
print("\nSample of unmapped IDs:")
print(list(unmapped_ids)[:5])

Total unmapped clinical IDs: 309
Sample unmapped IDs: ['CASE-NEUND058PNV', 'CASE-NEUKU894PHF', 'CASE-NEURG707TFZ', 'CASE-NEUFZ620KX0', 'CASE-NEUXC293WA6']

Sample of mapped IDs from clinical data:
['CASE-NEURG115XK8', 'CASE-NEUAA599TMX', 'CASE-NEUDV274PR3', 'CASE-NEUFF238WXM', 'CASE-NEUET659KJL']

Sample of unmapped IDs:
['CASE-NEUND058PNV', 'CASE-NEUKU894PHF', 'CASE-NEURG707TFZ', 'CASE-NEUFZ620KX0', 'CASE-NEUXC293WA6']


In [5]:
# Cell 3.6: Inspect Unified Mapping Sample IDs
print("Sample of unified_mapping:")
print(unified_mapping.head())
print("\nSample of transcriptomics_Sample_ID:")
print(unified_mapping['transcriptomics_Sample_ID'].dropna().head().tolist())
print("\nSample of proteomics_Sample_ID:")
print(unified_mapping['proteomics_Sample_ID'].dropna().head().tolist())
print("\nSample of epigenomics_Sample_ID:")
print(unified_mapping['epigenomics_Sample_ID'].dropna().head().tolist())

Sample of unified_mapping:
     Participant ID transcriptomics_Sample_ID  \
0  CASE-NEUAA295HHE   CASE-NEUAA295HHE-9010-T   
1  CTRL-NEUAA485DZL   CTRL-NEUAA485DZL-7575-T   
2  CASE-NEUAA599TMX   CASE-NEUAA599TMX-5314-T   
3  CASE-NEUAB000NKC   CASE-NEUAB000NKC-5730-T   
4  CASE-NEUAC617GR5                       NaN   

  transcriptomics_participant_code     proteomics_Sample_ID  \
0                      NEUAA295HHE  CASE-NEUAA295HHE-9010-P   
1                      NEUAA485DZL  CTRL-NEUAA485DZL-7575-P   
2                      NEUAA599TMX  CASE-NEUAA599TMX-5314-P   
3                      NEUAB000NKC  CASE-NEUAB000NKC-5730-P   
4                              NaN                      NaN   

  proteomics_participant_code    epigenomics_Sample_ID  \
0                 NEUAA295HHE  CASE-NEUAA295HHE-9010-E   
1                 NEUAA485DZL  CTRL-NEUAA485DZL-7575-E   
2                 NEUAA599TMX  CASE-NEUAA599TMX-5314-E   
3                 NEUAB000NKC  CASE-NEUAB000NKC-5730-E   
4        

In [6]:
# Cell 3.7: List Transcriptomics Files
transcriptomics_dir = "/home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix"
os.listdir(transcriptomics_dir)

['AnswerALS-651-T-v1-release6_raw-counts.csv', 'Sample Mapping Information']

In [7]:
# Cell 3.8: List Epigenomics Files
epigenomics_dir = "/home/w2sg-arnav/nihwork/als_data/epigenomics/4_matrix"
os.listdir(epigenomics_dir)

['AnswerALS-620-E-v1-release6_DiffBind-raw-counts-minOverlap-0.1.csv',
 'Sample Mapping Information']

### Unified Mapping Check
- This cell creates a mapping between clinical participant IDs and sample IDs across datasets.
- Check for errors (e.g., `FileNotFoundError` for mapping files).
- Inspect `unified_mapping.head()` to ensure it has columns like `Participant ID`, `transcriptomics_Sample_ID`, etc.

In [8]:
# Cell 4: Load and Merge Transcriptomics Data
transcriptomics_df = load_and_merge(
    data_paths["transcriptomics"],
    unified_mapping,
    "transcriptomics",
    index_col=0
)

if transcriptomics_df is None:
    print("Error: Failed to load transcriptomics data. Check the path and try again.")
else:
    transcriptomics_df.to_csv(os.path.join(OUTPUT_DIR, "transcriptomics_merged.csv"), index=False)
    print("Transcriptomics data merged successfully!")

# Display the result
transcriptomics_df.head()

Loaded transcriptomics data from /home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix/AnswerALS-651-T-v1-release6_raw-counts.csv. Shape: (60664, 774)
Index name: None
Columns (sample IDs): ['Geneid', 'CASE-NEUAA599TMX-5310-T', 'CASE-NEUAB000NKC-5726-T', 'CASE-NEUAE228FF6-7459-T', 'CASE-NEUAE993EPR-6287-T']...
Mapping subset for transcriptomics: (734, 2)

Mismatch report for transcriptomics:
  data_type: transcriptomics
  total_data_columns: 774
  total_mapping_ids: 734
  data_with_codes: 694
  mapping_with_codes: 734
  unique_data_codes: 572
  unique_mapping_codes: 734
  common_codes: 559
  match_percentage: 97.72727272727273
Direct matches after normalization: 0 out of 774
Trying enhanced sample matching...
Matched 559 out of 572 columns
Unmatched columns: 13
Mapped 559 columns to participant IDs
After mapping, samples with participant IDs: 681
Error processing /home/w2sg-arnav/nihwork/als_data/transcriptomics/4_matrix/AnswerALS-651-T-v1-release6_raw-counts.csv: agg function fai

AttributeError: 'NoneType' object has no attribute 'head'

### Transcriptomics Data Check
- This cell will likely fail because `TRANSCRIPTOMICS_PATH` is a placeholder.
- Update `TRANSCRIPTOMICS_PATH` in Cell 1 with the correct file name (e.g., the file with `'Geneid'` and sample IDs like `'CASE-NEUAA599TMX-5310-T'`).
- Once fixed, check `transcriptomics_df.head()` to see the merged data.

In [None]:
# Cell 5: Load and Merge Proteomics Data
proteomics_df = load_and_merge(
    data_paths["proteomics"],
    unified_mapping,
    "proteomics",
    delimiter='\t',
    index_col=0
)

if proteomics_df is None:
    print("Error: Failed to load proteomics data. Check the path and try again.")
else:
    proteomics_df.to_csv(os.path.join(OUTPUT_DIR, "proteomics_merged.csv"), index=False)
    print("Proteomics data merged successfully!")

# Display the result
proteomics_df.head()

In [None]:
# Cell 5.6: List Proteomics Files
proteomics_dir = "/home/w2sg-arnav/nihwork/als_data/proteomics/4_matrix"
os.listdir(proteomics_dir)

### Proteomics Data Check
- This cell loads and merges the proteomics data.
- If it fails (e.g., due to `PROTEOMICS_MAP_PATH`), update the path in Cell 1.
- Inspect `proteomics_df.head()` to verify the data.

In [None]:
# Cell 6: Load and Merge Epigenomics Data
epigenomics_methylation_df = load_and_merge(
    data_paths["epigenomics"],
    unified_mapping,
    "epigenomics",
    index_col=0
)

if epigenomics_methylation_df is None:
    print("Error: Failed to load epigenomics data. Check the path and try again.")
else:
    epigenomics_methylation_df.to_csv(os.path.join(OUTPUT_DIR, "epigenomics_methylation_merged.csv"), index=False)
    print("Epigenomics data merged successfully!")

# Display the result
epigenomics_methylation_df.head()

### Epigenomics Data Check
- This cell loads and merges the epigenomics data.
- If it fails (e.g., due to `EPIGENOMICS_METHYL_PATH`), update the path in Cell 1.
- Inspect `epigenomics_methylation_df.head()` to verify the data.

In [None]:
# Cell 7: Integrate Datasets
# Merge the transcriptomics, proteomics, and epigenomics DataFrames on Participant ID
# Start with clinical data to include subject_group_id (ALS case/control labels)
integrated_df = clinical_df[['Participant ID', 'subject_group_id']].copy()
integrated_df = integrated_df.rename(columns={'subject_group_id': 'Disease_Status'}) # Standardize to 'Disease_Status'

# Merge with transcriptomics
if transcriptomics_df is not None:
    integrated_df = integrated_df.merge(transcriptomics_df, left_on='Participant ID', right_index=True, how='outer')
    print(f"After merging transcriptomics, shape: {integrated_df.shape}")

# Merge with proteomics
if proteomics_df is not None:
    integrated_df = integrated_df.merge(proteomics_df, left_on='Participant ID', right_index=True, how='outer')
    print(f"After merging proteomics, shape: {integrated_df.shape}")

# Merge with epigenomics
if epigenomics_methylation_df is not None:
    integrated_df = integrated_df.merge(epigenomics_methylation_df, left_on='Participant ID', right_index=True, how='outer')
    print(f"After merging epigenomics, shape: {integrated_df.shape}")

# Set Participant ID as index for consistency with utils.py functions
integrated_df = integrated_df.set_index('Participant ID')

print("Integrated data shape before cleaning:", integrated_df.shape)
integrated_df.head()

In [None]:
# Cell 8: Data Cleaning, Analysis, and Visualization (using utils.py functions)
# Handle missing values
cleaned_df = handle_missing_values(integrated_df)

# Perform initial analysis
analysis_results = perform_initial_analysis(cleaned_df)

# Create visualizations
visualization_results = visualize_results(cleaned_df, analysis_results)

# Save the cleaned and integrated data
save_integrated_data(cleaned_df, os.path.join(OUTPUT_DIR, "final_integrated_omics_data.csv"))

print("Data cleaning, analysis, and visualization complete!")
print(f"Final cleaned dataset shape: {cleaned_df.shape}")

In [None]:
# Cell 9:  Summary and Confirmation
print("End-to-end multi-omics integration and analysis pipeline completed!")
print(f"Cleaned dataset available at: {os.path.join(OUTPUT_DIR, 'final_integrated_omics_data.csv')}")

# (Optional) Print a summary of analysis results
if 'differential_analysis' in analysis_results:
    for omics, results in analysis_results['differential_analysis'].items():
        if 'num_significant' in results:
            print(f"  {omics}: {results['num_significant']} significant features identified")