# Preprocessing

Import the Python modules used in this notebook.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## Load gene expression and metadata

In [None]:
# Inputs: the brain-only expression matrix and its matching sample metadata.
import_expression_filepath, import_metadata_filepath = (
    "data/processed/expression_brain_only.csv",
    "data/processed/metadata_brain_only.csv",
)

# Outputs: the healthy-vs-GBM subset written at the end of this notebook.
export_expression_filepath, export_metadata_filepath = (
    "data/processed/expression_gbm_healthy.csv",
    "data/processed/metadata_gbm_healthy.csv",
)

In [None]:
# Read the expression table from disk with pandas' default CSV settings,
# then report its dimensions and preview the first rows.
gene_expression_df = pd.read_csv(import_expression_filepath)

print("Expression Shape:", gene_expression_df.shape)
display(gene_expression_df.head())

In [None]:
# Read the sample metadata table from disk with pandas' default CSV settings,
# then report its dimensions and preview the first rows.
metadata_df = pd.read_csv(import_metadata_filepath)

print("\nMetadata Shape:", metadata_df.shape)
display(metadata_df.head())

## Prepare samples

### Label mapping
- 1 - Healthy.
- 2 - Lower Grade Glioma (LGG); not assigned in this notebook, which keeps only healthy and GBM samples.
- 3 - Glioblastoma Multiforme (GBM).

In [None]:
# Split the metadata into the two classes kept for this analysis and tag each
# row with its numeric class label (1 = healthy, 3 = GBM).
is_normal_tissue = metadata_df['_sample_type'] == 'Normal Tissue'
is_primary_tumor = metadata_df['_sample_type'] == 'Primary Tumor'

# Healthy group: every row whose sample type is 'Normal Tissue'.
# `.assign` returns a new frame, so `metadata_df` itself is left untouched.
healthy_metadata = metadata_df.loc[is_normal_tissue].assign(label=1)
print(f"Created Healthy group with {len(healthy_metadata)} samples.")

# All primary-tumor rows regardless of disease; narrowed to GBM just below.
unhealthy_metadata = metadata_df.loc[is_primary_tumor]
print(f"Created Unhealthy group with {len(unhealthy_metadata)} samples.")

# GBM group: primary tumors whose disease column is 'Glioblastoma Multiforme'.
is_gbm = unhealthy_metadata['primary disease or tissue'] == 'Glioblastoma Multiforme'
unhealthy_gbm_metadata = unhealthy_metadata.loc[is_gbm].assign(label=3)
print(f"Created Unhealthy GBM group with {len(unhealthy_gbm_metadata)} samples.")

# Stack healthy rows on top of GBM rows (each keeps its original row index)
# and print a column/dtype summary of the result.
combined_metadata_df = pd.concat([healthy_metadata, unhealthy_gbm_metadata])
combined_metadata_df.info()


In [None]:
# Bar chart of the number of samples per diagnosis class.

# Numeric class label -> display name, plus the bar order and colours.
label_map = {1: 'Healthy', 2: 'LGG', 3: 'GBM'}
plot_order = ['Healthy', 'LGG', 'GBM']
plot_palette = {
    'Healthy': '#457B9D',
    'LGG': '#A8DADC',
    'GBM': '#E63946',
}

# Work on a copy so the helper 'Diagnosis' column is not added to
# `combined_metadata_df`.
plot_df = combined_metadata_df.copy()
plot_df['Diagnosis'] = plot_df['label'].map(label_map)

fig, ax = plt.subplots(figsize=(8, 6))

# 'LGG' stays in `order`, so it gets an x-axis slot even when no row carries
# that diagnosis. hue mirrors x so the palette applies per bar; the redundant
# legend is turned off.
sns.countplot(
    data=plot_df,
    x='Diagnosis',
    hue='Diagnosis',
    order=plot_order,
    palette=plot_palette,
    legend=False,
    ax=ax,
)

ax.set_title('Distribution of Samples by Class', fontsize=16)
ax.set_xlabel('Diagnosis', fontsize=12)
ax.set_ylabel('Sample Count', fontsize=12)
ax.tick_params(axis='x', labelsize=11)
fig.tight_layout()
plt.show()

In [None]:
# Show the combined (healthy + GBM) metadata table as this cell's output.
combined_metadata_df

In [None]:
# Restrict the expression matrix to the samples selected in the metadata.
b_samples_to_keep = combined_metadata_df['sample_id'].tolist()
print(f"Total samples to keep for analysis: {len(b_samples_to_keep)}")

# NOTE(review): the remaining columns are matched against sample IDs below, so
# the matrix appears to hold one column per sample, with the 'sample_id' column
# carrying the row (gene) identifier — confirm against the input file.
expression_indexed_df = gene_expression_df.set_index('sample_id')

# Keep only the columns whose name is one of the selected sample IDs; selected
# IDs that are not columns of the matrix are dropped without an error.
matched_sample_columns = expression_indexed_df.columns.intersection(b_samples_to_keep)
filtered_genes = expression_indexed_df.loc[:, matched_sample_columns].copy()

print(f"Filtered expression matrix shape: {filtered_genes.shape}")

print("\n--- Pipeline B Complete ---")
print(f"Final `gene` matrix shape: {filtered_genes.shape}")
display(filtered_genes.head())

## Store data for ease of use 

In [None]:
# Export one class label per sample, keyed by sample ID.
labels_df = combined_metadata_df.set_index('sample_id').copy()

# `filtered_genes` keeps only the sample IDs found both in the metadata and in
# the expression matrix columns. Selecting the labels by those same columns
# makes the label file contain exactly the samples of the exported expression
# matrix, in the same order; previously it listed every metadata sample in
# metadata order, which could disagree with the expression export.
# NOTE(review): assumes 'sample_id' values are unique in the metadata — a
# duplicated ID would yield more than one label row for that sample.
final_labels_df = labels_df.loc[filtered_genes.columns, ['label']]

final_labels_df.to_csv(export_metadata_filepath, index_label='sample_id')
print(f"Labels saved to: {export_metadata_filepath}")

In [None]:
# Standardise the expression values and write them to disk.

# Transpose so the former columns (sample IDs) become the rows; StandardScaler
# then standardises each column of the transposed frame independently.
gene_T = filtered_genes.transpose()

# NOTE(review): the scaler is fitted on every exported sample. If a train/test
# split is made downstream, confirm that fitting on all samples is intended.
scaler = StandardScaler()
scaler.fit(gene_T)
gene_scaled_array = scaler.transform(gene_T)

# The scaler returns a bare array, so re-attach the row and column labels.
gene_scaled_df = pd.DataFrame(
    data=gene_scaled_array,
    index=gene_T.index,
    columns=gene_T.columns,
)

gene_scaled_df.to_csv(export_expression_filepath, index_label='sample_id')
print(f"Scaled data saved to: {export_expression_filepath}")