# Preprocessing

Import the Python modules used throughout the notebook.

In [None]:
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Load gene expression and metadata

File paths:

In [None]:
# Raw inputs: tab-separated, gzip-compressed files read by the loading cells below.
xena_metadata_filepath = "data/raw/TcgaTargetGTEX_phenotype.txt.gz"
xena_expression_filepath = "data/raw/TcgaTargetGtex_rsem_gene_tpm.gz"

# Outputs: brain-only subsets written by the final cells of this notebook.
expression_filepath = "data/processed/expression_brain_only.csv"
metadata_filepath = "data/processed/metadata_brain_only.csv"


In [None]:
# Read the expression matrix; the first column of the file becomes the row index.
expression_read_options = dict(sep='\t', index_col=0, compression='gzip')
gene_expression_df = pd.read_csv(xena_expression_filepath, **expression_read_options)

# Quick sanity check: dimensions plus the first few rows.
print("Expression Shape:", gene_expression_df.shape)
display(gene_expression_df.head())

In [None]:
# Read the phenotype/metadata table (tab-separated, decoded as latin-1).
metadata_read_options = {'sep': '\t', 'encoding': 'latin-1'}
metadata_df = pd.read_csv(xena_metadata_filepath, **metadata_read_options)

# Quick sanity check: dimensions plus the first few rows.
print("\nMetadata Shape:", metadata_df.shape)
display(metadata_df.head())

In [None]:
# The metadata table is already loaded into `metadata_df` by the preceding cell,
# so it is not read from disk a second time here. Instead, verify up front that
# every metadata column the later cells rely on is actually present, so a schema
# problem surfaces here rather than halfway through the plotting cells.
required_metadata_columns = [
    'sample',
    '_primary_site',
    '_study',
    '_sample_type',
    'primary disease or tissue',
]
missing_metadata_columns = [
    column for column in required_metadata_columns
    if column not in metadata_df.columns
]
assert not missing_metadata_columns, (
    f"Metadata is missing expected columns: {missing_metadata_columns}"
)

print("\nMetadata Shape:", metadata_df.shape)
display(metadata_df.head())

## Gene map script
Script for mapping Ensembl gene IDs (e.g. ENSG00000138435) to gene symbols (e.g. IDH1).

In [None]:
a_tcga_download_dir = "data/raw/gdc/"

map_filepath = "data/processed/gene_map.csv"


def build_gene_map(counts_df):
    """Build a gene-symbol / Ensembl-ID lookup table from one gene-counts table.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Table containing at least the columns ``gene_id`` and ``gene_name``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``gene_name`` and ``gene_id_clean`` (the ``gene_id`` with
        everything from the first ``.`` onward removed), restricted to rows
        whose ``gene_id`` starts with ``ENSG`` and with only the first row kept
        for each ``gene_name``.

    Raises
    ------
    KeyError
        If ``gene_id`` or ``gene_name`` is missing from ``counts_df``.
    """
    # Keep only real genes (IDs starting with 'ENSG'). na=False treats a
    # missing gene_id as "not a gene" instead of producing a NaN mask that
    # cannot be used for boolean indexing.
    is_gene = counts_df['gene_id'].str.startswith('ENSG', na=False)

    # .assign() returns a new frame, so no column is ever written onto a slice
    # of another frame (which is what triggers SettingWithCopyWarning).
    gene_map = (
        counts_df.loc[is_gene, ['gene_id', 'gene_name']]
        # Strip the version suffix (e.g. "ENSG...15" -> "ENSG...")
        .assign(gene_id_clean=lambda frame: frame['gene_id'].str.split('.').str[0])
        .loc[:, ['gene_name', 'gene_id_clean']]
        # Some symbols are repeated; keep the first occurrence of each
        .drop_duplicates(subset=['gene_name'])
    )
    return gene_map


print("Starting gene map creation...")

try:
    search_pattern = os.path.join(a_tcga_download_dir, '*', '*star*gene_counts.tsv')
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the choice of the file read below reproducible between runs.
    a_tcga_all_files = sorted(glob.glob(search_pattern))

    if not a_tcga_all_files:
        raise FileNotFoundError(f"No '*star*gene_counts.tsv' files found in {a_tcga_download_dir}")

    # Only one file is read to build the map.
    one_tcga_file_path = a_tcga_all_files[0]
    print(f"Reading file: {one_tcga_file_path}")

    temp_df = pd.read_csv(
        one_tcga_file_path,
        sep='\t',
        comment='#',
        header=0
    )

    final_map_df = build_gene_map(temp_df)

    # Save the map to a file
    final_map_df.to_csv(map_filepath, index=False)

    print("\n--- SUCCESS ---")
    print(f"Gene map with {len(final_map_df)} genes was successfully saved to: {map_filepath}")

# Only the failures this step can reasonably expect are reported and skipped:
# missing/unreadable files (OSError, which includes FileNotFoundError),
# missing columns (KeyError) and malformed input. Anything else is a
# programming error and is allowed to propagate.
except (OSError, KeyError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print("\n--- ERROR ---")
    print(f"An error occurred: {e}")
    print("Please ensure 'a_tcga_download_dir' is set to the correct folder path.")

## Drop duplicates and missing values

In [None]:
# Clean both tables: remove duplicated entries, then remove entries with
# missing values. Produces `expression_df` (cleaned copy of
# `gene_expression_df`) and rebinds `metadata_df` to its cleaned version.
print(f"Original shape of gene expression data: {gene_expression_df.shape}")

# --- Duplicate genes (rows of the expression matrix) ---
# Duplicates are detected on the row index, so they are also removed on the
# row index. DataFrame.drop_duplicates() compares row *values* instead and
# would leave behind rows that share an ID but differ in their values.
duplicate_gene_mask = gene_expression_df.index.duplicated(keep='first')
genes_duplicated = duplicate_gene_mask.sum()
print(f"Duplicate genes (rows) found: {genes_duplicated}")

if genes_duplicated > 0:
    print("Removing duplicate genes...")
    expression_df = gene_expression_df.loc[~duplicate_gene_mask]
    print(f"New genes shape after dropping duplicate genes: {expression_df.shape}")
else:
    expression_df = gene_expression_df

# --- Duplicate metadata (rows of the metadata table) ---
# Count duplicated rows, because rows are what drop_duplicates() removes.
metadata_duplicated = metadata_df.duplicated().sum()
print(f"Duplicate metadata (rows) found: {metadata_duplicated}")

if metadata_duplicated > 0:
    print("Removing duplicate metadata...")
    metadata_df = metadata_df.drop_duplicates()
    print(f"New metadata shape after dropping duplicate samples: {metadata_df.shape}")


# --- Check for NA (Missing) Values ---
# expression_df.isna().any() checks each column for NAs.
# .any() again checks if *any* column returned True.
na_values_found = expression_df.isna().any().any()
print(f"NA (missing) gene values found: {na_values_found}")

if na_values_found:
    # An NA value will break the model. The strategy used here
    # is to remove the entire gene (row) that contains an NA.
    original_gene_count = expression_df.shape[0]

    # .dropna(axis=0) drops any ROW (gene) containing an NA.
    expression_df = expression_df.dropna(axis=0)

    genes_removed = original_gene_count - expression_df.shape[0]
    print(f"Removed {genes_removed} genes that contained NA values.")
    print(f"New shape after dropping NA genes: {expression_df.shape}")

na_values_found = metadata_df.isna().any().any()
print(f"NA (missing) metadata values found: {na_values_found}")

if na_values_found:
    # Remove every metadata row that has an NA in ANY column.
    # NOTE(review): this also discards rows whose only missing value is in a
    # column that is never used later - confirm that this is intended.
    original_metadata_count = metadata_df.shape[0]

    # .dropna(axis=0) drops any ROW of the metadata table containing an NA.
    metadata_df = metadata_df.dropna(axis=0)

    metadata_removed = original_metadata_count - metadata_df.shape[0]
    print(f"Removed {metadata_removed} metadata that contained NA values.")
    print(f"New shape after dropping NA metadata: {metadata_df.shape}")

print("\n--- Cleaning Complete ---")
print(f"Final gene expression shape: {expression_df.shape}")
print(f"Final metadata shape: {metadata_df.shape}")

## Distribution of primary sites / cancer types

In [None]:
# Horizontal bar chart of how many samples fall into each primary site,
# limited to the `top_n` most frequent sites when there are more than that.
col = '_primary_site'
top_n = 30

site_counts = metadata_df[col].value_counts()
num_categories = metadata_df[col].nunique()
too_many_categories = num_categories > top_n

# Pick the categories to draw and a title that says whether the list was cut.
top_categories = (
    site_counts.nlargest(top_n).index if too_many_categories else site_counts.index
)
plot_title = (
    f'Distribution of "{col}" (Top {top_n} of {num_categories})'
    if too_many_categories
    else f'Distribution of "{col}" (All {num_categories})'
)

plt.figure(figsize=(10, 8))
countplot_options = dict(
    y=col,
    data=metadata_df,
    order=top_categories,
    hue=col,
    legend=False,
)
sns.countplot(**countplot_options)

plt.title(plot_title, fontsize=15)
plt.xlabel('Sample Count', fontsize=12)
plt.ylabel(col, fontsize=12)
plt.tight_layout()
plt.show()  # Display the plot

In [None]:
# Keep only the samples whose primary site starts with 'Brain'; rows with a
# missing primary site are treated as non-matching (na=False).
# Per the original author's note, a prefix match is used so that both a plain
# 'Brain' label and 'Brain - ...' style labels are kept.
primary_site = metadata_df['_primary_site']
filter_condition = primary_site.str.startswith('Brain', na=False)
metadata_brain_df = metadata_df[filter_condition]

print(f"Original shape: {metadata_df.shape}")
print(f"Filtered (Brain only) shape: {metadata_brain_df.shape}")

# Compare which studies are represented before and after the filter.
original_studies = set(metadata_df['_study'])
filtered_studies = set(metadata_brain_df['_study'])

print(f"\nOriginal studies: {original_studies}")
print(f"Filtered studies: {filtered_studies}")

# 'TARGET' was present originally and is gone after filtering.
studies_removed = original_studies - filtered_studies
if 'TARGET' in studies_removed:
    print("Observation confirmed: 'TARGET' samples were successfully filtered out.")

In [None]:
def plot_category_counts(data, col, ax, title_prefix, xlabel, ylabel, top_n=30):
    """Draw a horizontal count plot of the values of ``col`` on ``ax``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the column to summarise.
    col : str
        Name of the categorical column to count.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    title_prefix : str
        Text placed before the column name in the title (e.g. ``'Full'``).
    xlabel : str
        Label for the count axis.
    ylabel : str or None
        Label for the category axis; ``None`` clears the label.
    top_n : int, default 30
        Maximum number of categories shown; when the column has more distinct
        values, only the ``top_n`` most frequent are drawn.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that was drawn on.
    """
    counts = data[col].value_counts()
    num_categories = data[col].nunique()

    if num_categories > top_n:
        title = f'{title_prefix}: "{col}" (Top {top_n} of {num_categories})'
        order = counts.nlargest(top_n).index
    else:
        title = f'{title_prefix}: "{col}" (All {num_categories})'
        order = counts.index

    sns.countplot(y=col, data=data, order=order, hue=col, legend=False, ax=ax)
    ax.set_title(title, fontsize=15)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    return ax


def plot_full_vs_filtered(full_df, filtered_df, col, figsize):
    """Show the distribution of ``col`` before (left) and after (right) filtering.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Unfiltered metadata, drawn in the left panel.
    filtered_df : pandas.DataFrame
        Brain-filtered metadata, drawn in the right panel.
    col : str
        Name of the categorical column to compare.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    tuple
        ``(fig, (ax1, ax2))`` as returned by ``plt.subplots``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    # --- Left Plot (Full Dataset) ---
    plot_category_counts(
        full_df, col, ax1,
        title_prefix='Full',
        xlabel='Sample Count (Full Dataset)',
        ylabel=col,
    )

    # --- Right Plot (Filtered Dataset) ---
    plot_category_counts(
        filtered_df, col, ax2,
        title_prefix='Filtered',
        xlabel='Sample Count (Brain Filtered)',
        ylabel=None,  # Remove redundant y-label
    )

    plt.tight_layout()
    plt.show()
    return fig, (ax1, ax2)


# One side-by-side figure per metadata column. The figure height differs per
# column, as in the three hand-written plotting cells this loop replaces.
comparison_plots = [
    ('_study', (16, 5)),
    ('primary disease or tissue', (16, 10)),
    ('_sample_type', (16, 7)),
]

for col, comparison_figsize in comparison_plots:
    fig, (ax1, ax2) = plot_full_vs_filtered(
        metadata_df, metadata_brain_df, col, comparison_figsize
    )

In [None]:
# Restrict the expression matrix to the samples kept in the brain-only metadata.
b_samples_to_keep = metadata_brain_df['sample'].tolist()

print(f"Total samples to keep for analysis: {len(b_samples_to_keep)}")

# Subset the CLEANED matrix (`expression_df`, produced by the duplicate/NA
# cell) rather than the raw `gene_expression_df`; otherwise the de-duplication
# and NA removal performed there would be silently discarded.
matched_samples = expression_df.columns.intersection(b_samples_to_keep)

# Metadata samples without an expression column are dropped by the
# intersection; report how many so the mismatch does not go unnoticed.
unmatched_sample_count = len(set(b_samples_to_keep) - set(matched_samples))
print(f"Samples in metadata without expression data: {unmatched_sample_count}")

filtered_genes = expression_df[matched_samples].copy()

print("\n--- Pipeline Complete ---")
print(f"Final `gene` matrix shape: {filtered_genes.shape}")
display(filtered_genes.head())

## Store data for ease of use 

In [None]:
# Work on an independent copy indexed by the 'sample' column, then write it
# out with that index column named 'sample_id' in the CSV header.
metadata_brain_copy = metadata_brain_df.copy().set_index('sample')

metadata_brain_copy.to_csv(path_or_buf=metadata_filepath, index_label='sample_id')
print(f"Labels saved to: {metadata_filepath}")

In [None]:
# Write the brain-only expression matrix; index_label names the first CSV column.
# NOTE(review): `filtered_genes` was built by selecting sample columns, so its
# row index appears to hold gene IDs rather than sample IDs. The 'sample_id'
# label is kept unchanged so existing readers of this file keep working -
# confirm whether it should be renamed.
filtered_genes.to_csv(expression_filepath, index_label='sample_id')
# No scaling is applied anywhere in this notebook, so the message no longer
# describes the saved data as "scaled".
print(f"Expression data saved to: {expression_filepath}")