# Preprocessing

- Used Python `3.13.1`
- Folder structure is as follows:
  - Unhealthy data: `data/raw/unhealthy/*/*.tsv` 
  - Healthy data: `data/raw/healthy.gct`

## Prepare all unhealthy data
We read all the files inside the `unhealthy` directory and convert all the `.tsv` data to a unified `.csv` file.

### TODO
Normalize the data to a scale from 0 to 1 (0 = 0%, 0.5 = 50%, 1 = 100%).

Using the values from the unhealthy data, 0 stays 0 and the highest value becomes 100%; scale all other data by that ratio.

In [1]:
import os
import pandas as pd
from glob import glob

# Collect every per-sample TSV under data/raw/unhealthy/<subdir>/.
# Sorted so the processing order (and any warning/error output) is reproducible.
all_files = sorted(glob(os.path.join("data/raw/unhealthy/", "*", "*.tsv")))

# Maps sample name -> Series of copy_number values indexed by "gene_id|gene_name".
sample_dict = {}

for file_path in all_files:
    # The file name without its extension is used as the sample (column) name.
    sample_name = os.path.splitext(os.path.basename(file_path))[0]
    if sample_name in sample_dict:
        # Same base name in two sub-folders: the later file replaces the earlier one.
        print(f"Warning: duplicate sample name {sample_name!r}; {file_path} overwrites the earlier file")
    try:
        df = pd.read_csv(file_path, sep="\t", usecols=["gene_id", "gene_name", "copy_number"])
        # A single combined key lets pandas align genes across samples.
        df["gene_key"] = df["gene_id"] + "|" + df["gene_name"]
        df.set_index("gene_key", inplace=True)
        sample_dict[sample_name] = df["copy_number"]
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error processing {file_path}: {e}")

# Fail with a clear message instead of an obscure pandas error further down
# when nothing was found or every file failed to load.
if not sample_dict:
    raise RuntimeError("No unhealthy samples could be loaded from data/raw/unhealthy/*/*.tsv")

# Combine into one matrix (rows = genes, columns = samples)
combined_df = pd.DataFrame(sample_dict)

# Split the combined key back into gene_id and gene_name.
# n=1 splits on the first "|" only, so a gene_name that itself contains "|"
# still yields exactly two columns.
combined_df.index.name = "gene_key"
combined_df.reset_index(inplace=True)
combined_df[["gene_id", "gene_name"]] = combined_df["gene_key"].str.split("|", n=1, expand=True)
combined_df.drop(columns="gene_key", inplace=True)
combined_df.set_index(["gene_id", "gene_name"], inplace=True)

# Sort rows and columns
combined_df.sort_index(inplace=True)
combined_df = combined_df[sorted(combined_df.columns)]

# Save to CSV (create the output folder on a fresh checkout)
os.makedirs("data/processed", exist_ok=True)
combined_df.to_csv("data/processed/unhealthy_matrix.csv")


## Preprocess healthy data (.gct)

### TODO
Normalize the data to a scale from 0 to 1 (0 = 0%, 0.5 = 50%, 1 = 100%).

Using the values from the unhealthy data, 0 stays 0 and the highest value becomes 100%; scale all other data by that ratio.

In [2]:
import pandas as pd

def gct_to_gene_matrix(
    filepath: str,
    output_path: str = "data/processed/healthy_matrix.csv",
) -> pd.DataFrame:
    """Convert a GCT file into a gene matrix, save it as CSV and return it.

    The first two lines of the file are skipped as metadata; the third line
    is used as the header. The ``Name`` and ``Description`` columns are
    renamed to ``gene_id`` and ``gene_name`` and placed first, followed by
    all remaining columns in their original order.

    Args:
        filepath: Path to the tab-separated ``.gct`` file.
        output_path: Where to write the CSV. Missing parent directories are
            created.

    Returns:
        The gene matrix that was written to ``output_path``.

    Raises:
        KeyError: If the file has no ``Name`` or ``Description`` column.
    """
    import os  # local import: this cell only imports pandas at the top

    # Read GCT file (skip the two metadata lines)
    gct_df = pd.read_csv(filepath, sep="\t", skiprows=2)

    # Gene identifier columns first, then every other column as-is
    id_cols = ["Name", "Description"]
    other_cols = [col for col in gct_df.columns if col not in id_cols]
    expr_df = gct_df[id_cols + other_cols].rename(
        columns={"Name": "gene_id", "Description": "gene_name"}
    )

    # Save to CSV, creating the target folder if needed
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    expr_df.to_csv(output_path, index=False)

    return expr_df

# Run the conversion on the raw healthy GCT file; the function itself writes the resulting CSV.
gct_to_gene_matrix("data/raw/healthy.gct")

## Align the matrices

In [3]:
import pandas as pd

# Load both matrices, indexed by (gene_id, gene_name)
unhealthy_df = pd.read_csv("data/processed/unhealthy_matrix.csv", index_col=["gene_id", "gene_name"])
healthy_df = pd.read_csv("data/processed/healthy_matrix.csv", index_col=["gene_id", "gene_name"])

# Find the genes present in both datasets
common_genes = unhealthy_df.index.intersection(healthy_df.index)
if common_genes.empty:
    # Without this check, two empty aligned files would be written silently.
    raise ValueError("No common genes between the unhealthy and healthy matrices.")

# Filter and sort both datasets to match gene order
unhealthy_common = unhealthy_df.loc[common_genes].sort_index()
healthy_common = healthy_df.loc[common_genes].sort_index()

# Verify real alignment: compare the gene indexes themselves, not just the
# row counts (equal counts can hide differing genes when an index has duplicates).
if not unhealthy_common.index.equals(healthy_common.index):
    raise ValueError("Row mismatch after filtering.")

# Save aligned versions
unhealthy_common.to_csv("data/processed/unhealthy_aligned.csv")
healthy_common.to_csv("data/processed/healthy_aligned.csv")


## Combine them

In [None]:
import pandas as pd

# Load the aligned data with gene_id as the row index.
# gene_name is dropped before transposing: as a regular column it would be
# turned into a bogus "gene_name" patient row and make every gene column
# hold mixed string/number values.
healthy = pd.read_csv("data/processed/healthy_aligned.csv", index_col=["gene_id"]).drop(columns="gene_name")
unhealthy = pd.read_csv("data/processed/unhealthy_aligned.csv", index_col=["gene_id"]).drop(columns="gene_name")

# Transpose so rows are patients, columns are genes (named by gene_id)
healthy_patients_T = healthy.T
unhealthy_patients_T = unhealthy.T

# Add labels: 1 = healthy, 0 = unhealthy
healthy_patients_T["healthy"] = 1
unhealthy_patients_T["healthy"] = 0

# Add patient_id as a column (the index is not written to the CSV below)
healthy_patients_T["patient_id"] = healthy_patients_T.index
unhealthy_patients_T["patient_id"] = unhealthy_patients_T.index

# Combine datasets
combined = pd.concat([healthy_patients_T, unhealthy_patients_T], axis=0)

# Reorder columns to put patient_id and healthy first
gene_cols = [col for col in combined.columns if col not in ["patient_id", "healthy"]]
cols = ["patient_id", "healthy"] + gene_cols
combined = combined[cols]

# Save to csv
combined.to_csv("data/processed/combined_labeled.csv", index=False)
