# Preprocessing

- Used Python `3.13.1`
- The folder structure is as follows:
  - Unhealthy data: `data/raw/unhealthy/*/*.tsv` 
  - Healthy data: `data/raw/healthy.gct`

## Prepare all unhealthy data
We read all the files inside the `unhealthy` directory and convert all the `.tsv` data to a unified `.csv` file.

In [9]:
import os
import pandas as pd
from glob import glob

# Build one genes x samples TPM matrix from the per-sample TSV files found at
# data/raw/unhealthy/<sub-folder>/<file>.tsv and write it to
# data/processed/unhealthy_matrix.csv.
all_files = glob(os.path.join("data/raw/unhealthy/", "*", "*.tsv"))

# Maps sample name (file name without extension) -> TPM Series indexed by gene_key.
sample_dict = {}

for file_path in all_files:
    sample_name = os.path.splitext(os.path.basename(file_path))[0]
    try:
        # skiprows=1 drops the first line so the second line is used as the
        # header (presumably a leading comment line in the export; confirm).
        df = pd.read_csv(file_path, sep="\t", skiprows=1, usecols=["gene_id", "gene_name", "tpm_unstranded"])
        # A single string key lets pandas align the samples on both identifiers.
        df["gene_key"] = df["gene_id"] + "|" + df["gene_name"]
        df.set_index("gene_key", inplace=True)
        if sample_name in sample_dict:
            # Two sub-folders hold a file with the same base name: the later
            # one wins, but say so instead of silently losing a sample.
            print(f"Warning: duplicate sample name {sample_name!r}; {file_path} replaces the earlier file")
        sample_dict[sample_name] = df["tpm_unstranded"]
    except Exception as e:
        # Best effort: a file that cannot be parsed is reported and skipped.
        print(f"Error processing {file_path}: {e}")

if not sample_dict:
    # Fail with a clear message here rather than with an unrelated pandas
    # error further down when no sample could be loaded.
    raise RuntimeError("No unhealthy samples could be loaded from data/raw/unhealthy/*/*.tsv")

combined_df = pd.DataFrame(sample_dict)

combined_df.index.name = "gene_key"
combined_df.reset_index(inplace=True)
# n=1 splits at the first "|" only, so a gene name that itself contains "|"
# stays intact (assumes gene_id never contains "|"; confirm against the data).
combined_df[["gene_id", "gene_name"]] = combined_df["gene_key"].str.split("|", n=1, expand=True)
combined_df.drop(columns="gene_key", inplace=True)
combined_df.set_index(["gene_id", "gene_name"], inplace=True)
combined_df.sort_index(inplace=True)
# Sort the sample columns so the output does not depend on glob order.
combined_df = combined_df[sorted(combined_df.columns)]

# Make sure the output folder exists before writing.
os.makedirs("data/processed", exist_ok=True)
combined_df.to_csv("data/processed/unhealthy_matrix.csv")


## Preprocess healthy data (.gct)

In [10]:
import pandas as pd

# Convert the healthy GCT file into a plain CSV whose first two columns are
# gene_id and gene_name, followed by the remaining columns of the GCT table.
# skiprows=2 skips the first two lines of the file before the header row.
gct_df = pd.read_csv("data/raw/healthy.gct", sep="\t", skiprows=2)

# Keep the two identifier columns aside under their new names.
gene_id, gene_name = gct_df["Name"], gct_df["Description"]

# Everything that is not an identifier column is carried over unchanged.
expr_df = gct_df.drop(columns=["Name", "Description"])

# Put the identifiers back at the front: gene_id first, gene_name second.
expr_df.insert(0, "gene_id", gene_id)
expr_df.insert(1, "gene_name", gene_name)

expr_df.to_csv("data/processed/healthy_matrix.csv", index=False)


## Align the matrices

In [11]:
import pandas as pd

# Restrict both matrices to the genes they share, keyed by the
# (gene_id, gene_name) pair, and write the two aligned tables.
gene_index_cols = ["gene_id", "gene_name"]

unhealthy_df = pd.read_csv(
    "data/processed/unhealthy_matrix.csv",
    index_col=gene_index_cols,
)
healthy_df = pd.read_csv(
    "data/processed/healthy_matrix.csv",
    index_col=gene_index_cols,
)

# Genes present in both tables.
common_genes = unhealthy_df.index.intersection(healthy_df.index)

# Select the shared genes, then sort so both tables list them in the same order.
unhealthy_common = unhealthy_df.loc[common_genes]
unhealthy_common = unhealthy_common.sort_index()
healthy_common = healthy_df.loc[common_genes]
healthy_common = healthy_common.sort_index()

# Sanity check: both tables must end up with the same number of gene rows.
assert unhealthy_common.shape[0] == healthy_common.shape[0], "Row mismatch after filtering."

unhealthy_common.to_csv("data/processed/unhealthy_aligned.csv")
healthy_common.to_csv("data/processed/healthy_aligned.csv")


## Combine them into one file

In [12]:
import pandas as pd

# Merge the two aligned matrices into one samples x genes table with a
# patient_id column and a binary "healthy" label (1 = healthy, 0 = unhealthy).
healthy = pd.read_csv("data/processed/healthy_aligned.csv", index_col=["gene_id"])
unhealthy = pd.read_csv("data/processed/unhealthy_aligned.csv", index_col=["gene_id"])

# Drop the textual gene_name column BEFORE transposing. Transposing it along
# with the expression values would mix text with numbers (forcing every gene
# column to object dtype) and leave a "gene_name" row that then has to be
# filtered out by matching on patient_id, which would also remove any real
# sample whose ID starts with "gene_name".
healthy_patients_T = healthy.drop(columns="gene_name", errors="ignore").T
unhealthy_patients_T = unhealthy.drop(columns="gene_name", errors="ignore").T

# Class label for each sample.
healthy_patients_T["healthy"] = 1
unhealthy_patients_T["healthy"] = 0

# After the transpose the sample IDs are the row index; keep them as a column
# because the CSV is written without the index.
healthy_patients_T["patient_id"] = healthy_patients_T.index
unhealthy_patients_T["patient_id"] = unhealthy_patients_T.index

combined = pd.concat([healthy_patients_T, unhealthy_patients_T], axis=0)

# Put the two metadata columns first, followed by all gene columns.
gene_cols = [col for col in combined.columns if col not in ["patient_id", "healthy"]]
cols = ["patient_id", "healthy"] + gene_cols
combined = combined[cols]

combined.to_csv("data/processed/combined_labeled.csv", index=False)


## Standardize the values
Both of our datasets report gene expression values in TPM (transcripts per million).

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardize every gene column of the combined, labeled table with one
# StandardScaler fitted on all rows, then save the result.
df = pd.read_csv("data/processed/combined_labeled.csv")

# Every column except the two metadata columns is treated as a gene feature.
metadata_cols = ("patient_id", "healthy")
gene_cols = [name for name in df.columns if name not in metadata_cols]

# One scaler is fitted on, and applied to, the whole dataset at once.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[gene_cols])
df[gene_cols] = scaled_values

# patient_id and healthy are written back unchanged alongside the scaled genes.
df.to_csv("data/processed/combined_labeled_standardized.csv", index=False)