# Preprocessing

- Used Python `3.13.1`
- Folder structure as following:
  - Unhealthy data: `data/raw/unhealthy/*/*.tsv` 
  - Healthy data: `data/raw/healthy.gct`

## Prepare all unhealthy data
We read all the files inside the `unhealthy` directory and convert all the `.tsv` data to a unified `.csv` file.

In [1]:
import os
import pandas as pd
from glob import glob

all_files = glob(os.path.join("data/raw/unhealthy/", "*", "*.tsv"))

sample_dict = {}

for file_path in all_files:
    sample_name = os.path.splitext(os.path.basename(file_path))[0]
    try:
        df = pd.read_csv(file_path, sep="\t", usecols=["gene_id", "gene_name", "copy_number"])
        df["gene_key"] = df["gene_id"] + "|" + df["gene_name"]
        df.set_index("gene_key", inplace=True)
        sample_dict[sample_name] = df["copy_number"]
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

combined_df = pd.DataFrame(sample_dict)

combined_df.index.name = "gene_key"
combined_df.reset_index(inplace=True)
combined_df[["gene_id", "gene_name"]] = combined_df["gene_key"].str.split("|", expand=True)
combined_df.drop(columns="gene_key", inplace=True)
combined_df.set_index(["gene_id", "gene_name"], inplace=True)
combined_df.sort_index(inplace=True)
combined_df = combined_df[sorted(combined_df.columns)]

combined_df.to_csv("data/processed/unhealthy_matrix.csv")


Error processing data/raw/unhealthy\00252d7f-e222-462f-badc-b97e8dce2021\5abd235a-829b-4457-8f47-ecd1adab30ca.rna_seq.augmented_star_gene_counts.tsv: Number of passed names did not match number of header fields in the file
Error processing data/raw/unhealthy\00f8b5fa-e195-42bd-a3a7-95a97a33aea6\55731cdc-3348-4cfd-8bf1-d5a296057110.rna_seq.augmented_star_gene_counts.tsv: Number of passed names did not match number of header fields in the file
Error processing data/raw/unhealthy\0149679a-8220-47ba-9060-eda9329bee70\2e7d0821-ecd1-4e3f-ace3-b160b2e6f690.rna_seq.augmented_star_gene_counts.tsv: Number of passed names did not match number of header fields in the file
Error processing data/raw/unhealthy\01851b83-f8c4-4108-99c9-248fe29e39e9\6dd12311-581d-46fd-9b98-6523b67ec2e4.rna_seq.augmented_star_gene_counts.tsv: Number of passed names did not match number of header fields in the file
Error processing data/raw/unhealthy\02c1c266-473f-437d-b768-20f685349ecb\8a31c02d-a429-4d4e-a461-8024bea95b1


KeyboardInterrupt



## Preprocess healthy data (.gct)

In [None]:
import pandas as pd

gct_df = pd.read_csv("data/raw/healthy.gct", sep='\t', skiprows=2)

gene_id = gct_df["Name"]
gene_name = gct_df["Description"]

expr_df = gct_df.drop(columns=["Name", "Description"])

expr_df.insert(0, "gene_name", gene_name)
expr_df.insert(0, "gene_id", gene_id)

expr_df.to_csv("data/processed/healthy_matrix.csv", index=False)


## Align the matrixes

In [None]:
import pandas as pd

unhealthy_df = pd.read_csv("data/processed/unhealthy_matrix.csv", index_col=["gene_id", "gene_name"])
healthy_df = pd.read_csv("data/processed/healthy_matrix.csv", index_col=["gene_id", "gene_name"])

common_genes = unhealthy_df.index.intersection(healthy_df.index)

unhealthy_common = unhealthy_df.loc[common_genes].sort_index()
healthy_common = healthy_df.loc[common_genes].sort_index()

assert unhealthy_common.shape[0] == healthy_common.shape[0], "Row mismatch after filtering."

unhealthy_common.to_csv("data/processed/unhealthy_aligned.csv")
healthy_common.to_csv("data/processed/healthy_aligned.csv")


## Combine them into 1 file

In [None]:
import pandas as pd

healthy = pd.read_csv("data/processed/healthy_aligned.csv", index_col=["gene_id"])
unhealthy = pd.read_csv("data/processed/unhealthy_aligned.csv", index_col=["gene_id"])

healthy_patients_T = healthy.T
unhealthy_patients_T = unhealthy.T

healthy_patients_T["healthy"] = 1
unhealthy_patients_T["healthy"] = 0

healthy_patients_T["patient_id"] = healthy_patients_T.index
unhealthy_patients_T["patient_id"] = unhealthy_patients_T.index

combined = pd.concat([healthy_patients_T, unhealthy_patients_T], axis=0)

gene_cols = [col for col in combined.columns if col not in ["patient_id", "healthy"]]
cols = ["patient_id", "healthy"] + gene_cols
combined = combined[cols]
combined = combined[~combined["patient_id"].str.startswith("gene_name")] # Remove gene_name rows

combined.to_csv("data/processed/combined_labeled.csv", index=False)


## Standardize the values
We have unhealthy data using copy_number and healthy data using expression.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data/processed/combined_labeled.csv")

gene_cols = [c for c in df.columns if c not in ("patient_id", "healthy")]

# healthy_df = df[df["healthy"] == 1].copy()
# unhealthy_df = df[df["healthy"] == 0].copy()

scaler_h = StandardScaler()
# scaler_uh = StandardScaler()

healthy_df[gene_cols] = scaler_h.fit_transform(df[gene_cols])
# unhealthy_df[gene_cols] = scaler_h.fit_transform(unhealthy_df[gene_cols])

# df_scaled = pd.concat([healthy_df], axis=0)
healthy_df.to_csv("data/processed/combined_labeled_standardized2.csv", index=False)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Saved standardized (separate) dataset to combined_labeled_standardized_sep.csv
