## MCE

In [None]:
import pandas as pd

import pandas as pd

# MCE 10 µM pipeline: load raw counts + metadata, average replicates per
# treatment, attach SMILES, normalise to log1p(CPM) and save one CSV.
import numpy as np  # needed for np.log1p in step 8; this cell previously imported only pandas

# ---- 1. Load counts table (Excel) ----
# header=1: column names are taken from the second row of the sheet.
counts = pd.read_excel(
    "/Users/hornung_comp1/Downloads/MCE_Bioactive_Compounds_HEK293T_10μM_Counts.xlsx",
    engine="openpyxl", header=1
)

# ---- 2. Load meta table (CSV) ----
meta = pd.read_csv(
    "/Users/hornung_comp1/meta_MCE_with_smiles_pubchem_parallel.csv",
)

# ---- 3. Merge so each count row knows its treatment & metadata ----
# Left join: every count row is kept, even when no metadata row matches.
df = counts.merge(
    meta,
    left_on='Sample_id',
    right_on='unique_ID',
    how='left'
)

# groupby() excludes rows whose key is NaN by default, so count rows without
# a treatment label would vanish from the averages silently. Report them.
n_unlabelled = int(df['treatment'].isna().sum())
if n_unlabelled:
    print(
        f"Warning: {n_unlabelled} count rows have no 'treatment' value "
        "and are excluded from the per-treatment averages"
    )

# Every column NOT listed here is treated as a gene count column.
non_gene_cols = [
    'Sample_id',
    'unique_ID',
    'experiment_no',
    'sample_plate',
    'sample_row',       # numeric but NOT a gene
    'sample_column',    # numeric but NOT a gene
    'cell_id',
    'pert_itime',
    'pert_idose',
    'sample',
    'treatment',
    'Catalog Number',
    'Compound name',
    'CAS Number',
    'smiles'
]

gene_cols = [c for c in df.columns if c not in non_gene_cols]

# ---- 4. Average across replicates of the same treatment ----
avg_counts = (
    df
    .groupby('treatment')[gene_cols]
    .mean()
    .reset_index()
)

# ---- 5. Per-treatment meta (only treatments with ≥1 SMILES) ----
smiles_per_treat = (
    df[['treatment', 'smiles', 'Catalog Number', 'Compound name', 'CAS Number']]
    .dropna(subset=['smiles'])               # keep only rows where SMILES is present
    .drop_duplicates(subset=['treatment'])   # one meta row per treatment (first occurrence wins)
)

# Inner join: treatments with no SMILES in any replicate are dropped here.
avg_with_smiles = avg_counts.merge(
    smiles_per_treat,
    on='treatment',
    how='inner'
)

# ---- 6. Add dose + seq_platform ----
avg_with_smiles['pert_idose'] = 10          # 10 µM
avg_with_smiles['seq_platform'] = 0         # arbitrary platform code

# ---- 7. Reorder columns: meta first, then genes ----
meta_first = [
    'treatment',
    'smiles',
    'Catalog Number',
    'Compound name',
    'CAS Number',
    'pert_idose',
    'seq_platform'
]

gene_cols_final = [c for c in avg_with_smiles.columns if c not in meta_first]

avg_with_smiles = avg_with_smiles[meta_first + gene_cols_final]

# ---- 8. Normalize gene counts to CPM → log1p ----
# Each row is divided by its own total count, then scaled to one million.
# NOTE(review): a row whose total is 0 would produce NaN here — confirm that
# cannot occur in this dataset.
cpm = (
    avg_with_smiles[gene_cols_final]
    .div(avg_with_smiles[gene_cols_final].sum(axis=1), axis=0)
    * 1e6
)

log1p_cpm = np.log1p(cpm)

# ---- 9. Keep only smiles, dose, platform + genes ----
meta_for_ml = ['smiles', 'pert_idose', 'seq_platform']

# Both pieces share avg_with_smiles' index, so the column-wise concat lines
# rows up one-to-one.
final_df = pd.concat(
    [avg_with_smiles[meta_for_ml], log1p_cpm],
    axis=1
)

# ---- 10. Save one CSV (no X/Y split yet) ----
out_path = "/Users/hornung_comp1/MCE_HEK293T_10uM_averaged_log1pCPM.csv"
final_df.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print(f"Shape: {final_df.shape}")
print("Columns (first 10):", final_df.columns[:10].tolist())



In [None]:
import pandas as pd

# Split the averaged MCE table into model inputs (X) and gene targets (Y),
# then write each part to its own CSV.

# Input feature columns; every other column is treated as a gene target.
X_cols = ['smiles', 'pert_idose', 'seq_platform']

df = pd.read_csv("/Users/hornung_comp1/MCE_HEK293T_10uM_averaged_log1pCPM.csv")

# Gene columns, in file order.
Y_cols = df.columns[~df.columns.isin(X_cols)].tolist()

X = df.loc[:, X_cols]
Y = df.loc[:, Y_cols].astype(float)   # fails loudly if a target column is not numeric

X.to_csv("/Users/hornung_comp1/MCE_HEK293T_10uM_X.csv", index=False)
Y.to_csv("/Users/hornung_comp1/MCE_HEK293T_10uM_Y.csv", index=False)

print("Saved X and Y")
print("X shape:", X.shape)
print("Y shape:", Y.shape)


## TCM 10 µM

In [None]:
import pandas as pd

import pandas as pd

# TCM 10 µM, part 1: load raw counts + metadata, merge them, and average
# replicate samples per treatment ('Treat').

# ---- 1. Load counts table (Excel) ----
# header=1: column names are taken from the second row of the sheet.
counts = pd.read_excel(
    "/Users/hornung_comp1/Downloads/TCM_Compounds_HEK293T_10_Counts.xlsx",
    engine="openpyxl", header=1
)

# ---- 2. Load meta table (CSV) ----
meta = pd.read_csv(
    "/Users/hornung_comp1/meta_TCM_with_smiles_pubchem_parallel_5.csv",
)

# ---- 3. Attach metadata to every count row ----
# Left join: every count row is kept, even when no metadata row matches.
df = counts.merge(
    meta,
    on='Sample_unique_id',   # SAME column name in both files
    how='left'
)

# groupby() excludes rows whose key is NaN by default, so count rows without
# a 'Treat' label would vanish from the averages silently. Report them.
n_unlabelled = int(df['Treat'].isna().sum())
if n_unlabelled:
    print(
        f"Warning: {n_unlabelled} count rows have no 'Treat' value "
        "and are excluded from the per-treatment averages"
    )

# Every column NOT listed here is treated as a gene count column.
non_gene_cols = [
    'Sample_unique_id',
    'Treat',
    'Plate',
    'library',          # numeric but NOT a gene
    'Cell',
    'Dose',
    'Time',
    'Compound name',
    'Catalog Number',
    'Catalog Number.1',
    'smiles'
]

gene_cols = [c for c in df.columns if c not in non_gene_cols]

# Sanity check: any column listed here is non-numeric yet was classified as a
# gene; an empty Index is the expected output.
print(df[gene_cols].select_dtypes(exclude='number').columns)

# ---- 4. Average across replicates of the same treatment ----
avg_counts = (
    df
    .groupby('Treat')[gene_cols]
    .mean()
    .reset_index()
)



In [None]:
# TCM 10 µM, part 2: attach SMILES to the per-treatment averages, normalise
# to log1p(CPM) and save. Relies on `df` and `avg_counts` from the previous cell.
import numpy as np  # needed for np.log1p in step 8; numpy was never imported before this cell

# ---- 5. Per-compound meta (only Treats with ≥1 SMILES) ----
smiles_per_treat = (
    df[['Treat', 'smiles', 'Compound name', 'Catalog Number', 'Catalog Number.1', 'Dose']]
    .dropna(subset=['smiles'])             # keep only rows where SMILES is present
    .drop_duplicates(subset=['Treat'])     # one meta row per Treat (first occurrence wins)
)

# Inner join: Treats with no SMILES in any replicate are dropped here.
avg_with_smiles = avg_counts.merge(
    smiles_per_treat,
    on='Treat',
    how='inner'
)

# ---- 6. Add numeric dose + seq_platform ----
# The original 'Dose' column is kept as-is (presumably a string like '10uM');
# the model gets a plain numeric dose instead.
avg_with_smiles['pert_idose'] = 10          # 10 µM
avg_with_smiles['seq_platform'] = 1         # arbitrary platform code

# ---- 7. Reorder columns: meta first, then genes ----
meta_first = [
    'Treat',            # group key
    'smiles',
    'Compound name',
    'Catalog Number',   # compound ID
    'Catalog Number.1', # CAS
    'Dose',             # original string dose
    'pert_idose',
    'seq_platform'
]

gene_cols_final = [c for c in avg_with_smiles.columns if c not in meta_first]

avg_with_smiles = avg_with_smiles[meta_first + gene_cols_final]

# ---- 8. Normalize gene counts to CPM → log1p ----
# Each row is divided by its own total count, then scaled to one million.
# NOTE(review): a row whose total is 0 would produce NaN here — confirm that
# cannot occur in this dataset.
cpm = (
    avg_with_smiles[gene_cols_final]
    .div(avg_with_smiles[gene_cols_final].sum(axis=1), axis=0)
    * 1e6
)

log1p_cpm = np.log1p(cpm)

# ---- 9. Keep only smiles, dose, platform + genes for ML ----
meta_for_ml = ['smiles', 'pert_idose', 'seq_platform']

# Both pieces share avg_with_smiles' index, so rows line up one-to-one.
final_df = pd.concat(
    [avg_with_smiles[meta_for_ml], log1p_cpm],
    axis=1
)

# ---- 10. Save one CSV (no X/Y split yet) ----
out_path = "/Users/hornung_comp1/TCM_HEK293T_10uM_averaged_log1pCPM.csv"
final_df.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print(f"Shape: {final_df.shape}")
print("Columns (first 10):", final_df.columns[:10].tolist())


In [None]:
import pandas as pd

# Split the averaged TCM 10 µM table into model inputs (X) and gene targets
# (Y), then write each part to its own CSV.

# Input feature columns; every other column is treated as a gene target.
X_cols = ['smiles', 'pert_idose', 'seq_platform']

df = pd.read_csv("/Users/hornung_comp1/TCM_HEK293T_10uM_averaged_log1pCPM.csv")

# Gene columns, in file order.
Y_cols = df.columns[~df.columns.isin(X_cols)].tolist()

X = df.loc[:, X_cols]
Y = df.loc[:, Y_cols].astype(float)   # fails loudly if a target column is not numeric

X.to_csv("/Users/hornung_comp1/TCM_HEK293T_10uM_X.csv", index=False)
Y.to_csv("/Users/hornung_comp1/TCM_HEK293T_10uM_Y.csv", index=False)

print("Saved X and Y")
print("X shape:", X.shape)
print("Y shape:", Y.shape)


## TCM 20 µM

In [None]:
import pandas as pd

# Build the 20 µM TCM metadata table with SMILES by reusing the compound
# annotations already resolved for the 10 µM plate, matched on 'Treat'.

# ----------------------------------------
# 1. Load 10 µM meta with SMILES (lookup)
# ----------------------------------------
meta_10 = pd.read_csv(
    "/Users/hornung_comp1/meta_TCM_with_smiles_pubchem_parallel_5.csv"
)

# Keep one row per Treat with compound info + SMILES (first occurrence wins).
smiles_map = (
    meta_10[
        ['Treat', 'Compound name', 'Catalog Number', 'Catalog Number.1', 'smiles']
    ]
    .dropna(subset=['smiles'])
    .drop_duplicates(subset=['Treat'])
)

print("smiles_map shape:", smiles_map.shape)

# ----------------------------------------
# 2. Load 20 µM meta (no SMILES yet)
# ----------------------------------------
# header=1: column names are taken from the second row of the sheet.
# If the columns printed below look shifted, try header=0 (the pandas default).
meta_20 = pd.read_excel(
    "/Users/hornung_comp1/Downloads/TCM_Compounds_HEK293T_20_MetaData.xlsx",
    engine="openpyxl", header=1
)

print("meta_20 shape:", meta_20.shape)
print("meta_20 columns:", meta_20.columns.tolist())

# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if 'Treat' not in meta_20.columns:
    raise KeyError("Column 'Treat' not found in meta_20!")

# ----------------------------------------
# 3. Merge 20 µM meta with 10 µM SMILES via Treat
# ----------------------------------------
# Only bring over annotation columns meta_20 does not already have. Otherwise
# pandas would rename the clashing columns with _x/_y suffixes, and later code
# that looks up 'Compound name' / 'Catalog Number' by name would break.
# 'Treat' (join key) and 'smiles' (the value being added) are always taken.
lookup_cols = [
    c for c in smiles_map.columns
    if c in ('Treat', 'smiles') or c not in meta_20.columns
]

meta_20_with_smiles = meta_20.merge(
    smiles_map[lookup_cols],
    on='Treat',
    how='left'   # keep all 20 µM rows, add smiles where possible
)

print("Merged shape:", meta_20_with_smiles.shape)

# Optional sanity check: which Treats didn't get SMILES?
missing_smiles = meta_20_with_smiles.loc[
    meta_20_with_smiles['smiles'].isna(), 'Treat'
].unique()
print("Treats without SMILES (if any):", missing_smiles)

# ----------------------------------------
# 4. Save 20 µM meta with SMILES
# ----------------------------------------
out_path = "/Users/hornung_comp1/meta_TCM_20_with_smiles_pubchem_parallel.csv"
meta_20_with_smiles.to_csv(out_path, index=False)

print("Saved:", out_path)


In [None]:
import pandas as pd

import pandas as pd

# TCM 20 µM pipeline: load raw counts + metadata, average replicates per
# treatment ('Treat'), attach SMILES, normalise to log1p(CPM) and save one CSV.
import numpy as np  # needed for np.log1p in step 8; this cell previously imported only pandas

# ---- 1. Load counts table (Excel) ----
# header=1: column names are taken from the second row of the sheet.
counts = pd.read_excel(
    "/Users/hornung_comp1/Downloads/TCM_Compounds_HEK293T_20_Counts.xlsx",
    engine="openpyxl", header=1
)

# ---- 2. Load meta table (CSV) ----
meta = pd.read_csv(
    "/Users/hornung_comp1/meta_TCM_20_with_smiles_pubchem_parallel.csv",
)

# ---- 3. Attach metadata to every count row ----
# Left join: every count row is kept, even when no metadata row matches.
df = counts.merge(
    meta,
    on='Sample_unique_id',   # SAME column name in both files
    how='left'
)

# groupby() excludes rows whose key is NaN by default, so count rows without
# a 'Treat' label would vanish from the averages silently. Report them.
n_unlabelled = int(df['Treat'].isna().sum())
if n_unlabelled:
    print(
        f"Warning: {n_unlabelled} count rows have no 'Treat' value "
        "and are excluded from the per-treatment averages"
    )

# Every column NOT listed here is treated as a gene count column.
non_gene_cols = [
    'Sample_unique_id',
    'Treat',
    'Plate',
    'library',          # numeric but NOT a gene
    'Cell',
    'Dose',
    'Time',
    'Compound name',
    'Catalog Number',
    'Catalog Number.1',
    'smiles'
]

gene_cols = [c for c in df.columns if c not in non_gene_cols]

# Sanity check: any column listed here is non-numeric yet was classified as a
# gene; an empty Index is the expected output.
print(df[gene_cols].select_dtypes(exclude='number').columns)

# ---- 4. Average across replicates of the same treatment ----
avg_counts = (
    df
    .groupby('Treat')[gene_cols]
    .mean()
    .reset_index()
)

# ---- 5. Per-compound meta (only Treats with ≥1 SMILES) ----
smiles_per_treat = (
    df[['Treat', 'smiles', 'Compound name', 'Catalog Number', 'Catalog Number.1', 'Dose']]
    .dropna(subset=['smiles'])             # keep only rows where SMILES is present
    .drop_duplicates(subset=['Treat'])     # one meta row per Treat (first occurrence wins)
)

# Inner join: Treats with no SMILES in any replicate are dropped here.
avg_with_smiles = avg_counts.merge(
    smiles_per_treat,
    on='Treat',
    how='inner'
)

# ---- 6. Add numeric dose + seq_platform ----
# The original 'Dose' column is kept as-is (presumably a string like '20uM');
# the model gets a plain numeric dose instead.
avg_with_smiles['pert_idose'] = 20          # 20 µM
avg_with_smiles['seq_platform'] = 1         # arbitrary platform code

# ---- 7. Reorder columns: meta first, then genes ----
meta_first = [
    'Treat',            # group key
    'smiles',
    'Compound name',
    'Catalog Number',   # compound ID
    'Catalog Number.1', # CAS
    'Dose',             # original string dose
    'pert_idose',
    'seq_platform'
]

gene_cols_final = [c for c in avg_with_smiles.columns if c not in meta_first]

avg_with_smiles = avg_with_smiles[meta_first + gene_cols_final]

# ---- 8. Normalize gene counts to CPM → log1p ----
# Each row is divided by its own total count, then scaled to one million.
# NOTE(review): a row whose total is 0 would produce NaN here — confirm that
# cannot occur in this dataset.
cpm = (
    avg_with_smiles[gene_cols_final]
    .div(avg_with_smiles[gene_cols_final].sum(axis=1), axis=0)
    * 1e6
)

log1p_cpm = np.log1p(cpm)

# ---- 9. Keep only smiles, dose, platform + genes for ML ----
meta_for_ml = ['smiles', 'pert_idose', 'seq_platform']

# Both pieces share avg_with_smiles' index, so rows line up one-to-one.
final_df = pd.concat(
    [avg_with_smiles[meta_for_ml], log1p_cpm],
    axis=1
)

# ---- 10. Save one CSV (no X/Y split yet) ----
out_path = "/Users/hornung_comp1/TCM_HEK293T_20uM_averaged_log1pCPM.csv"
final_df.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print(f"Shape: {final_df.shape}")
print("Columns (first 10):", final_df.columns[:10].tolist())



In [None]:
import pandas as pd

# Split the averaged TCM 20 µM table into model inputs (X) and gene targets
# (Y), then write each part to its own CSV.

# Input feature columns; every other column is treated as a gene target.
X_cols = ['smiles', 'pert_idose', 'seq_platform']

df = pd.read_csv("/Users/hornung_comp1/TCM_HEK293T_20uM_averaged_log1pCPM.csv")

# Gene columns, in file order.
Y_cols = df.columns[~df.columns.isin(X_cols)].tolist()

X = df.loc[:, X_cols]
Y = df.loc[:, Y_cols].astype(float)   # fails loudly if a target column is not numeric

X.to_csv("/Users/hornung_comp1/TCM_HEK293T_20uM_X.csv", index=False)
Y.to_csv("/Users/hornung_comp1/TCM_HEK293T_20uM_Y.csv", index=False)

print("Saved X and Y")
print("X shape:", X.shape)
print("Y shape:", Y.shape)


## Concatenate X and Y across datasets

In [None]:
import pandas as pd

def concat_X_files(paths, save_path):
    """Stack several X (feature) CSV files row-wise and write the result.

    Args:
        paths: CSV file paths, read and stacked in the given order.
        save_path: Destination CSV path for the stacked table.

    Returns:
        The concatenated DataFrame, re-indexed from 0.
    """
    frames = []
    for csv_path in paths:
        print("Loading:", csv_path)
        frames.append(pd.read_csv(csv_path))

    stacked = pd.concat(frames, axis=0, ignore_index=True)
    stacked.to_csv(save_path, index=False)

    print("Saved X:", save_path)
    print("Final X shape:", stacked.shape)
    return stacked


# Combined feature table for all HEK293T datasets.
X_out = "/Users/hornung_comp1/ALL_HEK293T_X.csv"

# Stacking order: TCM 10 µM, TCM 20 µM, MCE 10 µM. Keep this in the same
# dataset order as Y_paths so that X and Y rows stay aligned.
X_paths = [
    "/Users/hornung_comp1/TCM_HEK293T_10uM_X.csv",
    "/Users/hornung_comp1/TCM_HEK293T_20uM_X.csv",
    "/Users/hornung_comp1/MCE_HEK293T_10uM_X.csv",
]

X_all = concat_X_files(paths=X_paths, save_path=X_out)


def concat_Y_files(paths, save_path):
    """Stack several Y (gene target) CSV files row-wise, cast to float, and save.

    Columns are aligned by name. A column missing from one input is filled
    with NaN for that input's rows; a warning is printed for each such file so
    that differing gene sets between datasets do not go unnoticed.

    Args:
        paths: CSV file paths, read and stacked in the given order.
        save_path: Destination CSV path for the stacked table.

    Returns:
        The concatenated float DataFrame, re-indexed from 0.

    Raises:
        ValueError: If a column cannot be converted to float, or if `paths`
            is empty (raised by pandas).
    """
    loaded = []  # (path, frame) pairs, kept together for the column report below
    for csv_path in paths:
        print("Loading:", csv_path)
        loaded.append((csv_path, pd.read_csv(csv_path)))

    Y_all = pd.concat([frame for _, frame in loaded], axis=0, ignore_index=True)

    # pd.concat takes the union of all column names; flag every input that
    # lacks some of them, since its rows are NaN in those columns.
    for csv_path, frame in loaded:
        absent = Y_all.columns.difference(frame.columns)
        if len(absent):
            print(
                f"Warning: {csv_path} lacks {len(absent)} column(s) present in "
                "other files; those entries are NaN"
            )

    # ensure float
    Y_all = Y_all.astype(float)

    Y_all.to_csv(save_path, index=False)
    print("Saved Y:", save_path)
    print("Final Y shape:", Y_all.shape)
    return Y_all


# Combined gene-target table for all HEK293T datasets.
Y_out = "/Users/hornung_comp1/ALL_HEK293T_Y.csv"

# Stacking order: TCM 10 µM, TCM 20 µM, MCE 10 µM. Keep this in the same
# dataset order as X_paths so that X and Y rows stay aligned.
Y_paths = [
    "/Users/hornung_comp1/TCM_HEK293T_10uM_Y.csv",
    "/Users/hornung_comp1/TCM_HEK293T_20uM_Y.csv",
    "/Users/hornung_comp1/MCE_HEK293T_10uM_Y.csv",
]

Y_all = concat_Y_files(paths=Y_paths, save_path=Y_out)
