In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %cd '/content/drive/MyDrive/Bioinfo_Project'

/content/drive/MyDrive/Bioinfo_Project


In [1]:
import pandas as pd

In [2]:
import pandas as pd
# import polars as pl
import numpy as np
from scipy.stats import pearsonr

# Read only a five-row preview of each expression table: enough to learn the
# column names and inferred dtypes without loading the full files.
prev_disease = pd.read_csv("df_d_final.csv", nrows=5)
prev_control = pd.read_csv("df_c_final.csv", nrows=5)

# The first column of each table is used as the gene-identifier column.
gene_col_d, gene_col_c = prev_disease.columns[0], prev_control.columns[0]

# Columns that pandas inferred as numeric in the preview; these are the
# columns averaged per gene further down.
numeric_cols_d, numeric_cols_c = (
    preview.select_dtypes(include=["number"]).columns.tolist()
    for preview in (prev_disease, prev_control)
)

print("Disease numeric cols:", len(numeric_cols_d))
print("Control numeric cols:", len(numeric_cols_c))

Disease numeric cols: 147
Control numeric cols: 173


Mean expression per gene for disease

In [3]:
import polars as pl

In [4]:
# disease_mean = {}

# for chunk in pd.read_csv(
#     "df_d_final.csv",
#     usecols=[gene_col_d] + numeric_cols_d,
#     index_col=gene_col_d,
#     chunksize=200_000
# ):
#     m = chunk[numeric_cols_d].mean(axis=1)
#     disease_mean.update(m.to_dict())

# print("Disease mean genes:", len(disease_mean))
# 1. Load the entire dataframe at once

# df = pd.read_csv(
#     "df_d_final.csv",
#     usecols=[gene_col_d] + numeric_cols_d,
#     index_col=gene_col_d
# )

# # 2. Calculate the mean across the rows (axis=1)
# # This creates a pandas Series
# mean_series = df[numeric_cols_d].mean(axis=1)

# # 3. Convert to dictionary
# disease_mean = mean_series.to_dict()

# print("Disease mean genes:", len(disease_mean))

import polars as pl

# Load the disease table eagerly, restricted to the gene column plus the
# numeric columns found in the preview (polars takes `columns`, where pandas
# takes `usecols`).
disease_columns = [gene_col_d, *numeric_cols_d]
df = pl.read_csv("df_d_final.csv", columns=disease_columns)

# Row-wise mean over the numeric columns, kept next to the gene identifier.
row_mean = pl.mean_horizontal(numeric_cols_d).alias("mean_value")
result = df.select([pl.col(gene_col_d), row_mean])

# gene -> mean value lookup
disease_mean = {
    gene: mean
    for gene, mean in zip(result[gene_col_d], result["mean_value"])
}

print("Disease mean genes:", len(disease_mean))

Disease mean genes: 644


Mean expression per gene for control

In [5]:
# control_mean = {}

# for chunk in pd.read_csv(
#     "df_control_final_first10.csv",
#     usecols=[gene_col_c] + numeric_cols_c,
#     index_col=gene_col_c,
#     chunksize=200_000
# ):
#     m = chunk[numeric_cols_c].mean(axis=1)
#     control_mean.update(m.to_dict())

# print("Control mean genes:", len(control_mean))

# Load the control table eagerly, restricted to the gene column plus the
# numeric columns found in the preview.
control_columns = [gene_col_c, *numeric_cols_c]
df = pl.read_csv("df_c_final.csv", columns=control_columns)

# Row-wise mean over the numeric columns, kept next to the gene identifier.
row_mean = pl.mean_horizontal(numeric_cols_c).alias("mean_value")
result = df.select([pl.col(gene_col_c), row_mean])

# gene -> mean value lookup
control_mean = {
    gene: mean
    for gene, mean in zip(result[gene_col_c], result["mean_value"])
}

print("Control mean genes:", len(control_mean))


Control mean genes: 644


Building the disease signature (Disease Mean - Control Mean)

In [6]:
# Disease signature: per-gene difference (disease mean - control mean).
#
# A difference is only defined for genes that have a mean in BOTH groups.
# Substituting 0 for a missing group would report the raw mean (or its
# negative) as if it were a disease effect, and a None mean would raise a
# TypeError on subtraction, so such genes are skipped and counted instead.
all_genes = set(disease_mean.keys()).union(set(control_mean.keys()))

disease_signature = {}
skipped_genes = []
for g in all_genes:
    d_mean = disease_mean.get(g)  # None when the gene is absent or its mean is None
    c_mean = control_mean.get(g)
    if d_mean is None or c_mean is None:
        skipped_genes.append(g)
        continue
    disease_signature[g] = d_mean - c_mean

print("Disease signature genes:", len(disease_signature))
if skipped_genes:
    print("Genes skipped (no mean in both groups):", len(skipped_genes))

Disease signature genes: 644


Load the drug matrix and score each drug by similarity to the disease signature

In [9]:
# Load the whole drug matrix into memory and report its (rows, columns) shape.
drug_matrix_path = "drugs_normalized.csv"
df_drug = pl.read_csv(drug_matrix_path)
n_drug_rows, n_drug_cols = df_drug.shape
print("Drug shape:", (n_drug_rows, n_drug_cols))

Drug shape: (652, 173014)


In [12]:
# def score_drug(drug_profile, disease_signature_dict):
#     common_genes = list(set(drug_profile.index) & set(disease_signature_dict.keys()))

#     if len(common_genes) < 3:
#         return np.nan

#     x = drug_profile.loc[common_genes].values
#     y = np.array([disease_signature_dict[g] for g in common_genes])

#     if np.std(x) == 0 or np.std(y) == 0:
#         return np.nan

#     score, _ = pearsonr(x, y)
#     return score


# drug_results = []
# for drug_col in df_drug.columns:
#     score = score_drug(df_drug[drug_col], disease_signature)
#     drug_results.append((drug_col, score))

# drug_scores_df = pd.DataFrame(drug_results, columns=["Drug", "RestorationScore"])
# drug_scores_df = drug_scores_df.sort_values("RestorationScore", ascending=False)

# drug_scores_df.head(20)

In [11]:
def score_drug_polars(drug_profile_series, disease_signature_dict, gene_names=None, min_genes=3):
    """Pearson correlation between one drug profile and the disease signature.

    The two vectors are aligned on the genes they share, so the drug values
    need gene labels. Provide them in one of two ways:

    * ``gene_names``: an iterable giving the gene for each position of
      ``drug_profile_series`` (for example the "Row" column of the drug
      matrix); or
    * pass ``drug_profile_series`` itself as a ``dict`` mapping gene -> value.

    Parameters
    ----------
    drug_profile_series : iterable of numbers, or dict
        Expression values of one drug (any iterable, e.g. a polars Series or
        a list), or a gene -> value dict.
    disease_signature_dict : dict
        Mapping gene -> signature value.
    gene_names : iterable, optional
        Gene label for each value in ``drug_profile_series``. If a gene
        occurs more than once, its last value is used.
    min_genes : int, default 3
        Minimum number of shared genes required to compute a score.

    Returns
    -------
    float
        Pearson r over the shared genes, or ``np.nan`` when no gene labels
        are available, fewer than ``min_genes`` genes are shared, or either
        aligned vector is constant.

    Raises
    ------
    ValueError
        If ``gene_names`` and ``drug_profile_series`` differ in length.
    """
    if gene_names is not None:
        genes = list(gene_names)
        values = list(drug_profile_series)
        if len(genes) != len(values):
            raise ValueError(
                f"gene_names has {len(genes)} entries but the drug profile has {len(values)}"
            )
        profile = dict(zip(genes, values))
    elif isinstance(drug_profile_series, dict):
        profile = drug_profile_series
    else:
        # A bare sequence of expression values carries no gene labels, so
        # there is nothing to align on.
        return np.nan

    # Iterate in profile order (not set order) so the result is deterministic.
    # Missing (None) drug values are dropped rather than poisoning the arrays.
    pairs = [
        (value, disease_signature_dict[gene])
        for gene, value in profile.items()
        if gene in disease_signature_dict and value is not None
    ]

    if len(pairs) < min_genes:
        return np.nan

    x = np.array([drug_value for drug_value, _ in pairs], dtype=float)
    y = np.array([signature_value for _, signature_value in pairs], dtype=float)

    # Pearson r is undefined when either vector has zero variance.
    if np.std(x) == 0 or np.std(y) == 0:
        return np.nan

    score, _ = pearsonr(x, y)
    return score


# Gene identifiers are in the first column ("Row"); every other column is one
# drug profile.
gene_col = df_drug["Row"]

# The gene column is shared by every drug column, so the overlap with the
# disease signature is loop-invariant. Resolve it once here instead of
# rebuilding a gene -> value dict and a set intersection for each drug column.
gene_to_row = {gene: row for row, gene in enumerate(gene_col)}  # duplicate genes: last row wins, as with dict(zip(...))
common_genes = [gene for gene in gene_to_row if gene in disease_signature]
common_rows = np.array([gene_to_row[gene] for gene in common_genes], dtype=int)
y = np.array([disease_signature[gene] for gene in common_genes], dtype=float)

# With fewer than 3 shared genes, or a constant signature, no drug can be scored.
can_score = len(common_genes) >= 3 and np.std(y) != 0

drug_results = []
for col_name in df_drug.columns[1:]:  # Skip the "Row" column
    score = np.nan
    if can_score:
        x = df_drug[col_name].to_numpy()[common_rows].astype(float)
        # Pearson r is undefined for a constant profile; profiles containing
        # missing / non-finite values are also left as NaN.
        if np.all(np.isfinite(x)) and np.std(x) != 0:
            score, _ = pearsonr(x, y)
    drug_results.append((col_name, score))

drug_scores_df = pd.DataFrame(drug_results, columns=["Drug", "RestorationScore"])
drug_scores_df = drug_scores_df.sort_values("RestorationScore", ascending=False)

drug_scores_df.head(20)

Unnamed: 0,Drug,RestorationScore
56016,CPC011_PC3_24H:BRD-A34817987-001-24-1:10,0.806962
2270,CPC002_HCC515_24H:BRD-K49671696-045-03-6:10,0.778192
2443,CPC002_HCC515_6H:BRD-K48367671-001-01-8:10,0.773285
34219,CPC018_HT29_6H:BRD-K50398167-236-12-8:10,0.691024
2248,CPC002_HCC515_24H:BRD-K40578143-001-01-8:10,0.689708
45524,NMH002_FIBRNPC_24H:BRD-K53263234-001-04-5:10,0.684642
9738,CPC005_A549_6H:BRD-A65440446-001-03-3:10,0.684067
161548,REP.A024_HT29_24H:D12,0.680732
18502,CPC006_PC3_24H:BRD-K32330832-001-01-0:11.1,0.666923
72954,HOG003_MCF7_24H:BRD-K01976263-003-04-5:0.0015,0.660546


Sort by restoration score (descending)

In [15]:
# Keep the 999 highest-scoring drug profiles, renumbered from 0.
ranked_scores = drug_scores_df.sort_values(by='RestorationScore', ascending=False)
top_drugs = ranked_scores.head(999).reset_index(drop=True)

def parse_drug_name(drug_name):
    """Split a drug column label into ``(compound, cell_line, time, well)``.

    Labels look like ``"CPC011_PC3_24H:BRD-A34817987-001-24-1:10"``: an
    underscore-separated prefix followed by colon-separated fields.

    * ``compound`` is the first prefix token and ``time`` the last.
    * ``cell_line`` is whatever sits between them (re-joined with ``_`` if the
      prefix has more than three tokens).
    * ``well`` is the first field after the prefix, or ``''`` if there is none.
      Any further colon-separated field (the trailing ``10`` above) is ignored.

    A prefix with fewer than three tokens is padded with empty strings rather
    than raising, so one oddly shaped label cannot abort a whole ``.apply``.

    Returns
    -------
    tuple of str
        ``(compound, cell_line, time, well)``
    """
    prefix, _, remainder = drug_name.partition(':')
    well = remainder.split(':', 1)[0]

    tokens = prefix.split('_')
    if len(tokens) < 3:
        tokens = tokens + [''] * (3 - len(tokens))

    compound = tokens[0]
    time = tokens[-1]
    cell_line = '_'.join(tokens[1:-1])
    return compound, cell_line, time, well

# Expand each drug label into its four parsed fields, one column per field.
parsed_field_names = ['Compound', 'CellLine', 'Time', 'Well']
parsed_fields = top_drugs['Drug'].apply(lambda drug_id: pd.Series(parse_drug_name(drug_id)))
top_drugs[parsed_field_names] = parsed_fields

# Keep only the parsed fields and the score; the raw label is dropped.
top_drugs = top_drugs[parsed_field_names + ['RestorationScore']]

print(top_drugs)

# Write the table to disk without the index column.
top_drugs.to_csv('top_drugs.csv', index=False)

print("Results saved to top_drugs.csv")

    Compound CellLine Time                    Well  RestorationScore
0     CPC011      PC3  24H  BRD-A34817987-001-24-1          0.806962
1     CPC002   HCC515  24H  BRD-K49671696-045-03-6          0.778192
2     CPC002   HCC515   6H  BRD-K48367671-001-01-8          0.773285
3     CPC018     HT29   6H  BRD-K50398167-236-12-8          0.691024
4     CPC002   HCC515  24H  BRD-K40578143-001-01-8          0.689708
..       ...      ...  ...                     ...               ...
994   CPC019     MCF7   6H  BRD-A14985772-001-02-5         -0.007387
995   DEB001     MCF7   6H  BRD-A75409952-001-05-7         -0.007479
996   CPC017     A549   6H  BRD-A38898897-003-01-0         -0.007498
997   CPC004     HA1E   6H  BRD-A65671304-001-03-4         -0.007599
998   CPC004      PC3  24H  BRD-K29950728-048-08-3         -0.007814

[999 rows x 5 columns]
Results saved to top_drugs.csv
