In [1]:
import anndata as ad
import pandas as pd
import numpy as np

In [2]:
# Load the DE label table; later cells take the gene column list and the
# final join target from this frame.
kaggle_train_de_df = pd.read_parquet(path='data/de_train.parquet')

In [3]:
# Gene columns = every column of the DE label table that is not metadata.
# Built with an ordered scan rather than a set difference: iterating a set of
# strings gives an order that can change from one interpreter run to the next
# (string hash randomisation), and `genes` determines the column order of the
# frames computed and saved further down. Scanning the columns keeps the
# order of the label table itself, so reruns produce identical layouts.
non_gene_columns = {"cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"}
genes = [col for col in kaggle_train_de_df.columns if col not in non_gene_columns]

In [4]:
# Display how many gene columns were extracted from the DE label table.
len(genes)

18211

In [5]:
# adata_test_df = pd.read_parquet('data/df_expr_predict.all_donors.parquet')
# genes = list(set(adata_test_df.columns.tolist()) - set(['donor_id','sm_name', 'cell_type']))

In [6]:
# Read the training table from disk into `adata_train_df`.
adata_train_df = pd.read_parquet(path='data/adata_train.parquet')

In [7]:
# The `normalized_count` column is not used by any later cell; discard it.
adata_train_df = adata_train_df.drop(columns="normalized_count")

In [8]:
# Keep only the observation id and the three grouping fields from the
# metadata CSV; `obs_id` is the join key used by the merge that follows.
meta_data_columns = ['obs_id', 'donor_id', 'cell_type', 'sm_name']
adata_obs_meta_df = pd.read_csv('data/adata_obs_meta.csv').loc[:, meta_data_columns]

In [9]:
# Attach donor / cell type / sm_name to every training row via its obs_id
# (default inner join: rows without matching metadata are dropped).
adata_train_df = adata_train_df.merge(adata_obs_meta_df, on="obs_id")

In [None]:
# sorted(adata_train_df.sm_name.unique())

In [None]:
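# One-off export of the merged frame (kept disabled); In [10] streams this same file back in row-group chunks.
# adata_train_df.to_parquet("data/adata_merged.parquet")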

In [10]:
import os

import pyarrow.parquet as pq

# Aggregate the merged table one parquet row group at a time so the whole
# file never has to be grouped in memory at once.
merged_path = "data/adata_merged.parquet"

# No other live cell writes this file (the only export of the merged frame
# is commented out), so on a fresh checkout the read below would fail.
# Materialise it from the in-memory merged frame when it is missing; an
# existing file is left untouched.
if not os.path.exists(merged_path):
    adata_train_df.to_parquet(merged_path)

parquet_file = pq.ParquetFile(merged_path)
result_chunks = []
for i in range(parquet_file.num_row_groups):
    chunk = parquet_file.read_row_group(i).to_pandas()
    # Partial sums only: the same (donor, cell type, sm_name, gene) key can
    # occur in more than one row group, so these are not yet final totals.
    result_chunk = chunk.groupby(["donor_id", "cell_type", "sm_name", "gene"]).sum()
    result_chunks.append(result_chunk)

In [11]:
# Stack the per-row-group partial sums into a single frame.
adata_train_df = pd.concat(objs=result_chunks)

In [12]:
# adata_train_df.to_parquet('data/filtered_adata_train.parquet', engine='pyarrow')

In [13]:
# Collapse the stacked partial sums so each key combination appears once.
group_keys = ["donor_id", "cell_type", "sm_name", "gene"]
adata_train_df = adata_train_df.groupby(by=group_keys).sum()

In [14]:
# obs_id is not a grouping key, so it only survives as an aggregated column
# with no further use; remove it.
adata_train_df = adata_train_df.drop(columns="obs_id")

In [15]:
# Turn the group keys (index levels after the groupby) back into columns.
adata_train_df = adata_train_df.reset_index()

In [16]:
# Checkpoint the aggregated long-format table.
adata_train_df.to_parquet(path='data/filtered_adata_train.parquet', engine='pyarrow')

In [17]:
# Reshape long -> wide: one row per (donor_id, cell_type, sm_name), one
# column per gene, each cell holding that gene's `count`.
sample_index = ["donor_id", "cell_type", "sm_name"]
pivot_df = adata_train_df.pivot(index=sample_index, columns='gene', values='count')


In [18]:
# Columns present in the pivoted table but absent from the DE gene list.
pivot_columns = set(pivot_df.columns.tolist())
extra_genes = list(pivot_columns - set(genes))

In [19]:
# Restrict the wide table to genes that also appear in the DE labels.
pivot_df = pivot_df.drop(columns=extra_genes)

In [21]:
# (sample, gene) pairs missing from the long table come out of the pivot as
# NaN; treat them as a count of zero.
pivot_df = pivot_df.fillna(value=0)

In [22]:
# Move donor_id / cell_type / sm_name from the index into regular columns.
pivot_df = pivot_df.reset_index()

In [23]:
# Checkpoint the wide per-sample count table.
pivot_df.to_parquet(path='data/bulk_adata_train.parquet', engine='pyarrow')

In [79]:
# Reload the wide count table from its checkpoint, so the steps below start
# from the saved state.
pivot_df = pd.read_parquet(path='data/bulk_adata_train.parquet')

In [80]:
# 1. log normalization
# Every row (one donor_id / cell_type / sm_name combination) is rescaled to
# fractions of its own total count and passed through log(1 + x).
# Written as a single vectorised expression: the earlier row-wise `apply`
# called a Python lambda once per row across all gene columns, and `log1p`
# is more accurate than `log(x + 1)` for the small fractions produced here.
# NOTE(review): no scale factor is applied before the log — confirm that is
# intended.
gene_counts = pivot_df[genes]
row_totals = gene_counts.sum(axis=1)
pivot_df[genes] = np.log1p(gene_counts.div(row_totals, axis=0))

In [81]:
# 2. neg control subtraction
neg_ctr = "Dimethyl Sulfoxide"
neg_df = pivot_df[pivot_df['sm_name'] == neg_ctr]

# Names the control copies of the gene columns receive from the merge suffix.
new_gene_names = [g + "_neg" for g in genes]

# Attach the control profile of the same donor and cell type to every row.
# The subtraction that consumes `merged_df` works purely by row position, so
# `merged_df` must contain exactly pivot_df's rows in pivot_df's order:
#   * how="left" keeps every left row, in left order (the default inner join
#     drops rows that have no control and does not guarantee left order on
#     older pandas releases);
#   * validate="many_to_one" raises MergeError if a (donor_id, cell_type)
#     pair has more than one control row, which would duplicate left rows.
# Rows without a control get NaN in the "_neg" columns instead of shifting
# every row after them.
merged_df = pivot_df.merge(
    neg_df,
    on=["donor_id", "cell_type"],
    how="left",
    suffixes=('', '_neg'),
    validate="many_to_one",
)

In [82]:
# The subtraction below pairs row i of `pivot_df` with row i of `merged_df`.
# Verify that pairing explicitly: a merge that dropped, duplicated or
# reordered rows would otherwise subtract the wrong control silently.
sample_keys = ["donor_id", "cell_type", "sm_name"]
if len(merged_df) != len(pivot_df) or not (
    merged_df[sample_keys].to_numpy() == pivot_df[sample_keys].to_numpy()
).all():
    raise ValueError(
        "merged_df rows are not aligned with pivot_df; "
        "cannot subtract negative controls by position"
    )

# Assign the raw array rather than wrapping it in a new DataFrame: a wrapped
# frame carries a fresh 0..n-1 index and pandas would align on it, turning
# rows into NaN whenever pivot_df's index is anything else.
pivot_df[genes] = pivot_df[genes].to_numpy() - merged_df[new_gene_names].to_numpy()

In [84]:
# Persist the control rows, remove them from the working frame, and free the
# merge result, which is no longer needed.
neg_df.to_parquet(path='data/neg.parquet', engine='pyarrow')
pivot_df = pivot_df.loc[pivot_df["sm_name"].ne(neg_ctr)]
del merged_df

In [85]:
# Average the gene columns over all rows sharing a (cell_type, sm_name)
# pair; donor_id is not a key here, so donors are averaged together.
per_condition = pivot_df.groupby(by=["cell_type", "sm_name"])[genes]
pivot_df = per_condition.mean().reset_index()

In [86]:
# Save the per-(cell_type, sm_name) averaged table.
pivot_df.to_parquet(path='data/ddd.parquet', engine='pyarrow')

In [87]:
# NOTE(review): the inline "(255, 54638)" is a stale note; the output
# recorded for this cell is (614, 18213).
pivot_df.shape #(255, 54638)

(614, 18213)

In [88]:
# Join the averaged table with the DE labels on (cell_type, sm_name). Gene
# columns exist on both sides, so each appears twice: "_d2" from this
# notebook's table, "_de" from the label table.
de_columns = genes + ["cell_type", "sm_name"]
ddde = pivot_df.merge(
    kaggle_train_de_df[de_columns],
    on=["cell_type", "sm_name"],
    how="inner",
    suffixes=('_d2', '_de'),
)

In [89]:
# Save the joined table (computed values alongside DE labels).
ddde.to_parquet(path='data/ddde.parquet', engine='pyarrow')

In [90]:
# Sanity check: expect the two key columns plus two suffixed copies of every
# gene column, i.e. 2 + 2 * len(genes) columns.
ddde.shape

(614, 36424)