In [None]:
import pandas as pd # read Dataframe 
import numpy as np # for numerical operations
from sklearn.preprocessing import StandardScaler # to normalize data

In [None]:
# Load the TUGDA GDSC table. The path is written with single separators so it
# is spelled the same way everywhere this file is read in the notebook.
gdsc_dataset = pd.read_csv('./data/TUGDA/GDSCDA_fpkm_AUC_all_drugs.zip', index_col=0)
# Every column from position 1780 onwards is taken as a drug name.
# NOTE(review): 1780 is a hard-coded split point -- confirm it matches the
# number of gene-expression columns in the file.
drug_list = gdsc_dataset.columns[1780:]

# Load final lv2 edges dataframe from Network_Construction.py
lvl2_edges_df = pd.read_csv('./data/lv2_edges.csv', index_col=0)

### 1. Count-based

- How many genes of a drug occur in a specific pathway
- Example: Drug A has 3 genes in pathway X, then pathway X would be assigned a value of 3

In [None]:
# Tally, for every (source, drug) pair, how many distinct target genes link them
_unique_targets = lvl2_edges_df.groupby(['source', 'drug'])['target'].nunique()
source_drug_gene_counts = _unique_targets.reset_index(name='gene_count')

# Reshape into a drug x source matrix, align the rows with drug_list and give
# every pair that has no edge a count of 0
gene_count_pivot = (
    source_drug_gene_counts
    .pivot(index='drug', columns='source', values='gene_count')
    .reindex(index=drug_list)
    .fillna(0)
)

# log(1 + x) keeps the zero counts finite and dampens skew; afterwards each
# column is standardised (z-score) independently
gene_count_log = np.log1p(gene_count_pivot)
scaler = StandardScaler()
_scaled_counts = scaler.fit_transform(gene_count_log)
gene_count_zscore = pd.DataFrame(
    _scaled_counts,
    index=gene_count_log.index,
    columns=gene_count_log.columns,
)

gene_count_zscore.to_csv("TUGDA/data/Pathways/gene_count_zscore.csv")

### 2. Relative frequency

- What fraction of a drug's genes occurs in a specific pathway
- Example: Drug A has 3 genes in pathway X and a total of 6 genes in all pathways, then pathway X would be assigned a value of 3/6.

In [None]:
# Distinct target genes for each (source, drug) pair
_pair_targets = lvl2_edges_df.groupby(['source', 'drug'])['target'].nunique()
source_drug_gene_counts = _pair_targets.reset_index(name='gene_count')

# Distinct target genes of each drug, taken over all sources together
_drug_targets = lvl2_edges_df.groupby('drug')['target'].nunique()
total_gene_counts_per_drug = _drug_targets.reset_index(name='total_genes_per_drug')

# Attach each drug's total to its per-source rows, then take the ratio
relative_freq_df = source_drug_gene_counts.merge(total_gene_counts_per_drug, on='drug')
relative_freq_df['relative_frequency'] = relative_freq_df['gene_count'].div(
    relative_freq_df['total_genes_per_drug']
)

# Drug x pathway matrix aligned with drug_list; pairs without any edge
# (and drugs without any edge at all) become 0 instead of NaN
pivot_matrix = (
    relative_freq_df
    .pivot(index='drug', columns='source', values='relative_frequency')
    .reindex(index=drug_list)
    .fillna(0)
)
pivot_matrix.to_csv("TUGDA/data/Pathways/gene_frequency.csv")

### 3. Expression-based weighting

- For each drug-pathway combination, sum the mean expression values of all genes that occur in that combination
- Higher expression = higher weight for the gene in the pathway

In [None]:
# Cell-line dataset: average each of the first 1780 columns over all rows,
# giving one mean value per column (used below as the per-gene weight).
gdsc_dataset = pd.read_csv('./data/TUGDA/GDSCDA_fpkm_AUC_all_drugs.zip', index_col=0)
gene_expression = gdsc_dataset.iloc[:, :1780].mean()

# Add the weights based on gene expression to lvl2_edges_df.
# Targets without a matching column in the expression table map to NaN; the
# sum below skips them, so they contribute nothing to the pathway weight.
lvl2_edges_df['expression_weight'] = lvl2_edges_df['target'].map(gene_expression)

# Count every gene at most once per drug-pathway pair. The count-based and
# relative-frequency features use nunique() on the same edges, so a repeated
# (drug, source, target) row must not inflate the summed weight here either.
unique_edges = lvl2_edges_df.drop_duplicates(subset=['drug', 'source', 'target'])
agg_weights = unique_edges.groupby(['drug', 'source'])['expression_weight'].sum().reset_index()

# Create pivot table: drugs as rows, pathways as columns
agg_weights = agg_weights.pivot(index='drug', columns='source', values='expression_weight')
agg_weights = agg_weights.reindex(index=drug_list)
agg_weights = agg_weights.fillna(0)

# Apply log transformation (log(x + 1)) to handle zero values and reduce skewness
# NOTE(review): log1p needs sums greater than -1; this assumes the mean
# expression values are non-negative -- confirm for this dataset.
agg_weights_log = np.log1p(agg_weights)

# Perform Z-score normalization for each column.
# A scaler is created here so the cell does not rely on one left behind by
# another cell; fit_transform refits from scratch, so the output is unchanged.
scaler = StandardScaler()
agg_weights_zscore = pd.DataFrame(
    scaler.fit_transform(agg_weights_log),
    index=agg_weights_log.index,
    columns=agg_weights_log.columns
)

agg_weights_zscore.to_csv("TUGDA/data/Pathways/pathway_weights_zscore.csv")