# Titel

Explanation ...

# 1. Drug-Gene-Interactions (DGI)

Which databases and how to download each file

In [None]:
!pip install chembl_webresource_client

In [2]:
import pandas as pd

# BioGrid
import requests # to get UniProt Id
import re
import numpy as np

# ChEMBL 
from concurrent.futures import ThreadPoolExecutor, as_completed
# from chembl_webresource_client.new_client import new_client  # type: ignore
import time
from tqdm import tqdm
import requests

In [None]:
# cell-line and drug sample dataset of TUGDA (GDSC)
# gdsc_dataset = pd.read_csv('/Users/tm03/Desktop/TUGDA_1/data/GDSCDA_fpkm_AUC_all_drugs.zip', index_col=0)
gdsc_dataset = pd.read_csv('/sybig/home/tmu/TUGDA/data/GDSCDA_fpkm_AUC_all_drugs.zip', index_col=0)
gene_list = gdsc_dataset.columns[0:1780]
drug_list = gdsc_dataset.columns[1780:]

# to convert the same output as in dgidb
drug_list_upper = set(drug_list.str.upper())

### 1.1 DGIdb

Data Source: https://dgidb.org/downloads 
- under downloads: Date: latest and interactions.tsv

<u>Gene and Drug related columns:</u>
- **gene_claim_name**: name of the gene as reported by the source database
- **gene_concept_id**: standardized gene ID, typically from the HGNC (HUGO Gene Nomenclature Committee)
- **gene_name**: official gene name (usually the same as **gene_claim_name**, but standardized)
- **drug_claim_name**: the name of the drug as reported by the source database
- **drug_concept_id**:  A standardized drug ID, typically from the HGNC (HUGO Gene Nomenclature Committee)
- **drug_name**: official drug name (usually the same as **drug_claim_name**, but standardized)

<u>Source and Interaction related columns:</u>
- **interaction_source_db_name**: name of the database providing the interaction data (DTC: Drug Target Commons)
- **interaction_source_db_version**: elase date of the database 
- **interaction_type**: type of biological interaction 
- **interaction_score**: numerical score representing the strength or reliability of the interaction

<u>Clinical categories:</u>
- **approved**: Is the drug approved by a regulatory agency?
- **immunitherapy**: Is it used in immuntherapy?
- **anti_neoplastic**: Is it an anti-cancer-drug (used against neoplasms?)

In [5]:
dgidb_df = pd.read_csv("raw_data/interactions.tsv", sep="\t")

# Filters DGIdb drug-gene interactions matching the gene and drug list of TUGDA
dgidb_filtered = dgidb_df[dgidb_df['gene_name'].isin(gene_list) & dgidb_df['drug_name'].isin(drug_list_upper)].copy()

# Map original drug name to uppercase
drug_name_map = {drug.upper(): drug for drug in drug_list}
dgidb_filtered['drug_name'] = dgidb_filtered['drug_name'].str.upper().map(drug_name_map)

# remove duplicates (n=337)
dgidb_final = dgidb_filtered[['gene_name', 'drug_name']].drop_duplicates()
# Final: 1516 Interactions

NameError: name 'gene_list' is not defined

In [14]:
dgidb_final.to_csv("./results/DGI/dgidb.csv", index=0)

### 1.2 ChEMBL

#### preprocess

In [None]:
# Load any file in folder /data_TUGDA directory (e.g. cl_y_test_o_k1.csv)
# extract all drug names (expected: 200)
# convert drug names to ChEMBL IDs using PubChem's ID exchange service: https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi

drug_list_df = drug_list.tolist()
drug_list_df = pd.DataFrame(drug_list)

In [None]:
drug_list_df.to_csv('./data/DGI/ChEMBL/drug_list.csv', header=False, index=False)

In [32]:
# Load the output file from PubChem's ID Exchange: maps drug names to ChEMBL IDs
drug2chembl = pd.read_csv('./data/DGI/ChEMBL/drug2chembl.txt', sep='\t', names=['drug_name', 'chembl_id'], header=None)

# Identify duplicate entries caused by drug name synonyms or multiple ChEMBL IDs (can happen due to synonyms and records keys in ChEMBL)
duplicates = drug2chembl[drug2chembl.duplicated('drug_name', keep=False)]

# Manually resolve duplicates for specific drugs by selecting the correct ChEMBL ID
allowed_pairs = {
    'AZD6738': 'CHEMBL4285417',
    'BMS-345541': 'CHEMBL249697',
    'EPZ5676': 'CHEMBL3414626',
    'Linsitinib': 'CHEMBL1091644',
    'Luminespib': 'CHEMBL252164',
    'NVP-ADW742': 'CHEMBL399021',
    'OSI-027': 'CHEMBL3120215',
    'Obatoclax Mesylate': 'CHEMBL2107358',
}

# Keep only the manually verified ChEMBL ID for each listed drug
def keep_entry(row):
    drug = row['drug_name']
    chembl = row['chembl_id']
    if drug in allowed_pairs:
        return chembl == allowed_pairs[drug]
    return True 

drug2chembl = drug2chembl[drug2chembl.apply(keep_entry, axis=1)]

In [None]:
# Identify drugs without assigned ChEMBL ID 
non_found_chembl = drug2chembl[drug2chembl['chembl_id'].isna()]

# Manually search for ChEMBL IDs or known synonyms (based on external sources or manual curation)
non_found_chembl_search = [
    ["Oxaliplatin", "CHEMBL414804"], # directly in ChEMBL
    ["Nutlin-3a (-)", "CHEMBL191334"], # directly in ChEMBL
    ["Cisplatin", "CHEMBL11359"], # directly in ChEMBL
    ["BPD-00008900", None],
    ["BDP-00009066", None],
    ["JAK1_8709", None],
    ["IRAK4_4710", None],
    ["Podophyllotoxin bromide", "CHEMBL61"],
    ["Sinularin", "CHEMBL488193"], # synonym: Flexibilide
    ["VSP34_8731", None],
    ["KRAS (G12C) Inhibitor-12", None], 
    ["ERK_2440", None],
    ["Mirin", "CHEMBL570841"],
    ["Picolinici-acid", "CHEMBL72628"], # synonym: 2-PICOLINIC ACID
    ["JAK_8517", None],
    ["ERK_6604", None],
    ["PAK_5339", None],
    ["TAF1_5496", None],
    ["IGF1R_3801", None],
    ["CDK9_5576", None],
    ["CDK9_5038", None],
    ["ULK1_4989", None],
    ["IAP_5620", None],
    ["Eg5_9814", None],
    ["Cetuximab", "CHEMBL1201577"], # directly in ChEMBL
    ["Bleomycin", "CHEMBL403664"], # directly in ChEMBL
    ["Bleomycin (50 uM)", None]
]

# 18 drugs still lack a valid ChEMBL ID

# Integrates curated ChEMBL matches, de-duplicates existing mappings, excludes unresolved drugs
new_entries = pd.DataFrame(non_found_chembl_search, columns=['drug_name', 'chembl_id'])
new_entries = new_entries[new_entries['chembl_id'].notna()]
drug2chembl = drug2chembl[~drug2chembl['chembl_id'].isin(new_entries['chembl_id'])]
drug2chembl = pd.concat([drug2chembl, new_entries], ignore_index=True)
drug2chembl = drug2chembl.dropna(subset=['chembl_id'])

In [35]:
drug2chembl.to_csv('./data/DGI/ChEMBL/drug2chembl_final.csv', header=False, index=False)

#### get targets

In [None]:
def fetch_targets_for_id(chembl_id, retries=3, delay=2):
    """
    Takes a list of ChEMBL IDs and returns a DataFrame with organism, pref_name, target_chembl_id,
    target_type, UniProt accession and chembl_id via the ChEMBL API.

    Filters:
    - organism: only Homo sapiens
    - target_type: must contain 'Protein'
    
    Returns:
    - pd.DataFrame: columns [organism, pref_name, target_chembl_id, target_type, uniprot_id, chembl_id]
    """
    bioactivities_api = new_client.activity
    targets_api = new_client.target

    for attempt in range(retries):
        try:
            bioactivities = bioactivities_api.filter(molecule_chembl_id=chembl_id).only("target_chembl_id")
            bioactivities_df = pd.DataFrame.from_records(bioactivities)

            if bioactivities_df.empty:
                print(f"No targets found for {chembl_id}")
                return pd.DataFrame([{
                    "target_chembl_id": None,
                    "pref_name": None,
                    "organism": None,
                    "target_type": None,
                    "uniprot_id": None,
                    "chembl_id": chembl_id
                }])

            target_ids = bioactivities_df["target_chembl_id"].drop_duplicates().tolist()
            targets = targets_api.filter(target_chembl_id__in=target_ids).only(
                "target_chembl_id", "pref_name", "organism", "target_type", "target_components"
            )

            records = []
            for target in targets:
                # Just targets with organism: Homo sapiens and target_type: PROTEIN
                if target.get("organism") != "Homo sapiens":
                    continue
                if " PROTEIN" not in target.get("target_type", "").upper():
                    continue

                components = target.get("target_components", [])
                uniprot_ids = [comp.get("accession") for comp in components if comp.get("accession")]

                for uniprot_id in uniprot_ids:
                    records.append({
                        "target_chembl_id": target.get("target_chembl_id"),
                        "pref_name": target.get("pref_name"),
                        "organism": target.get("organism"),
                        "target_type": target.get("target_type"),
                        "uniprot_id": uniprot_id,
                        "chembl_id": chembl_id
                    })

            if not records:
                print(f"No hits found for targets in Homo sapiens with {chembl_id}")
                return pd.DataFrame([{
                    "target_chembl_id": None,
                    "pref_name": None,
                    "organism": None,
                    "target_type": None,
                    "uniprot_id": None,
                    "chembl_id": chembl_id
                }])

            return pd.DataFrame(records)

        except Exception as e:
            print(f"[{chembl_id}] Error during attempt {attempt + 1}/{retries}: {e}")
            time.sleep(delay)

    print(f"[{chembl_id}] All attempts failed.")
    return pd.DataFrame([{
        "target_chembl_id": None,
        "pref_name": None,
        "organism": None,
        "target_type": None,
        "uniprot_id": None,
        "chembl_id": chembl_id
    }])

def get_targets_parallel(chembl_ids, max_workers=8):
    """
    Parallelizes the retrieval of targets for a list of ChEMBL IDs.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_targets_for_id, chembl_id): chembl_id for chembl_id in chembl_ids}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Parallel processing of ChEMBL IDs"):
            try:
                df = future.result()
                if not df.empty:
                    results.append(df)
            except Exception as e:
                print(f"[FUTURE ERROR] Error while processing a thread: {e}")

    if results:
        all_targets_df = pd.concat(results, ignore_index=True)
        return all_targets_df.drop_duplicates()
    else:
        print("No data collected.")
        return pd.DataFrame()

In [None]:
targets_df = get_targets_parallel(drug2chembl["chembl_id"].tolist())
targets_df.to_csv("./results/DGI/ChEMBL/chembl_targets.csv") # for later use in PPI

No targets found for 
- CHEMBL1421 (Dasatinib), wondering becuase there are over 3000 bioactivites (maybe to much to catch them all)
- CHEMBL1201577 (Cetuximab), no data available for compound 

No hits found for targets in Homo sapiens with
- CHEMBL2349416 (Pyridostatin), just target_type: cellline (11, but only 1 with Homo sapiens as Target Organism)
- CHEMBL1969416, just target_type: cellline (55) 
- CHEMBL924 (Zoledronic acid anhydrous), remark: there are SINGLE PROTEIN as target_type
- CHEMBL399907 (Elephantin), just target_type: cellline (3)
- CHEMBL488193 (Flexibilide), just target_type: cellline (2)

In [39]:
# new format: drug_name, pref_name, chembl_id (rename to drug_chembl_id), target_chembl_id, uniprot_id (rename too target_uniprot_id), organism, target_type

targets_df = pd.read_csv("./results/DGI/ChEMBL/chembl_targets.csv", sep='\t')

targets_df = targets_df.rename(columns={
    'chembl_id': 'drug_chembl_id',
    'uniprot_id': 'target_uniprot_id'
})

targets_df = targets_df.merge(
    drug2chembl[['chembl_id', 'drug_name']],
    left_on='drug_chembl_id',
    right_on='chembl_id',
    how='left'
)

targets_df = targets_df[['drug_name', 'pref_name', 'drug_chembl_id', 'target_chembl_id', 'target_uniprot_id', 'organism', 'target_type']]

#### get gene_name

In [40]:
# get unique UniProt IDs fromt the DataFrame (fewer API calls by avoiding duplicates and take unique genes)
unique_uniprot_ids = targets_df['target_uniprot_id'].dropna().unique()

def fetch_gene_name(uniprot_id):
    """
    Takes a list of UniProt IDs and returns a DataFrame with the first hint of the gene_name via the ChEMBL API.

    Returns:
    - pd.DataFrame: columns [gene_name]
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            genes = data.get('genes')
            if genes and 'geneName' in genes[0]:
                return genes[0]['geneName'].get('value')
    except:
        pass
    return None

# Map UniProt IDs to gene names using the UniProt API
uniprot_to_gene = {}
for uniprot_id in tqdm(unique_uniprot_ids, desc="Loading gene names from UniProt"):
    gene_name = fetch_gene_name(uniprot_id)
    uniprot_to_gene[uniprot_id] = gene_name
    time.sleep(0.5)  # Respect API rate limits

# Add gene name column to the DataFrame by mapping UniProt IDs
targets_df['gene_name'] = targets_df['target_uniprot_id'].map(uniprot_to_gene)

Loading gene names from UniProt: 100%|██████████| 1148/1148 [13:17<00:00,  1.44it/s]


In [None]:
# Filters ChEMBL drug-gene interactions matching the gene and drug list of TUGDA
targets_df = targets_df[['drug_name', 'gene_name']]
targets_df = targets_df[targets_df['gene_name'].isin(gene_list)]
targets_df = targets_df[targets_df['drug_name'].isin(drug_list)]
chembl_final = targets_df
# Final: 18622 Interactions

In [50]:
chembl_final.to_csv("./results/DGI/ChEMBL.csv", index=0)

### 1.3 BioGrid

In [None]:
# Download BIOGRID-ALL-4.4.244.tab3.txt (https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.4.246/BIOGRID-CHEMICALS-4.4.246.chemtab.zip)
BioGrid_df = pd.read_csv("raw_data/BIOGRID-CHEMICALS-4.4.246.chemtab.zip", compression='zip', sep="\t", low_memory=False) # low_memory=False disables chunk processing to properly infer data types, which uses more memory but avoids dtype warnings.

In [None]:
# Filter BioGRID data to include only interactions involving relevant drugs and genes
# Matches are based on official names and known synonyms

# Preprocessing: Split column as list (pipe-separated)
BioGrid_df['Chemical Name List'] = BioGrid_df['Chemical Name'].str.split('|')
BioGrid_df['Chemical Synonyms List'] = BioGrid_df['Chemical Synonyms'].str.split('|')
BioGrid_df['Official Symbol List'] = BioGrid_df['Official Symbol'].str.split('|')
BioGrid_df['Synonyms List'] = BioGrid_df['Synonyms'].str.split('|')

# Expand the lists: each list entry becomes a separate row (one-to-many explosion)
df_exploded_drugs = BioGrid_df.explode('Chemical Name List').explode('Chemical Synonyms List')
BioGrid_exploded_all = df_exploded_drugs.explode('Official Symbol List').explode('Synonyms List')

# Mapping 
# Match, if hits for gene and drug in the corresponding columns
drug_set = set(drug_list)
gene_set = set(gene_list)

# Create boolean masks for rows that match known drugs or their synonyms
# Drug with Chemical Name List and Chemical Synonyms List
drug_mask = (
    BioGrid_exploded_all['Chemical Name List'].isin(drug_set) |
    BioGrid_exploded_all['Chemical Synonyms List'].isin(drug_set)
)

# Create boolean masks for rows that match known drugs or their synonyms
# Gene with Official Symbol List and and Synonyms List
gene_mask = (
    BioGrid_exploded_all['Official Symbol List'].isin(gene_set) |
    BioGrid_exploded_all['Synonyms List'].isin(gene_set)
)

# Filter rows where both drug and gene match
BioGrid_final = BioGrid_exploded_all[drug_mask & gene_mask].copy()



In [59]:

# Issue: Rows may contain multiple matches; we need clear 1:1 mapping for gene–drug pairs
# Solution: Determine the exact matching name (original or synonym) for each gene and drug

# Resolve drug name: prefer official name, fallback to synonym
BioGrid_final['drug_name'] = np.where(
    BioGrid_final['Chemical Name List'].isin(drug_set),
    BioGrid_final['Chemical Name List'],
    np.where(
        BioGrid_final['Chemical Synonyms List'].isin(drug_set),
        BioGrid_final['Chemical Synonyms List'],
        None
    )
)

# Resolve gene name: prefer official symbol, fallback to synonym
BioGrid_final['gene_name'] = np.where(
    BioGrid_final['Official Symbol List'].isin(gene_set),
    BioGrid_final['Official Symbol List'],
    np.where(
        BioGrid_final['Synonyms List'].isin(gene_set),
        BioGrid_final['Synonyms List'],
        None
    )
)

BioGrid_final = BioGrid_final[['gene_name', 'drug_name']].drop_duplicates()
# Final: 129 Interactions

In [60]:
BioGrid_final.to_csv("./results/DGI/BioGrid.csv", index=0)

### 1.4 PharmaGKB

In [None]:
# Download clinicalAnnotations.zip (https://www.pharmgkb.org/downloads)

from zipfile import ZipFile
import os

# Download relationship file from zip file
with ZipFile("raw_data/relationships.zip") as file: 
    file.extract("relationships.tsv", path="raw_data")
# Delete zip file
os.remove("raw_data/relationships.zip")

In [None]:
Pharma_relationships = pd.read_csv("raw_data/relationships.tsv", compression='zip') 
Pharma_relationships = Pharma_relationships[['Entity1_name', 'Entity2_name']]
Pharma_relationships['Entity2_name'] = Pharma_relationships['Entity2_name'].str.capitalize()

# Filters PharmaGKB drug-gene interactions matching the gene and drug list of TUGDA
Pharma_relationships = Pharma_relationships.rename(columns={'Entity1_name': 'gene_name', 'Entity2_name': 'drug_name'})
Pharma_relationships = Pharma_relationships[Pharma_relationships['gene_name'].isin(gene_list)]
Pharma_final = Pharma_relationships[Pharma_relationships['drug_name'].isin(drug_list)]
Pharma_final = Pharma_final.drop_duplicates()
# 284 Interactions

In [21]:
Pharma_final.to_csv("./results/DGI/Pharma.csv", index=0)

### 1.5 CTD

In [4]:
# Download the file from the "Chemical-gene-interactions" section at: https://ctdbase.org/downloads/
# Define all column names as specified under the "Fields" section of the Chemical-Gene Interactions file
column_names = [
    "ChemicalName",
    "ChemicalID",
    "CasRN",
    "GeneSymbol",
    "GeneID",
    "GeneForms",
    "Organism",
    "OrganismID",
    "Interaction",
    "InteractionActions",
    "PubMedIDs"
]

# Load the CTD chemical-gene interaction data, skipping comment lines and assigning column names
CTD = pd.read_csv("raw_data/CTD_chem_gene_ixns.csv.gz",comment="#",names=column_names)

# Rename columns for clarity and consistency with downstream analysis
CTD = CTD.rename(columns={'GeneSymbol': 'gene_name', 'ChemicalName': 'drug_name'})

In [23]:
# Filters CTD drug-gene interactions for human-specific entries matching the gene and drug list of TUGDA
# returning unique drug-gene pairs
CTD = CTD[['drug_name','gene_name', 'Organism']]
CTD = CTD[CTD['Organism'] == 'Homo sapiens']
CTD = CTD[CTD['gene_name'].isin(gene_list)]
CTD = CTD[CTD['drug_name'].isin(drug_list)]
CTD_final = CTD[['drug_name','gene_name']].drop_duplicates()
# Number of interactions: 6500

In [24]:
CTD_final.to_csv("./results/DGI/CTD.csv", index=0)

### 1.6 Convert all databases together

In [276]:
dfs = {
    "DGIdb": dgidb_final,
    "ChEMBL": chembl_final,
    "BioGrid": BioGrid_final,
    "PharmaGKB": Pharma_final,
    "CTD": CTD_final
}

# New list with DataFrames, each gets a “database” column
df_list = []
for name, df in dfs.items():
    df_copy = df.copy()
    df_copy["database"] = name
    df_list.append(df_copy)

# Convert them
final_all_DGI = pd.concat(df_list, ignore_index=True)


In [277]:
# Combine all duplicates with the same gene_name and drug_name into one entry
final_all_DGI = (
    final_all_DGI
    .groupby(["gene_name", "drug_name"]) # remove the duplicates so that each unique entry (gene_name, drug_name) entry occurs only once
    .agg({
        "database": lambda x: ", ".join(sorted(set(x))), # in the "database" column, list all participating databases separated by commas
    })
    .reset_index()
)

In [278]:
final_all_DGI.to_csv("./results/DGI/DGI_Final.csv")

### 1.7 Analysis

In [279]:
# Final check for consistency between DGI data and TUGDA input

# How many interactions per database?
print("\n" + "="*45)
print("1.) How many interactions per database?\n")
for name, df in dfs.items():
    print(f"   - {name:}: {len(df):} pairs")

# Total Interactions 
print("\n" + "="*45)
print(f"2.) Total Drug–Gene Interactions: {len(final_all_DGI)}")
print("="*45)

# Overlapping between the databases

# Filter use rows with comma in 'databases'
combi = final_all_DGI[final_all_DGI['database'].str.contains(',')]
counts = combi['database'].value_counts()
print(f"3.) Overlapping Drug–Gene Interactions: {counts.sum()}")
print("="*45)

# Missing genes and drugs (Overlap with TUGDA Input)
# Check overlap in drugs
print("4.) Overlap with TUGDA Input\n")

drugs_final_dgi = set(final_all_DGI['drug_name'].dropna().unique())
drugs_TUGDA = set(drug_list)
drugs_common = drugs_final_dgi & drugs_TUGDA

percent_drugs = len(drugs_common) / len(drugs_TUGDA) * 100 if drugs_TUGDA else 0
print(f"   - Drugs: {len(drugs_common)} of {len(drugs_TUGDA)} matched ({percent_drugs:.2f}%)") # 176 of 200 common genes (88%)

# Check overlap in genes
genes_final_dgi = set(final_all_DGI['gene_name'].dropna().unique())
genes_TUGDA = set(gene_list)
genes_common = genes_final_dgi & genes_TUGDA
percent_genes = len(genes_common) / len(genes_TUGDA) * 100 if genes_TUGDA else 0
print(f"   - Genes: {len(genes_common)} of {len(genes_TUGDA)} matched ({percent_genes:.2f}%)") # 1636 of 1780 common genes (91,91%)
print("="*45 + "\n")




1.) How many interactions per database?

   - DGIdb: 1516 pairs
   - ChEMBL: 18622 pairs
   - BioGrid: 129 pairs
   - PharmaGKB: 284 pairs
   - CTD: 6500 pairs

2.) Total Drug–Gene Interactions: 25388
3.) Overlapping Drug–Gene Interactions: 1392
4.) Overlap with TUGDA Input

   - Drugs: 176 of 200 matched (88.00%)
   - Genes: 1636 of 1780 matched (91.91%)



### 1.8 Matrix

In [280]:
# define setup of TUGDA
gene_list = list(gene_list)
drug_list = list(drug_list)

# Build an interaction matrix initialized with zeros, with genes as rows and drugs as columns
interaction_matrix_DGI = pd.DataFrame(0, index=gene_list, columns=drug_list)

# Iterate over the filtered DataFrame to mark interactions with 1
for _, row in final_all_DGI.iterrows():
    gene = row['gene_name']
    drug = row['drug_name']

    # Only consider interactions where both gene and drug are in the predefined lists
    if gene in gene_list and drug in drug_list:
        interaction_matrix_DGI.at[gene, drug] = 1

# Transpose the matrix so that drugs become rows and genes become columns
interaction_matrix_DGI = interaction_matrix_DGI.T

In [281]:
interaction_matrix_DGI.to_csv("./results/DGI/DGI_binary_matrix.csv")

# 2. Protein-Protein-Interactions (PPI)

In [None]:
gene_name = pd.read_csv("./data/heterogeneous_network/protein2gene.csv", sep='\t')
ppi = pd.read_csv("./data/heterogeneous_network/9606.protein.links.v12.0.txt", sep=" ")

# Mapping 
ppi["protein1"] = ppi["protein1"].str.replace("9606.", "", regex=False)
ppi["protein2"] = ppi["protein2"].str.replace("9606.", "", regex=False)

# Umbenennen zur Vorbereitung fürs Joinen
gene_name = gene_name.rename(columns={"protein_ENSP": "protein", "gene name": "gene"})

# Merge für protein1
ppi = ppi.merge(gene_name, how="left", left_on="protein1", right_on="protein")
ppi = ppi.rename(columns={"gene": "source"})
ppi = ppi.drop(columns=["protein"])  # doppelte Spalte entfernen

# Merge für protein2
ppi = ppi.merge(gene_name, how="left", left_on="protein2", right_on="protein")
ppi = ppi.rename(columns={"gene": "target"})
ppi = ppi.drop(columns=["protein"])

In [16]:
ppi

Unnamed: 0,protein1,protein2,combined_score,source,target
0,ENSP00000000233,ENSP00000356607,173,ARF5,RALGPS2
1,ENSP00000000233,ENSP00000427567,154,ARF5,FHDC1
2,ENSP00000000233,ENSP00000253413,151,ARF5,ATP6V1E1
3,ENSP00000000233,ENSP00000493357,471,ARF5,Not Found
4,ENSP00000000233,ENSP00000324127,201,ARF5,PSD3
...,...,...,...,...,...
13715399,ENSP00000501317,ENSP00000475489,195,Not Found,Not Found
13715400,ENSP00000501317,ENSP00000370447,158,Not Found,VCX
13715401,ENSP00000501317,ENSP00000312272,226,Not Found,YPEL2
13715402,ENSP00000501317,ENSP00000402092,169,Not Found,SAMD3


In [23]:
ppi = ppi[["source", "target"]]

mask = (ppi['source'] != 'Not Found') & (ppi['target'] != 'Not Found')
final_ppi_String = ppi[mask]

In [24]:
final_ppi_String.to_csv("./results/PPI/final_PPI_String.csv")

### Missings

- bei protein1/source: 2647 Interaktionen gehen verloren durch "Not found"
- bei protein2/target: 2647 Interaktionen gehen verloren durch "Not found"
- Schnittmenge: 2647 unique Interaktionen gehen verloren

In [21]:
filtered_df_target = ppi[ppi['target'] == 'Not Found']
unique_combinations_target = filtered_df_target[['protein2', 'target']].drop_duplicates()

filtered_df_source = ppi[ppi['source'] == 'Not Found']
unique_combinations_source = filtered_df_source[['protein1', 'source']].drop_duplicates()

In [22]:
unique_proteins_from_source = filtered_df_source['protein1'].drop_duplicates()
unique_proteins_from_target = filtered_df_target['protein2'].drop_duplicates()

# 4. Zusammenführen der einzigartigen Proteine aus beiden Spalten:
all_unique_proteins = pd.concat([
    unique_proteins_from_source,
    unique_proteins_from_target
]).drop_duplicates().reset_index(drop=True)
print(all_unique_proteins)

0       ENSP00000450436
1       ENSP00000450444
2       ENSP00000450456
3       ENSP00000450461
4       ENSP00000450480
             ...       
2642    ENSP00000501254
2643    ENSP00000501259
2644    ENSP00000501265
2645    ENSP00000501277
2646    ENSP00000501317
Length: 2647, dtype: object
