In [1]:
import json
import urllib.request
from enum import Enum

import pandas as pd

In [6]:
class Conc(Enum):
    uM = "uM"
    µM = "µM"
    NA = "NA"


def convert_to_float(var: str):
    if var == "\\N":
        var = "NaN"
    return float(var)


def convert_to_conc(var: str):
    return Conc(var)


# Note that the https requests don't work if drugcomb has not updated their TLS certificate
with urllib.request.urlopen("https://api.drugcomb.org/drugs/") as url:
    data = json.load(url)
    drugs = pd.json_normalize(data)

with urllib.request.urlopen("https://api.drugcomb.org/cell_lines/") as url:
    data = json.load(url)
    cell_lines = pd.json_normalize(data)

In [7]:
# File paths - insert your own paths
drug_comb_path = "../data/path_to_drugcomb_1.5.csv"
gdsc_genexpression_path = "../data/path_to_gdsc_genexpression.csv"

In [8]:
# Read files
df_drugcomb_core = pd.read_csv(
    drug_comb_path,
    converters={
        "drug_row_clinical_phase": convert_to_float,
        "drug_col_clinical_phase": convert_to_float,
        "conc_row_unit": convert_to_conc,
        "conc_col_unit": convert_to_conc,
        "synergy_zip": convert_to_float,
        "synergy_loewe": convert_to_float,
        "synergy_hsa": convert_to_float,
        "synergy_bliss": convert_to_float,
    },
    header=0,
    dtype={
        "block_id": int,
        "drug_row": str,
        "drug_col": str,
        "cell_line_name": str,
        "study_name": str,
        "tissue_name": str,
        "ic50_row": float,
        "ic50_col": float,
        "ri_row": float,
        "ri_col": float,
        "css_row": float,
        "css_col": float,
        "css_ri": float,
        "S_sum": float,
        "S_mean": float,
        "S_max": float,
        "drug_row_target_name": str,
        "drug_col_target_name": str,
    },
)

df_geneExpression = pd.read_csv(
    gdsc_genexpression_path,
    delimiter=",",
)

In [9]:
# Add COSMIC IDs from cell lines
df_core_cosmic = pd.merge(
    df_drugcomb_core,
    df_cell_lines[["name", "cosmic_id"]],
    right_on="name",
    left_on="cell_line_name",
    how="inner",
)
# Remove entries without a COSMIC ID
df_core_cosmic = df_core_cosmic[~df_core_cosmic["cosmic_id"].isna()]


# Pandas incorrectly cast the COSMIC IDs to floats therefore we cast it to a string and remove the .0 after each ID
df_core_cosmic["cosmic_id"] = df_core_cosmic["cosmic_id"].apply(lambda x: str(x)[:-2])


# Remove entries with COSMIC IDs for which we have no GDSC entry
df_core_cosmic_in_gdsc = df_core_cosmic[
    df_core_cosmic["cosmic_id"].isin(df_geneExpression.keys())
]
df_core_cosmic.reset_index(inplace=True)

In [20]:
# Remove duplicate drugs by name
df_drugs_duplicate_removed = df_drugs.drop_duplicates(subset="dname")

# Remove drugs that have no SMILES
df_drugs_final = df_drugs_duplicate_removed[
    ~df_drugs_duplicate_removed["smiles"].isnull()
]

df_drugs_final = df_drugs_final.drop(
    df_drugs_final[
        # Filter entries with invalid SMILES
        (df_drugs_final["smiles"] == "NULL")
        | (df_drugs_final["smiles"] == "-666")
    ].index
)

df_drugs_final.reset_index(inplace=True)
del df_drugs_final["index"]
df_drugs_final["id"] = df_drugs_final.index

# Transpose the dataframe to access a drug by its id through df_drugs_final_transposed[drug_id]
df_drugs_final_transposed = df_drugs_final.transpose()
df_drugs_final_transposed.to_json("../data/DrugComb_drugs_full.json")

In [23]:
# Add the drug IDs to the DrugComb dataframe
df_drugcomb_final = pd.merge(
    df_core_cosmic_in_gdsc,
    df_drugs_duplicate_removed[["id", "dname"]],
    left_on="drug_row",
    right_on="dname",
    how="left",
)
df_drugcomb_final.rename(columns={"id": "drug_row_id"}, inplace=True)
del df_drugcomb_final["dname"]

df_drugcomb_final = pd.merge(
    df_drugcomb_final,
    df_drugs_duplicate_removed[["id", "dname"]],
    left_on="drug_col",
    right_on="dname",
    how="left",
)
df_drugcomb_final.rename(columns={"id": "drug_col_id"}, inplace=True)
del df_drugcomb_final["dname"]


df_drugcomb_final.to_csv("../data/DrugComb_Final.csv")

In [24]:
### Fetch the drug responses matrixes for each concentration combination
df_responses = pd.DataFrame()

# The response api is limited to a few thousand per call. We found 1000 to work the best.
step_size = 1000
# This fetch is very costly and was not optimized. It was just used once to create the full dataset of responses. Regardless of optimization this is very costly since there are roughly 1.5 million entries so this will total to 1 500 000 / 1000 = 1 500 requests
total_blocks = df_drugcomb_final["lock_id"].max()
for i in range(1, total_blocks, step_size):
    with urllib.request.urlopen(
        f"https://api.drugcomb.org/response?from={i}&to={i+step_size}"
    ) as url:
        data = json.load(url)
        response = pd.json_normalize(data)
        df_responses = pd.concat(
            [df_responses, response[["block_id", "conc_r", "conc_c", "inhibition"]]]
        )
    percent = (i / total_blocks) * 100
    bar = ("#" * int(percent // 2)).ljust(50)
    print(f"\rProgress: [{bar}] {percent:.2f}%", end="")

df_responses.drop_duplicates(inplace=True)

Progress: [################################################# ] 99.99%

In [49]:
df_responses.to_csv(
    "../data/DrugComb_dose_responses.csv"
)

In [59]:
# Add Drug and COSMIC IDs to the response matrix dataframe
df_responses_final = pd.merge(
    df_responses,
    df_drugcomb_final[["lock_id", "drug_row_id", "drug_col_id", "cosmic_id"]],
    left_on="block_id",
    right_on="lock_id",
    how="left",
)
del df_responses_final["lock_id"]

df_responses_final = df_responses_final[~df_responses_final["cosmic_id"].isna()]

# Remove entries with outlier inhibition values
df_responses_final_extreme_removed = df_responses_final[
    (
        (df_responses_final["inhibition"] > -200)
        & (df_responses_final["inhibition"] < 200)
    )
]

In [60]:
# Remove 
df_responses_final_extreme_removed["cosmic_id"] = df_responses_final_extreme_removed[
    "cosmic_id"
].astype(int)
# Set Drug ID to -1 for monotherapies second drug (which is NaN for monotherapies)
df_responses_final_extreme_removed["drug_col_id"] = df_responses_final_extreme_removed[
    "drug_col_id"
].fillna(-1)
df_responses_final_extreme_removed["drug_col_id"] = df_responses_final_extreme_removed[
    "drug_col_id"
].astype(int)
df_responses_final_extreme_removed["drug_col_id"] = df_responses_final_extreme_removed[
    "drug_col_id"
].astype(int)
df_responses_final_extreme_removed["drug_row_id"] = df_responses_final_extreme_removed[
    "drug_row_id"
].astype(int)
df_responses_final_extreme_removed.reset_index(inplace=True)

df_responses_final_extreme_removed.to_csv(
    "../data/DrugComb_dose_response_mapped_final.csv"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_responses_final_extreme_removed["cosmic_id"] = df_responses_final_extreme_removed[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_responses_final_extreme_removed["drug_col_id"] = df_responses_final_extreme_removed[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_responses_final_extreme_remo