In [1]:
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


## Entry Step

In [2]:
# Directory and file name of the input FIR workbook.
ic50_path = pathlib.Path("../tmp/")
ic50_filename = "FIR_file_for_EC50.xlsx"

# Read the workbook into the raw measurement frame.
ic50_df = pd.read_excel(ic50_path.joinpath(ic50_filename))

In [3]:
# Canonical column names used by every later cell. They are applied BY POSITION
# to the first three columns of the input sheet; any further columns keep their
# original names.
rename_columns = ["EOS", "CONCENTRATION", "VALUE"]

# zip() would silently truncate if the sheet had fewer columns, which would only
# surface later as an unrelated KeyError -- fail early with a clear message.
if len(ic50_df.columns) < len(rename_columns):
    raise ValueError(
        f"Expected at least {len(rename_columns)} columns in the input sheet, "
        f"found {len(ic50_df.columns)}: {list(ic50_df.columns)}"
    )

rename_dict = dict(zip(ic50_df.columns, rename_columns))
# Rebind instead of mutating in place so the cell has no hidden side effects.
ic50_df = ic50_df.rename(columns=rename_dict)

In [4]:
# Order rows by compound and then by concentration. Rebinding (rather than
# inplace=True) keeps the cell free of in-place mutation.
ic50_df = ic50_df.sort_values(by=["EOS", "CONCENTRATION"])

## DRC Fitting Step

In [5]:
# Readouts strictly below this value are excluded from curve fitting.
LOWER_BOUND = -100

# Keep only the rows whose VALUE is at or above the lower bound.
inh_valid = ic50_df.loc[ic50_df["VALUE"].ge(LOWER_BOUND)]

# Sorting again here would be redundant: the boolean filter keeps the row order.

In [6]:
from scipy.optimize import curve_fit

# Open question "EC50 or IC50?" -- resolved: this model is fitted for IC50.
def four_param_logistic(x, lower_limit, upper_limit, ic50, slope):
    """Evaluate the four-parameter logistic dose-response model.

    Computes ``upper_limit + (lower_limit - upper_limit) / (1 + (x / ic50) ** slope)``.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) at which the curve is evaluated (the concentrations during fitting).
    lower_limit, upper_limit : float
        The two plateaus of the curve.
    ic50 : float
        Value of ``x`` at which the curve is halfway between the two plateaus.
    slope : float
        Exponent applied to ``x / ic50``.

    Returns
    -------
    Same shape as ``x``: the model response.
    """
    plateau_span = lower_limit - upper_limit
    relative_dose = x / ic50
    return upper_limit + plateau_span / (1 + relative_dose ** slope)

# Open question "does the upper branch group compounds?" -- resolved: it does
# (implicitly), so the filtered rows are grouped per compound (EOS) here.

by_eos = inh_valid.groupby("EOS")


In [7]:
# Fit one four-parameter logistic curve per compound and collect, for each EOS,
# the fitted parameters plus the lowest/highest tested concentration.
fit_props = ["lower_limit", "upper_limit", "ic50", "slope"]
concentration_props = ["min_concentration", "max_concentration"]
curve_fit_params = {key: [] for key in ["EOS", *fit_props, *concentration_props]}
for key, group in by_eos:
    # Average replicate readouts so there is one point per concentration.
    by_conc = group.groupby("CONCENTRATION")
    values_avg = by_conc["VALUE"].mean()
    x = values_avg.index.to_numpy()
    y = values_avg.values

    if x.size < len(fit_props):
        # curve_fit raises TypeError (not RuntimeError) when there are fewer
        # data points than parameters; without this guard a single sparse
        # compound would abort the loop for every remaining compound.
        print(
            f"EOS: {key} - curve_fit skipped: {x.size} concentration(s), "
            f"need at least {len(fit_props)}"
        )
        params = [np.nan] * len(fit_props)
    else:
        try:
            params, _ = curve_fit(four_param_logistic, x, y, maxfev=10000)
        except (RuntimeError, ValueError):
            # RuntimeError: the optimiser did not converge within maxfev.
            # ValueError: non-finite values in the input data.
            print(f"EOS: {key} - curve_fit failed")
            params = [np.nan] * len(fit_props)

    # Failed/skipped fits are kept as NaN rows so every compound stays listed.
    curve_fit_params["EOS"].append(key)
    for i, name in enumerate(fit_props):
        curve_fit_params[name].append(params[i])

    curve_fit_params["min_concentration"].append(x.min())
    curve_fit_params["max_concentration"].append(x.max())


curve_fit_df = pd.DataFrame(curve_fit_params)

  return upper_limit + (lower_limit - upper_limit) / (1 + (x / ic50) ** slope)


EOS: EOS101302 - curve_fit failed
EOS: EOS2452 - curve_fit failed
EOS: EOS60470 - curve_fit failed
EOS: EOS84313 - curve_fit failed
EOS: EOS98641 - curve_fit failed


In [8]:
# Open question: what is the justification for this step? Resolved so far only
# as "multiply by -1" (presumably to match the KNIME sign convention -- TODO confirm).
KNIME_SLOPE_MULTIPLIER = -1

# TBD: should the negation persist if ACTIVATION/INHIBITION is tested?

# NOTE: this overwrites `slope` in place, so re-running the cell flips the sign again.
curve_fit_df["slope"] = curve_fit_df["slope"].mul(KNIME_SLOPE_MULTIPLIER).round(2)

In [9]:
# Show the per-compound fit table; compounds whose fit failed carry NaN parameters.
curve_fit_df

Unnamed: 0,EOS,lower_limit,upper_limit,ic50,slope,min_concentration,max_concentration
0,EOS100028,-11.652444,101.686898,5.073602e-01,-0.87,0.0125,50.0
1,EOS100057,0.546953,104.079979,2.018330e+00,-0.96,0.0125,50.0
2,EOS100080,-0.091180,-47.887953,4.980150e+00,-14.64,0.0125,5.0
3,EOS100134,-2.483754,99.885443,8.876217e-01,-1.23,0.0125,50.0
4,EOS100147,-4.950270,99.465109,3.961385e-01,-1.16,0.0125,50.0
...,...,...,...,...,...,...,...
504,EOS98635,-14.101436,100.659103,3.775230e-01,-0.90,0.0125,50.0
505,EOS98640,-152.610628,99.999461,3.282508e-03,-0.96,0.0125,50.0
506,EOS98641,,,,,0.0125,50.0
507,EOS98642,-640225.681281,100.019671,5.762685e-08,-0.93,0.0125,50.0


In [10]:
# Initial operator: '>' when the fitted IC50 lies above the highest tested
# concentration, '<' when it lies below the lowest one, '=' otherwise.
# The first matching condition wins, so the order of the list matters.
curve_fit_df["operator"] = np.select(
    [
        curve_fit_df["ic50"] > curve_fit_df["max_concentration"],
        curve_fit_df["ic50"] < curve_fit_df["min_concentration"],
    ],
    [">", "<"],
    default="=",
)

In [11]:
# Copy of the fit table keyed by EOS; it is the right-hand side of the merge below.
curve_fit_df_indexed_by_eos = curve_fit_df.set_index("EOS")

## Activity Determination

In [12]:
# Two-stage aggregation on the unfiltered measurements:
#   1. average the replicates for each (EOS, CONCENTRATION) pair,
#   2. per EOS, take the max / min / mean of those per-concentration averages.
concentration_grouped_df = (
    ic50_df
    .groupby(["EOS", "CONCENTRATION"], as_index=False)["VALUE"]
    .mean()
    .groupby("EOS")["VALUE"]
    .agg(["max", "min", "mean"])
    .reset_index()
)

In [13]:
# NOTE: the sorter branch is grouped by compound ID and concentration, while the
# DRC branch is grouped by compound ID only.

# TODO: this should be grouped by compound ID only.

# Give the per-compound value statistics explicit names, then attach the fit
# results; the inner join keeps only compounds present on both sides.
activation_df = (
    concentration_grouped_df
    .rename(
        columns={
            "max": "max_value",
            "min": "min_value",
            "mean": "mean_value",
        }
    )
    .merge(
        curve_fit_df_indexed_by_eos,
        how="inner",
        left_on="EOS",
        right_on="EOS",
    )
)

In [14]:
# Preview the merged table (value statistics + fit parameters + operator).
activation_df.head()

Unnamed: 0,EOS,max_value,min_value,mean_value,lower_limit,upper_limit,ic50,slope,min_concentration,max_concentration,operator
0,EOS100028,98.147236,-7.986852,51.045043,-11.652444,101.686898,0.50736,-0.87,0.0125,50.0,=
1,EOS100057,97.939306,1.522787,42.664352,0.546953,104.079979,2.01833,-0.96,0.0125,50.0,=
2,EOS100080,5.125264,-176.911844,-43.892611,-0.09118,-47.887953,4.98015,-14.64,0.0125,5.0,=
3,EOS100134,100.345955,-2.666537,47.895665,-2.483754,99.885443,0.887622,-1.23,0.0125,50.0,=
4,EOS100147,100.171501,-1.553326,55.804997,-4.95027,99.465109,0.396139,-1.16,0.0125,50.0,=


In [16]:
# A compound counts as active at every concentration when even its lowest
# averaged readout exceeds 75, and as inactive at every concentration when even
# its highest averaged readout stays below 30.
MAX_MIN_VALUE_THRESHOLD = 75
MIN_MAX_VALUE_THRESHOLD = 30

# Adjust-operator step.
activation_df["all_conc_active"] = activation_df["min_value"].gt(MAX_MIN_VALUE_THRESHOLD)
activation_df["all_conc_inactive"] = activation_df["max_value"].lt(MIN_MAX_VALUE_THRESHOLD)

# Override the fitted operator for the two flat cases; the first matching
# condition wins and every other row keeps the operator it already had.
activation_df["operator"] = np.select(
    [activation_df.all_conc_active, activation_df.all_conc_inactive],
    ["<", ">"],
    default=activation_df.operator,
)

In [17]:
# The reverse-dose flag is defined on the UN-ADJUSTED (raw fitted) slope.
# `activation_df.slope` has already been multiplied by KNIME_SLOPE_MULTIPLIER,
# so testing it directly would flag ordinary rising curves (raw slope > 0) as
# reverse-dose. Undo the multiplier before checking the sign.
# NOTE(review): confirm against the reference workflow that the raw slope is the
# intended input here.

activation_df["is_reverse_dose"] = (activation_df.slope / KNIME_SLOPE_MULTIPLIER) < 0

In [18]:
# Fitted IC50 values strictly below this cutoff are flagged as active.
ACTIVITY_THRESHOLD = 10

# Rows with a missing IC50 (failed fit) compare as False and are not flagged.
activation_df["is_active"] = activation_df["ic50"].lt(ACTIVITY_THRESHOLD)

In [19]:
# Open question "what if unknown?" -- resolved (as inferred from the file):
# unknown results must be reported as "inconclusive".
#
# Final call per compound:
#   * "inconclusive" - operator is not '=' OR the fit failed (IC50 is NaN);
#   * "inactive"     - IC50 >= 10 or upper plateau <= 30;
#   * "active"       - everything else.
# The explicit isna() check matters: a failed fit keeps operator '=' (NaN
# comparisons are False) and also fails both "inactive" tests, so without it
# such rows would be labelled "active".
activation_df["activity_final"] = np.where(
    (activation_df.operator != "=") | activation_df.ic50.isna(),
    "inconclusive",
    np.where(
        (activation_df.ic50 >= 10) | (activation_df.upper_limit <= 30),
        "inactive",
        "active",
    ),
)


In [20]:
# Partially active: upper plateau strictly between 30 and 80 with IC50 below 10.
# Rows with missing fit values evaluate to False.
activation_df["is_partially_active"] = (
    activation_df.upper_limit.gt(30)
    & activation_df.upper_limit.lt(80)
    & activation_df.ic50.lt(10)
)

In [21]:
# Write the final table to an Excel file in the current working directory,
# without the positional index column.
activation_df.to_excel("active_prototype.xlsx", index=False)