# GI50
This NCI60 data gives an insight to effect of a chemical on cancer.
Cancer is not one disease but a collection of every possible cell in growing uncontrolled.
For this reason NCI60 has multiple different cell lines (column CELL_NAME).

I have here the GI50 data of the NCI60 project
GI50: concentration to stop growth with 50%. column "AVERAGE" is the average concentration needed.

In [1]:
import pandas as pd
from os.path import join as path_join
import numpy as np

In [2]:
gi50 = pd.read_csv(path_join("data", "GI50.csv"))
print(gi50.shape)
gi50.head()

(4547757, 14)


Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
0,20210223,0804NS72,S,745455,M,-4.0,10,7,Melanoma,SK-MEL-5,MEL,1,-5.1652,0.0
1,20210223,0804NS72,S,745455,M,-4.0,10,8,Melanoma,SK-MEL-28,MEL,1,-4.0,0.0
2,20210223,0804NS72,S,745455,M,-4.0,10,14,Melanoma,M14,MEL,1,-5.3654,0.0
3,20210223,0804NS72,S,745455,M,-4.0,10,20,Melanoma,UACC-62,MEL,1,-4.0,0.0
4,20210223,0804NS72,S,745455,M,-4.0,10,21,Melanoma,UACC-257,MEL,1,-4.0,0.0


# Experiment ID


In [3]:
gi50.groupby("EXPID").count().sort_values("RELEASE_DATE")["RELEASE_DATE"]

EXPID
9308BM77       3
9308MD77       3
9903SE98       4
9308BG87       4
9510HG75       6
            ... 
1210NS67    5252
1104RS29    5307
1108NS08    5482
1103NS18    5580
1102NS11    5632
Name: RELEASE_DATE, Length: 5034, dtype: int64

In [4]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(x=gi50.groupby("EXPID").count().sort_values("RELEASE_DATE")["RELEASE_DATE"], name="Experiment Count"))

fig.update_layout(
    title_text='Number of GI50 measurements per experiment', # title of plot
    xaxis_title_text='Number measurements per experiment', # xaxis label
    yaxis_title_text='Number of Experiments', # yaxis label
)
fig.show()

In [5]:
gi50["RELEASE_DATE"].unique()

array([20210223, 20211008, 20210630, 20220119, 20210319], dtype=int64)

In [6]:
gi50.groupby("NSC").count().sort_values("EXPID")

Unnamed: 0_level_0,RELEASE_DATE,EXPID,PREFIX,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
NSC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
649468,3,3,3,3,3,3,3,3,3,3,3,3,3
617600,4,4,4,4,4,4,4,4,4,4,4,4,4
722655,6,6,6,6,6,6,6,6,6,6,6,6,6
655471,6,6,6,6,6,6,6,6,6,6,6,6,6
667648,7,7,7,7,7,7,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249992,7068,7068,7068,7068,7068,7068,7068,7068,7068,7068,7068,7068,7068
6396,7177,7177,7177,7177,7177,7177,7177,7177,7177,7177,7177,7177,7177
119875,7381,7381,7381,7381,7381,7381,7381,7381,7381,7381,7381,7381,7381
19893,101978,101978,101978,101978,101978,101978,101978,101978,101978,101978,101978,101978,101978


In [7]:
gi50.loc[(gi50.NSC == 123127)].CELL_NAME.value_counts().sort_values()

WI-38             1
A-CREB 2          1
VDSO/CMV-8        1
MDA-MB-435S       1
CCD-19LU          1
               ... 
U251           2471
HCT-116        2475
SW-620         2482
MDA-MB-435     2486
HCT-15         2488
Name: CELL_NAME, Length: 105, dtype: int64

77% of all chemicals tested on HCT-15 are only applied in 1 experiment, 99% only in 5 or less.
This makes statics a bit hard

In [17]:
gi50.loc[gi50["CELL_NAME"] == "HCT-15"].groupby("NSC").count()["EXPID"].sort_values()
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(x=gi50.loc[gi50["CELL_NAME"] == "HCT-15"].groupby("NSC").count()["EXPID"], name="Number repeat experiments", histnorm='percent', cumulative_enabled=True))

fig.update_layout(
    title_text='Number of times the same NSC is used on HCT-15', # title of plot
    xaxis_title_text='Number of repeats', # xaxis label
    yaxis_title_text='Number of Experiments', # yaxis label
)
fig.update_xaxes(range=[0, 50])
fig.show()

# Correlating chemicals

In [9]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(x=gi50.loc[(gi50.NSC == 123127) & (gi50.CELL_NAME == "HCT-15")].AVERAGE, name="NSC 123127"))
fig.add_trace(go.Histogram(x=gi50.loc[(gi50.NSC == 19893) & (gi50.CELL_NAME == "HCT-15")].AVERAGE, name="NSC 19893"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='GI50 on HCT-15', # title of plot
    xaxis_title_text='Concentration for GI50 (log)', # xaxis label
    yaxis_title_text='Count', # yaxis label
)
fig.show()

In [10]:
big_two_hct15 = gi50.loc[((gi50.NSC == 123127) | (gi50.NSC == 19893)) & (gi50.CELL_NAME == "HCT-15")]
big_two_hct15.head()
big_two_matched = pd.pivot_table(big_two_hct15, values='AVERAGE', index=['EXPID'],
                    columns=["NSC"], aggfunc=np.average, fill_value=None)
# big_two_matched = big_two_matched.dropna()
print(big_two_matched.shape)
big_two_matched.head()

(2884, 2)


NSC,19893,123127
EXPID,Unnamed: 1_level_1,Unnamed: 2_level_1
0001MD02,-5.0484,-6.0349
0001MD03,-5.0328,
0001MD05,-5.1225,-5.8473
0001MD07,-5.1803,-5.9424
0001MD08,-4.926,-6.2542


In [11]:
# Add release data
release_dates = big_two_hct15[["RELEASE_DATE", "EXPID"]].set_index("EXPID")
full_table = pd.merge(big_two_matched, release_dates, on="EXPID")
full_table

Unnamed: 0_level_0,19893,123127,RELEASE_DATE
EXPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001MD02,-5.0484,-6.0349,20210223
0001MD02,-5.0484,-6.0349,20210223
0001MD03,-5.0328,,20210223
0001MD05,-5.1225,-5.8473,20210223
0001MD05,-5.1225,-5.8473,20210223
...,...,...,...
9911RS90,-4.7096,-5.3659,20210223
9911RS92,,-6.4994,20210223
9912MD98,-5.1100,-5.9558,20210223
9912MD98,-5.1100,-5.9558,20210223


In [12]:
full_table["RELEASE_DATE"].value_counts()

20210223    4267
20211008      23
20220119       8
20210319       2
Name: RELEASE_DATE, dtype: int64

In [13]:
full_table = full_table.dropna()

In [14]:
full_table["RELEASE_DATE"].value_counts()

20210223    2832
Name: RELEASE_DATE, dtype: int64

In [15]:
from scipy import stats

(r, p) = stats.pearsonr(full_table[19893], full_table[123127])

print(f"The correlation between NSC19893 and NSC123127 is {r:.5f} with a p-value of {p}")


The correlation between NSC19893 and NSC123127 is 0.19037 with a p-value of 1.6112905358796105e-24


In [16]:
import plotly.express as px
fig = px.scatter(full_table, x=19893, y=123127, color="RELEASE_DATE", hover_name=full_table.index)
fig.show()