# Nearest neighbors from prospective cohorts
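We applied matchmaking to the prospective clinical trial (I-PREDICT). In this notebook, we look at the results; specifically, for each case we identify its nearest-neighbor cell line, list the therapies to which that cell line is sensitive (GDSC z-score of -2 or lower), and summarize how many such therapies there are per case.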

In [1]:
import glob
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../common/')
import settings
settings.set_arial_as_font()

from settings import Colors
tableau10 = Colors.tableau10

%matplotlib inline

In [2]:
# Matchmaker result tables, located one directory below almanac_outputs/.
handles = glob.glob('../prospective-trial/almanac_outputs/*/*.matchmaker.txt')
if not handles:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError('No *.matchmaker.txt files found under ../prospective-trial/almanac_outputs/')

summary = pd.read_csv('formatted/cell-lines.summary.txt', sep='\t')
zscores = pd.read_csv('formatted/sanger.gdsc.txt', sep='\t')

# Stack all matchmaker tables, drop the rows whose comparison is the case profile itself,
# and order rows by 'SNF: FDA & CGC' ascending so that the first row seen for a case is
# the one with the smallest value (presumably the most similar profile -- confirm).
# reset_index() moves the index read via index_col=0 back into a regular column.
df = pd.concat([pd.read_csv(handle, sep='\t', index_col=0) for handle in handles])
df = df[~df['comparison'].eq('case-profile')]
df = df.sort_values(['SNF: FDA & CGC'], ascending=True).reset_index()
df['case'] = df['case'].astype(str)

# Lookup tables from Broad / Sanger identifiers to CCLE names; rows without a CCLE name are skipped.
broad_to_ccle_map = summary.set_index('broad')['ccle_name'].dropna().to_dict()
sanger_to_ccle_map = summary.set_index('sanger')['ccle_name'].dropna().to_dict()

# Model ids with no entry in the map are left unchanged by replace().
zscores['ccle_name'] = zscores['model_id'].replace(sanger_to_ccle_map)

# One row per case: the first (smallest 'SNF: FDA & CGC') comparison, ordered by case label.
# Selecting rows directly avoids pre-allocating an untyped Series and filling it with
# strings label by label, which newer pandas flags as an incompatible-dtype assignment.
first_neighbors = (
    df.drop_duplicates(subset='case', keep='first')
    .loc[:, ['case', 'comparison']]
    .sort_values('case')
    .rename(columns={'comparison': 'nearest neighbor'})
    .reset_index(drop=True)
)
first_neighbors['nearest neighbor'] = first_neighbors['nearest neighbor'].replace(broad_to_ccle_map)





In [3]:
# A therapy is counted as "sensitive" for a cell line when its z-score is at or below this cutoff.
sensitivity_cutoff = -2

# Filter the z-score table once and group it by cell line, instead of re-scanning the
# whole table for every case. Within each cell line the therapies keep their table order.
sensitive_rows = zscores[zscores['z_score'].le(sensitivity_cutoff)]
therapies_by_cell_line = {
    cell_line: names.tolist()
    for cell_line, names in sensitive_rows.groupby('ccle_name', sort=False)['therapy_name']
}

# Cell lines with no sensitive therapy get an empty list (empty string and a count of 0).
matched_therapies = [
    therapies_by_cell_line.get(cell_line, [])
    for cell_line in first_neighbors['nearest neighbor']
]

# Assign whole columns at once so they exist even when first_neighbors has no rows.
first_neighbors['sensitive therapies'] = pd.Series(
    [', '.join(found) for found in matched_therapies],
    index=first_neighbors.index,
    dtype=object,
)
# Kept as float so the saved table and downstream summaries are unchanged.
first_neighbors['n'] = pd.Series(
    [len(found) for found in matched_therapies],
    index=first_neighbors.index,
    dtype=float,
)

In [4]:
# How many rows (cases) share each value of 'n', the number of sensitive therapies.
first_neighbors['n'].value_counts()

2.0     22
0.0     19
1.0     12
5.0      6
6.0      5
4.0      4
16.0     3
8.0      3
14.0     3
11.0     2
3.0      2
9.0      2
Name: n, dtype: int64

In [5]:
# Sum of 'n' over all rows of first_neighbors.
first_neighbors['n'].sum()

292.0

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of 'n'.
first_neighbors['n'].describe()

count    83.000000
mean      3.518072
std       4.185811
min       0.000000
25%       1.000000
50%       2.000000
75%       5.000000
max      16.000000
Name: n, dtype: float64

In [7]:
import json

# Path, relative to the notebook's working directory, of the JSON mapping file loaded below.
# Presumably maps therapy names between the almanac and GDSC -- confirm against the file.
handle = 'almanac-gdsc-mappings.json'
def read_json(handle):
    """Load a JSON file and return its parsed contents.

    Parameters
    ----------
    handle : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (dict, list, str, number, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform's default encoding.
    with open(handle, 'r', encoding='utf-8') as json_handle:
        return json.load(json_handle)
# Parsed contents of the mapping file; not referenced again in the cells shown here.
almanac_map = read_json(handle)

In [8]:
# Save the nearest-neighbor table as tab-separated text. The row index is written too
# (to_csv default), so the file's first column has an empty header.
first_neighbors.to_csv('tables/ipredict/ipredict.nearest_neighbors.unannotated.txt', sep='\t')

In [9]:
# Display the table: case, nearest neighbor, its sensitive therapies, and their count 'n'.
first_neighbors

Unnamed: 0,case,nearest neighbor,sensitive therapies,n
0,101,U118MG_CENTRAL_NERVOUS_SYSTEM,"Rucaparib, Fludarabine",2.0
1,102,A375_SKIN,"AS605240, Pilaralisib, LIMK1 inhibitor BMS4, R...",14.0
2,105,CAL29_URINARY_TRACT,"AZD8835, PI3Ka_4409, Alpelisib, Taselisib",4.0
3,112,A549_LUNG,"Tanespimycin, BPTES",2.0
4,115,CL40_LARGE_INTESTINE,,0.0
...,...,...,...,...
78,A041,MHHNB11_AUTONOMIC_GANGLIA,CX-5461,1.0
79,A042,SKHEP1_LIVER,"SN-38, AGI-5198",2.0
80,A043,SW837_LARGE_INTESTINE,,0.0
81,A044,GMS10_CENTRAL_NERVOUS_SYSTEM,"IAP_7638, MCT4_1422, IAP_5620, LCL161, AZD5582...",6.0


In [10]:
# Replace the in-memory table with the annotated version read from disk. No cell shown
# here writes this file; compared with the unannotated table it carries two extra
# columns, 'moalmanac_n_overlap' and 'ipredict_n_overlap'.
# NOTE(review): presumably the annotation is produced outside this notebook -- confirm.
annotated_table = pd.read_csv(
    'tables/ipredict/ipredict.nearest_neighbors.annotated.txt',
    sep='\t',
    index_col=0,
)
# Discard the index stored in the file and renumber the rows from zero.
first_neighbors = annotated_table.reset_index(drop=True)
first_neighbors

Unnamed: 0,case,nearest neighbor,sensitive therapies,n,moalmanac_n_overlap,ipredict_n_overlap
0,157,A204_SOFT_TISSUE,"Ponatinib, Pazopanib, TL-2-105, Cabozantinib, ...",16,0,0
1,66,A204_SOFT_TISSUE,"Ponatinib, Pazopanib, TL-2-105, Cabozantinib, ...",16,0,0
2,A009,A204_SOFT_TISSUE,"Ponatinib, Pazopanib, TL-2-105, Cabozantinib, ...",16,0,0
3,102,A375_SKIN,"AS605240, Pilaralisib, LIMK1 inhibitor BMS4, R...",14,0,0
4,119,A375_SKIN,"AS605240, Pilaralisib, LIMK1 inhibitor BMS4, R...",14,0,0
...,...,...,...,...,...,...
78,A043,SW837_LARGE_INTESTINE,,0,0,0
79,26,T24_URINARY_TRACT,,0,0,0
80,A018,T24_URINARY_TRACT,,0,0,0
81,101,U118MG_CENTRAL_NERVOUS_SYSTEM,"Rucaparib, Fludarabine",2,0,0


In [11]:
# For each overlap column, store 'n' minus the overlap count (cast to int) in a delta column.
for delta_column, overlap_column in (
    ('delta_moalmanac', 'moalmanac_n_overlap'),
    ('delta_ipredict', 'ipredict_n_overlap'),
):
    overlap_counts = first_neighbors[overlap_column].astype(int)
    first_neighbors[delta_column] = first_neighbors['n'].subtract(overlap_counts)

In [12]:
# Summary statistics of 'n' minus 'moalmanac_n_overlap'.
first_neighbors['delta_moalmanac'].describe()

count    83.000000
mean      3.397590
std       4.169985
min       0.000000
25%       1.000000
50%       2.000000
75%       5.000000
max      16.000000
Name: delta_moalmanac, dtype: float64

In [13]:
# Summary statistics of 'n' minus 'ipredict_n_overlap'.
first_neighbors['delta_ipredict'].describe()

count    83.000000
mean      3.469880
std       4.162367
min       0.000000
25%       1.000000
50%       2.000000
75%       5.000000
max      16.000000
Name: delta_ipredict, dtype: float64

In [14]:
# Summary statistics of 'n', for comparison with the two delta columns.
first_neighbors['n'].describe()

count    83.000000
mean      3.518072
std       4.185811
min       0.000000
25%       1.000000
50%       2.000000
75%       5.000000
max      16.000000
Name: n, dtype: float64

In [15]:
# Median of 'n' (the same value as the 50% row of describe()).
first_neighbors['n'].median()

2.0

In [16]:
# Count rows whose 'moalmanac_n_overlap' is zero (True) versus non-zero (False).
first_neighbors['moalmanac_n_overlap'].eq(0).value_counts()

True     79
False     4
Name: moalmanac_n_overlap, dtype: int64

In [17]:
# Count rows whose 'ipredict_n_overlap' is zero (True) versus non-zero (False).
first_neighbors['ipredict_n_overlap'].eq(0).value_counts()

True     81
False     2
Name: ipredict_n_overlap, dtype: int64

In [26]:
# Rows of the annotated therapy table that have a non-null 'almanac-evidence-strongest'
# value, counted per patient; report the median and the mean of those per-patient counts.
therapies = pd.read_csv('../prospective-trial/therapies.annotated.txt', sep='\t')
has_almanac_evidence = therapies['almanac-evidence-strongest'].notnull()
rows_per_patient = therapies[has_almanac_evidence]['patient_id'].value_counts()
print(rows_per_patient.median())
print(rows_per_patient.mean())

2.0
2.32
