# Nearest neighbors from prospective cohorts
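We applied matchmaking to the prospective clinical trial (I-PREDICT). In this notebook, we look at the results; specifically, for each case we identify its nearest-neighbor cell line and list the therapies to which that cell line is sensitive (GDSC z-score ≤ -2).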

In [2]:
import glob
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../common/')
import settings
settings.set_arial_as_font()

from settings import Colors
tableau10 = Colors.tableau10

%matplotlib inline

In [3]:
# Collect every per-case matchmaker output. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the concatenation (and any
# tie-breaking below) reproducible between runs and machines.
handles = sorted(glob.glob('../prospective-trial/almanac_outputs/*/*.matchmaker.txt'))
if not handles:
    raise FileNotFoundError(
        'No *.matchmaker.txt files found under ../prospective-trial/almanac_outputs/'
    )

summary = pd.read_csv('formatted/cell-lines.summary.txt', sep='\t')
zscores = pd.read_csv('formatted/sanger.gdsc.txt', sep='\t')

df = pd.concat([pd.read_csv(handle, sep='\t', index_col=0) for handle in handles])
# Drop rows whose comparison is the literal 'case-profile'
# (presumably a case compared against itself -- TODO confirm).
df = df[~df['comparison'].eq('case-profile')]
# Ascending sort puts the smallest 'SNF: FDA & CGC' value first; the first row
# per case is treated as its nearest neighbor below. A stable sort keeps tied
# rows in file order instead of an arbitrary order.
df = df.sort_values('SNF: FDA & CGC', ascending=True, kind='mergesort').reset_index()
df['case'] = df['case'].astype(str)

# Lookup tables from Broad / Sanger identifiers to CCLE names; identifiers
# without a CCLE name are left out of the maps.
broad_to_ccle_map = summary.set_index('broad')['ccle_name'].dropna().to_dict()
sanger_to_ccle_map = summary.set_index('sanger')['ccle_name'].dropna().to_dict()

# Model ids missing from the map are kept unchanged by `replace`.
zscores['ccle_name'] = zscores['model_id'].replace(sanger_to_ccle_map)

# First (top-ranked) comparison for each case, ordered by case label.
# drop_duplicates(keep='first') keeps the top row of each case in the sorted
# frame, replacing the row-by-row fill of an untyped empty Series.
nearest_rows = df.drop_duplicates(subset='case', keep='first').sort_values('case')
first_neighbors = pd.DataFrame({
    'case': nearest_rows['case'].to_numpy(),
    'nearest neighbor': nearest_rows['comparison'].replace(broad_to_ccle_map).to_numpy(),
})




In [4]:
# A therapy counts as "sensitive" for a cell line when its z-score is at or
# below this threshold.
SENSITIVITY_Z_THRESHOLD = -2

# Filter and group the drug-response table once instead of rescanning it for
# every case. groupby keeps the original row order within each cell line and
# drops rows without a CCLE name.
sensitive = zscores.loc[
    zscores['z_score'].le(SENSITIVITY_Z_THRESHOLD),
    ['ccle_name', 'therapy_name'],
]
therapies_by_cell_line = (
    sensitive
    .groupby('ccle_name', sort=False)['therapy_name']
    .agg(list)
    .to_dict()
)

# Cell lines with no sensitive therapy (or no drug-response data) get an
# empty list, i.e. an empty string and a count of 0.
therapy_lists = [
    therapies_by_cell_line.get(cell_line, [])
    for cell_line in first_neighbors['nearest neighbor']
]
first_neighbors['sensitive therapies'] = pd.Series(
    [', '.join(therapies) for therapies in therapy_lists],
    index=first_neighbors.index,
    dtype=object,
)
# Stored as float64 to match the dtype the previous row-by-row assignment
# produced, so the exported table and downstream summaries are unchanged.
first_neighbors['n'] = pd.Series(
    [len(therapies) for therapies in therapy_lists],
    index=first_neighbors.index,
    dtype=float,
)

In [5]:
# How many cases have each number of sensitive therapies for their nearest neighbor.
first_neighbors['n'].value_counts()

2.0     19
1.0     17
0.0     14
6.0      8
5.0      5
8.0      4
4.0      4
13.0     4
11.0     2
16.0     2
7.0      1
12.0     1
9.0      1
3.0      1
Name: n, dtype: int64

In [6]:
# Total number of sensitive therapies summed over all cases.
first_neighbors['n'].sum()

313.0

In [7]:
# Summary statistics of the per-case sensitive-therapy count.
first_neighbors['n'].describe()

count    83.000000
mean      3.771084
std       4.070493
min       0.000000
25%       1.000000
50%       2.000000
75%       6.000000
max      16.000000
Name: n, dtype: float64

In [14]:
# Case labels are kept as strings, so numeric-looking IDs (e.g. 101) and
# alphanumeric IDs (e.g. A041) are ordered together lexicographically.
first_neighbors['case'] = first_neighbors['case'].astype(str)
first_neighbors.sort_values(by='case').to_csv(
    'tables/ipredict/ipredict.nearest_neighbors.txt',
    sep='\t',
    index=False,
)

In [15]:
# Show the per-case table (pandas truncates the middle rows of long frames when rendering).
first_neighbors

Unnamed: 0,case,nearest neighbor,sensitive therapies,n
0,101,U118MG_CENTRAL_NERVOUS_SYSTEM,"Fludarabine, Rucaparib",2.0
1,102,A375_SKIN,"SB590885, Dabrafenib, LIMK1 inhibitor BMS4, (5...",13.0
2,105,CAL29_URINARY_TRACT,"AZD8835, PI3Ka_4409, Taselisib, Alpelisib",4.0
3,112,A549_LUNG,"Tanespimycin, BPTES",2.0
4,115,SW948_LARGE_INTESTINE,Bicalutamide,1.0
...,...,...,...,...
78,A041,MHHNB11_AUTONOMIC_GANGLIA,CX-5461,1.0
79,A042,HT144_SKIN,"SB590885, Dabrafenib, AZ628, PLX-4720, Voxtali...",13.0
80,A043,NCIH1792_LUNG,"UNC0642, Avagacestat, CHIR-99021, FMK",4.0
81,A044,GMS10_CENTRAL_NERVOUS_SYSTEM,"IAP_5620, AZD5582, LCL161, MCT4_1422, IAP_7638...",6.0
