## Imports & Helpers

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

from checkmate_imports import *

# global variables
# NOTE(review): none of these thresholds is referenced again in the visible part of this notebook.
HUE_ORDER = ['stroma','pred_g2','intermediate_grade','pred_g4']
MIN_SEGMENT_SIZE = 50
GRADE_DIFF_THRESH = 0.35
TUMOR_DIFF_THRESH = 0.35
MIN_TUMOR_SEG_MEAN = 0.70
# invert_rag_weight is not defined in this notebook; presumably it comes from the checkmate_imports star import
NODE_DIFF_CUTOFF = invert_rag_weight(GRADE_DIFF_THRESH)
# presumably the number of tiles per mm^2 for tiles with a 0.256 mm edge -- TODO confirm the tile size
TILES_PER_MM2 = 0.256**-2

MIN_TIL_COUNT = 10
TIL_ISO_CUTOFF = 14  # based on none vs any AUROC bootstrap on high grade foci + no hard cases
TIL_HIGH_CUTOFF = 48 # based on not-high vs high AUROC bootstrap on high grade foci + no hard cases
FRAC_CUTOFF = 0.25
TIL_AREA_CUTOFF = 10

# assume 7x7 minimum case for a square area focus
# shrinking by one tile on every side (2 tiles per dimension) leaves a 5x5 inner square, i.e. an area cutoff of 25
# MIN_CENTER_AREA = 25
MIN_CENTER_AREA = 10  # relaxing from 25 to try to recover possible interesting foci

In [2]:
# set minimum # tiles to consider in analysis:
# run_category_annotation only labels a slide 'homogeneous'/'heterogeneous' when its
# non-stroma tile count is at least this value (other slides are labelled 'other')
MIN_TILES = 200

def get_anno(df):
    """Return one row per unique index label, keeping the first occurrence.

    Used to collapse a tile-level table (many rows per slide) into one
    annotation row per slide.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index may contain repeated labels.

    Returns
    -------
    pandas.DataFrame
        New frame holding the first row of every distinct index label, in the
        original order. The index (including its name) is preserved.
    """
    # Filtering on the index directly also works when the index has no name or
    # is a MultiIndex; the reset/drop/set round trip required a named index.
    return df.loc[~df.index.duplicated(keep='first')]

def run_category_annotation(results, homog_cutoff=0.7, tumor_cutoff=0.5, g2_cutoff=0.1, g4_cutoff=0.1, grade_var='prob_g4_not_g2', min_tiles=None):
    """Summarise tile categories per slide and label each slide's grade heterogeneity.

    Parameters
    ----------
    results : pandas.DataFrame
        Tile-level table indexed by ``unique_id`` (one row per tile) with a
        ``meta`` column holding the tile category (``'stroma'``, ``'pred_g2'``,
        ``'intermediate_grade'``, ``'pred_g4'``) and a ``grade_var`` column.
    homog_cutoff : float
        A slide is 'homogeneous' when its most frequent non-stroma category
        reaches this fraction of its non-stroma tiles, otherwise 'heterogeneous'.
    tumor_cutoff : float
        Threshold applied to the mean ``grade_var`` over non-stroma tiles to
        fill ``nonstroma_grade_mean_above_050``.
    g2_cutoff, g4_cutoff : float
        Minimum non-stroma frequency of ``pred_g2`` / ``pred_g4`` tiles for a
        heterogeneous slide to be labelled 'heterogeneous_mincount'.
    grade_var : str
        Name of the grade-probability column to average.
    min_tiles : int, optional
        Minimum number of non-stroma tiles required before a slide is labelled
        homogeneous/heterogeneous. Defaults to the module-level ``MIN_TILES``.

    Returns
    -------
    pandas.DataFrame
        One row per ``unique_id`` with the per-category non-stroma frequencies
        (``<category>_nonstroma_freq``), ``max_category_freq``,
        ``min_category_freq``, ``tile_count``, ``tumor_tile_count``,
        ``nonstroma_grade_mean``, ``candidate_category``,
        ``nonstroma_grade_mean_above_050`` and ``candidate_category_ext``.

    Side effects
    ------------
    As before, ``results`` is modified in place: the columns
    ``max_category_freq``, ``min_category_freq`` and an all-NaN
    ``candidate_category`` are written onto it.
    """
    if min_tiles is None:
        min_tiles = MIN_TILES

    print(f'assigning categories based on {grade_var}')

    # number of tiles per (slide, category); size() counts rows regardless of
    # missing values in whichever column happens to come first
    category_count = results.groupby(['unique_id', 'meta']).size()
    category_count.name = 'category_count'

    # share of every non-stroma category among the slide's non-stroma tiles.
    # A boolean filter (instead of drop) tolerates inputs without stroma tiles,
    # and transform('sum') keeps the original index on every pandas version.
    is_tumor_category = category_count.index.get_level_values('meta') != 'stroma'
    nonstroma_count = category_count[is_tumor_category]
    nonstroma_category_freq = nonstroma_count / nonstroma_count.groupby('unique_id').transform('sum')
    nonstroma_category_freq.name = 'nonstroma_category_freq'

    # wide layout: one '<category>_nonstroma_freq' column per category
    nonstroma_category_freq_pivot = nonstroma_category_freq.unstack('meta')
    nonstroma_category_freq_pivot.columns = [x + '_nonstroma_freq' for x in nonstroma_category_freq_pivot.columns]
    nonstroma_category_freq_pivot = nonstroma_category_freq_pivot.fillna(0)
    # the extended labels below need both grade columns even if a category never occurs
    for required_col in ('pred_g2_nonstroma_freq', 'pred_g4_nonstroma_freq'):
        if required_col not in nonstroma_category_freq_pivot.columns:
            nonstroma_category_freq_pivot[required_col] = 0.0

    freq_by_slide = nonstroma_category_freq.groupby('unique_id')
    max_category_freq = freq_by_slide.max()
    max_category_freq.name = 'max_category_freq'
    min_category_freq = freq_by_slide.min()
    min_category_freq.name = 'min_category_freq'

    # legacy in-place annotation of the tile table (kept for backward compatibility)
    results['max_category_freq'] = max_category_freq
    results['min_category_freq'] = min_category_freq
    results['candidate_category'] = np.nan

    is_tumor_tile = results['meta'] != 'stroma'

    nonstroma_grade_mean = results.loc[is_tumor_tile, grade_var].groupby('unique_id').mean()
    nonstroma_grade_mean.name = 'nonstroma_grade_mean'

    tumor_tile_count = results.loc[is_tumor_tile].groupby('unique_id').size()
    tumor_tile_count.name = 'tumor_tile_count'

    tile_count = results.groupby('unique_id').size()
    tile_count.name = 'tile_count'

    #################
    # axis must be passed by keyword: positional use is rejected by pandas >= 2.0
    metrics = pd.concat(
        [nonstroma_category_freq_pivot, max_category_freq, min_category_freq,
         tile_count, tumor_tile_count, nonstroma_grade_mean],
        axis=1,
    )

    enough_tumor = metrics['tumor_tile_count'] >= min_tiles
    is_heterogeneous = (metrics['max_category_freq'] < homog_cutoff) & enough_tumor
    is_homogeneous = (metrics['max_category_freq'] >= homog_cutoff) & enough_tumor

    # slides with too few tumor tiles (or no tumor tiles at all) stay 'other'
    metrics['candidate_category'] = 'other'
    metrics.loc[is_heterogeneous, 'candidate_category'] = 'heterogeneous'
    metrics.loc[is_homogeneous, 'candidate_category'] = 'homogeneous'
    metrics['nonstroma_grade_mean_above_050'] = metrics['nonstroma_grade_mean'] > tumor_cutoff

    # add additional subset to candidate_category
    has_both_grades = (metrics['pred_g2_nonstroma_freq'] > g2_cutoff) & (metrics['pred_g4_nonstroma_freq'] > g4_cutoff)

    metrics['candidate_category_ext'] = 'other'
    metrics.loc[is_heterogeneous & has_both_grades, 'candidate_category_ext'] = 'heterogeneous_mincount'
    metrics.loc[is_heterogeneous & ~has_both_grades, 'candidate_category_ext'] = 'heterogeneous_other'
    metrics.loc[is_homogeneous, 'candidate_category_ext'] = 'homogeneous'

    return metrics

#### Profile ccRCC Info 

In [3]:
# Profile ccRCC clinical annotations, indexed by the integer sample id
profile_anno = pd.read_csv('/home/jupyter/profile_rcc_data_anno_updated_20200114.csv')
profile_anno = profile_anno.rename(columns={'UNIQUE_SAMPLE_ID': 'unique_id'})
profile_anno['unique_id'] = profile_anno['unique_id'].values.astype(float).astype(int)
profile_anno = profile_anno.set_index('unique_id')

# Fuhrman grade as 'G<n>' labels, 'GX' where no grade is recorded
profile_anno['grade'] = profile_anno['fuhrman_grade'].map(
    lambda value: 'GX' if pd.isna(value) else 'G' + str(int(value))
)

# numeric stage code -> label, 'no_anno' for anything unmapped or missing
stage_map = {1.:'Stage I', 2.:'Stage II', 3.:'Stage III', 4.: 'Stage IV'}
raw_stage = profile_anno['stage_at_diagnosis_pathology_preferred_otherwise_clinica']
profile_anno['stage'] = raw_stage.map(stage_map).fillna('no_anno')

# explicitly add cohort name to prevent int ambiguity
profile_anno.index = 'profile_' + profile_anno.index.astype(str)


# tile-level ensemble inference for Profile, re-keyed from slide/file id to unique_id
profile_inf_tilemean = pd.read_csv('/home/jupyter/checkmate-histo/20220121_rerun_all_profile_ccrcc_with_20210312_profile_models_g24_ENSEMBLE.csv')
file_to_unique_id = profile_anno.reset_index().set_index('file_id')['unique_id']
profile_inf_tilemean = profile_inf_tilemean.set_index('slide_id').join(file_to_unique_id).set_index('unique_id')

# profile_assignments = pd.read_csv('/home/jupyter/20210119_profile_ccrcc_clustering_label_agg.csv', index_col=0)
# profile_assignments.index = 'profile_'+ profile_assignments.index.astype(int).astype(str)

#### KIRC Info (20210405 full tile 4-fold inference)

In [4]:
# KIRC tile-level inference; presumably one row per (slide, tile, model fold) -- the pivot below spreads model_id into columns
kirc_inf = pd.read_pickle('/mnt/disks/bms/20210405_kirc_inference_using_profile_g2_g4_models_all_slides.pkl')
kirc_inf = kirc_inf.set_index('slide_id')
kirc_inf_pivot = kirc_inf.pivot_table(index=['slide_id','x','y','prob_tumor'], columns='model_id',values='class_1_model_prob').sort_index()
print('Correlation between each of 4 folds preditions')
# sanity check on a random subset of tiles; capped at the table size so small inputs do not raise
print(np.corrcoef(kirc_inf_pivot.sample(min(10000, len(kirc_inf_pivot))).T))

# grab 4-fold inference p(G4 not G2), and p(Tumor)
# numeric_only=True: the table also carries string columns (e.g. model_id), which
# pandas >= 2.0 refuses to average instead of silently dropping them
kirc_inf_tilemean = kirc_inf.groupby(['slide_id','x','y']).mean(numeric_only=True)
kirc_inf_tilemean = reset_set_idx(kirc_inf_tilemean, 'slide_id')
# overwrite/add a slide_id column holding the first three '-'-separated fields of the slide id
kirc_inf_tilemean['slide_id'] = kirc_inf_tilemean.reset_index()['slide_id'].apply(lambda x: '-'.join(x.split('-')[:3])).values

Correlation between each of 4 folds preditions
[[1.         0.83721559 0.82491198 0.87629731]
 [0.83721559 1.         0.76440673 0.84271355]
 [0.82491198 0.76440673 1.         0.78926071]
 [0.87629731 0.84271355 0.78926071 1.        ]]


In [5]:
# re-key the fold-level KIRC inference by the first three '-'-separated fields of slide_id
kirc_inf = kirc_inf.reset_index()
kirc_inf['tcga_id'] = kirc_inf['slide_id'].apply(lambda x: '-'.join(x.split('-')[:3]))
kirc_inf = kirc_inf.set_index('tcga_id')

kirc_scores = pd.read_csv('/home/jupyter/20210308_kirc_wgii_ith_scores.csv')

kirc_anno = pd.read_csv('/home/jupyter/kirc_anno_with_angio_scores.csv').set_index('Unnamed: 0')
print(kirc_anno.shape)
# keep the first row per bcr_patient_uuid; the shapes before/after are printed for reference
kirc_anno = kirc_anno.drop_duplicates('bcr_patient_uuid')
print(kirc_anno.shape)

# harmonize index/column names with the other cohorts
kirc_anno.index.name = 'tcga_id'
kirc_anno['grade'] = kirc_anno.neoplasm_histologic_grade
kirc_anno['stage'] = kirc_anno.pathologic_stage

# pan-cancer survival table restricted to the KIRC rows, indexed by bcr_patient_barcode
pancan_survival = pd.read_excel('/home/jupyter/TCGA-CDR-SupplementalTableS1.xlsx')
kirc_survival = pancan_survival.loc[pancan_survival['type'] == 'KIRC']
kirc_survival = kirc_survival.set_index('bcr_patient_barcode')

(447, 752)
(406, 752)


### CM-025 Info

##### Load in updated inference (4-folds of models from 20210312 profile ccRCC) on CM-025
- Use the already averaged version (if needed, the full inference set is `20210427_tvnt_and_g2_g4_inference_profile_ccrcc_training_manual_cm025_tiles_INFERENCE_AGG_ALL_MODELS.pkl`)

In [6]:
# (legacy) per-fold loading and fold-correlation check, superseded by the pre-averaged pickle below
# cm025_inf = pd.read_csv('/mnt/disks/bms/20210316_cm025_inference_using_profile_g2_g4_models_ALL_TILES.csv')
# cm025_inf = cm025_inf.set_index('slide_id')
# cm025_inf_pivot = cm025_inf.pivot_table(index=['slide_id','x','y','prob_tumor'], columns='model_id',values='class_1_model_prob').sort_index()
# print('Correlation between each of 4 folds preditions')
# print(np.corrcoef(cm025_inf_pivot.sample(10000).T))

# grab 4-fold inference p(G4 not G2), and p(Tumor)
cm025_inf_tilemean = pd.read_pickle('/mnt/disks/manual_bms_tiles/20210427_tvnt_and_g2_g4_inference_profile_ccrcc_training_manual_cm025_tiles_INFERENCE_AGG_ENSEMBLEMEAN.pkl')
# cm025_inf_tilemean = cm025_inf.groupby(['slide_id','x','y']).mean()
# reset_set_idx is not defined in this notebook (presumably from checkmate_imports); it appears to re-index by the given column
cm025_inf_tilemean = reset_set_idx(cm025_inf_tilemean, 'subjid')
# prefix with the cohort name so ids cannot collide with other cohorts
cm025_inf_tilemean.index = 'cm025_' + cm025_inf_tilemean.index.astype(str)

In [7]:
# grab harmonized ITH calculation from `20210305_revisiting_kirc_ith_score` NB
bms_ith_recalc = pd.read_csv('/home/jupyter/20210308_bms_all_cohorts_ith_score_describe_ccf_mod.csv')
# (legacy annotation source, replaced by the merged Braun supplement annotations below)
# cm025_anno = pd.read_csv('./cm025_manifest_with_braun2020_annotations.csv')
# CM-025 annotations, indexed by subject id
cm025_anno = pd.read_csv('/mnt/disks/manual_bms_tiles/manual_cm025_merged_braunsupp_annotations.csv').set_index('subjid')

#### Add updated annotation set from Thomas/Sabina lab


In [8]:
# pathologist-reviewed grade / histotype / necrosis sheet for CM-025
manual_bms_grade_sr_anno = pd.read_excel('/home/jupyter/Copy of CA209-025 H&E grade, histotype,necrosis reviewed with SS +AC 5.30.20.xlsx')
# subject id = last space-separated token of 'Patient ID' with leading zeros removed
manual_bms_grade_sr_anno['subjid'] = manual_bms_grade_sr_anno['Patient ID'].apply(lambda x: x.split(' ')[-1].lstrip('0'))
# same cohort-prefixed id scheme as the other CM-025 tables
manual_bms_grade_sr_anno['unique_id'] = 'cm025_'+manual_bms_grade_sr_anno['subjid'].astype(str)

print('dropping cases without image file annotated')
# keep only rows that have an 'Image Location' entry
manual_bms_grade_sr_anno = manual_bms_grade_sr_anno.loc[~manual_bms_grade_sr_anno['Image Location'].isna()]

dropping cases without image file annotated


In [9]:
# slide id = file name of the backslash-separated image path without its '.svs' extension.
# The suffix is sliced off explicitly: str.strip('.svs') treats its argument as a set of
# characters and would also eat leading/trailing '.', 's' and 'v' characters of the name itself.
# NOTE(review): confirm the slide ids in the tile-path table joined later were derived the same way.
manual_bms_grade_sr_anno['slide_id'] = manual_bms_grade_sr_anno['Image Location'].apply(
    lambda x: x.split('\\')[-1][:-len('.svs')] if x.endswith('.svs') else x.split('\\')[-1]
)

In [10]:
# cohort-prefixed id built from subjid (the current index of cm025_anno)
subjid_as_text = cm025_anno.reset_index()['subjid'].astype(str).values
cm025_anno['unique_id'] = 'cm025_' + subjid_as_text

In [11]:
# reset_set_idx is not defined in this notebook (presumably from checkmate_imports);
# it appears to reset the index and re-index by the given column -- here 'unique_id'
cm025_anno = reset_set_idx(cm025_anno,'unique_id')

#### May want to incorporate the grade percentage estimates and S/R estimates later

In [12]:
# first space-separated token of the reviewed grade; 'n.a.' and missing entries become the text 'nan'
reviewed_grade = manual_bms_grade_sr_anno['Grade MSA+MF+AC'].replace('n.a.', np.nan)
leading_token = reviewed_grade.map(lambda value: str(value).split(' ')[0])
manual_bms_grade_sr_anno['grade'] = leading_token
# numeric token -> 'G<n>', missing -> 'GX'
manual_bms_grade_sr_anno['grade'] = leading_token.map(
    lambda token: 'GX' if token == 'nan' else 'G' + str(int(token))
)
manual_bms_grade_sr_anno['grade'].value_counts()

G4    108
G3     85
G2     81
GX     13
G1      2
Name: grade, dtype: int64

In [13]:
# drop previously "GX" fill to add where available
# only bring in columns cm025_anno does not already have, so the join cannot collide on column names
cols_to_use = manual_bms_grade_sr_anno.columns.difference(cm025_anno.columns)
cm025_anno = cm025_anno.join(manual_bms_grade_sr_anno[cols_to_use].set_index('unique_id'))
# subjects without a reviewed grade get the 'GX' placeholder
cm025_anno['grade'] = cm025_anno['grade'].fillna('GX')
cm025_anno['grade'].value_counts()

GX    381
G4    106
G3     79
G2     68
G1      1
Name: grade, dtype: int64

In [14]:
# add recalculated ITH score to anno df and harmonize naming of wGII (lowercase)
# get_merged_df is not defined in this notebook (presumably from checkmate_imports); the two
# tables are keyed by tumor sample id here (MAF_Tumor_ID vs Tumor_Sample_Barcode)
cm025_anno = get_merged_df(cm025_anno.reset_index().set_index('MAF_Tumor_ID', drop=False), bms_ith_recalc.set_index('Tumor_Sample_Barcode'))
cm025_anno['wgii'] = cm025_anno['WGII']
# back to the unique_id index used everywhere else
cm025_anno = cm025_anno.reset_index(drop=True).set_index('unique_id')

### Group tile-level info 

In [15]:
# log which Profile inference set is used in the steps below
print('Using full profile self-inference set')

Using full profile self-inference set


In [16]:
# per-cohort tile-level inference tables with the grade probability harmonized to 'prob_g4_not_g2'
# (for KIRC the existing 'prob_g4_not_g2' column is dropped and replaced by the renamed 'class_1_model_prob')
inference_raw = {
    'kirc':kirc_inf_tilemean.set_index('slide_id').drop(columns='prob_g4_not_g2').rename(columns={'class_1_model_prob':'prob_g4_not_g2'}),
    'profile':profile_inf_tilemean.rename(columns={'class_1_model_prob':'prob_g4_not_g2'}),
    'cm025':cm025_inf_tilemean.rename(columns={'class_1_model_prob':'prob_g4_not_g2'}),
}

In [17]:
# give every cohort table the same index name so they can be grouped by 'unique_id'
for df in inference_raw.values():
    df.index.name = 'unique_id'

### Do tile-level smoothing

In [18]:
from collections import defaultdict

# tile categories; same labels as HUE_ORDER (not referenced again in this cell)
META_VARS = ['stroma', 'pred_g2', 'intermediate_grade', 'pred_g4']
# smoothing parameters handed to run_inference_smoothing below
N_NEIGHBORS = 4
TUMOR_CUTOFF = 0.5
LOWER_CUTOFF = 1/3.
UPPER_CUTOFF = 2/3.
print(f'using p(Tumor) cutoff of {TUMOR_CUTOFF}')

###########
# cohort name -> list of per-slide smoothed tile tables
all_results = defaultdict(list)
print('using aggregate mean pred. over all 4 models from 20210312 checkpoints ')
for key,df in inference_raw.items():
    print(key)
    for slide_id, slide_df in df.groupby('unique_id'):
        try:
            # run_inference_smoothing is not defined in this notebook (presumably from checkmate_imports)
            temp = run_inference_smoothing(slide_df, dist_type='uniform', n_neighbors=N_NEIGHBORS, tumor_cutoff=TUMOR_CUTOFF, lower_grade_cutoff=LOWER_CUTOFF, upper_grade_cutoff=UPPER_CUTOFF)

            all_results[key].append(temp)

        except Exception as e:
            # best effort: slides on which smoothing fails (e.g. fewer samples than
            # n_neighbors, as in the logged errors) are reported and left out
            print(e)
            print(f'Error with {slide_id}')

using p(Tumor) cutoff of 0.5
using aggregate mean pred. over all 4 models from 20210312 checkpoints 
kirc
profile
Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required.
Error with profile_1096424
Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 4
Error with profile_1097978
cm025
Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 4
Error with cm025_205
Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 4
Error with cm025_248
Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 4
Error with cm025_507
Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 4
Error with cm025_927


### Aggregate across the 3 cohorts

In [19]:
# Stack the per-slide smoothing results of every cohort into one tile-level table.
combined = []
for key,val in all_results.items():
    # concatenate each cohort once and reuse the result (these tables hold millions of rows)
    temp = pd.concat(val)
    # shape is reported before the cohort column is added
    print(temp.shape)
    temp['cohort'] = key
    combined.append(temp)
combined = pd.concat(combined)
combined.index.name = 'unique_id'
combined.index = combined.index.astype(str)

# keep only the columns shared by all cohorts
combined_results = combined[['x','y','prob_tumor','prob_g4_not_g2','smoothed_prob_tumor','smoothed_prob_g4_not_g2','meta','cohort']]
combined_results.index.name = 'unique_id'

(1658468, 8)
(875000, 11)
(2536654, 9)


#### Reintroduce tilepaths 

In [20]:
kirc_inf = pd.read_pickle('/mnt/disks/bms/20210405_kirc_inference_using_profile_g2_g4_models_all_slides.pkl')
# only the rows of one model fold are used to recover tile paths (presumably every fold lists the same tiles).
# .copy() makes kirc_paths an independent frame, so the column assignment below does not
# write into a slice of kirc_inf (SettingWithCopyWarning)
kirc_paths = kirc_inf.loc[kirc_inf.model_id == 'profile_ccrcc_fold0_20210312'].copy()
# replace slide_id by its first three '-'-separated fields
kirc_paths['slide_id'] = kirc_paths.reset_index()['slide_id'].apply(lambda x: '-'.join(x.split('-')[:3])).values

# CM-025 tile paths, re-keyed from slide_id to (unique_id, x, y) via the annotation table
cm025_paths = pd.read_pickle('/mnt/disks/manual_bms_tiles/20210426_tile_paths_unprocessed_manual_cm025_635_slides.pkl')
cm025_paths = cm025_paths.set_index('slide_id').join(reset_set_idx(cm025_anno,'slide_id')['unique_id'])
cm025_paths = reset_set_idx(cm025_paths,['unique_id','x','y'])[['full_path']]

# NOTE(review): only CM-025 and KIRC paths are collected here; Profile tiles get no full_path from this cell
combined_paths = pd.concat([cm025_paths, kirc_paths.set_index(['slide_id','x','y'])[['full_path']]])

# attach the tile path to every (unique_id, x, y) tile
combined_results = reset_set_idx(combined_results, ['unique_id','x','y']).join(combined_paths)

### Save to file

In [21]:
# persist the smoothed tile-level table (with tile paths) next to the notebook
combined_results.to_pickle('./rerun_smoothed_tile_level_info.pkl')

---
## Harmonize shared clinical/molecular labels

### Map survival variables to common naming scheme (used for profile)

#### Default units:
- CM025: months
- KIRC: days 
- Profile: months

In [22]:
# ## LEGACY FEATURES

# HOMOG_CUTOFF = 0.85
# G2_CUTOFF = 0.05
# G4_CUTOFF = 0.05

# combined_metrics = run_category_annotation(combined_results.reset_index(level=[1,2]), grade_var='smoothed_prob_g4_not_g2', 
#                                            homog_cutoff=HOMOG_CUTOFF, g2_cutoff=G2_CUTOFF, g4_cutoff=G4_CUTOFF)

# combined_metrics = combined_metrics.join(get_anno(combined_results[['cohort']]))

In [23]:
# just use a placeholder since we won't be using features from `run_category_annotation`
# per-unique_id tile counts form the first column of the patient-level table
combined_metrics = combined_results.value_counts(['unique_id'])
combined_metrics.name = 'tile_count'
combined_metrics = pd.DataFrame(combined_metrics)
# flatten the index to plain unique_id labels
combined_metrics.index = combined_metrics.index.get_level_values(0)

# attach the cohort label (first row per unique_id via get_anno)
combined_metrics = combined_metrics.join(get_anno(combined_results.reset_index(level=[1,2])[['cohort']]))
combined_metrics.shape

(1443, 2)

In [24]:
# operating on p(tumor) > 0.5 cutoff
# every tile whose category is not 'stroma' counts as a tumor tile
tumor_tile_frac = (combined_results['meta'] != 'stroma').groupby('unique_id').mean()
tumor_tile_count = (combined_results['meta'] != 'stroma').groupby('unique_id').sum()

# assignment aligns on the unique_id index
combined_metrics['tumor_tile_fraction'] = tumor_tile_frac
combined_metrics['tumor_tile_count'] = tumor_tile_count
combined_metrics.head()

Unnamed: 0_level_0,tile_count,cohort,tumor_tile_fraction,tumor_tile_count
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cm025_424,40374,cm025,0.559989,22609
TCGA-T7-A92I,18576,kirc,0.253015,4700
cm025_234,16127,cm025,0.657593,10605
cm025_355,14071,cm025,0.189965,2673
cm025_296,12758,cm025,0.655197,8359


In [25]:
# Per cohort: the annotation table plus the source column for each harmonized survival
# field ('no_info' marks a field the cohort does not provide).
survival_mapper = {
    'cm025': {
        'anno': cm025_anno,
        'pfs': 'PFS', 'pfs_event': 'PFS_CNSR',
        'os': 'OS', 'os_event': 'OS_CNSR',
        'ttf': 'PFS', 'ttf_event': 'PFS_CNSR',
    },
    'kirc': {
        'anno': kirc_survival,
        'pfs': 'PFI.time', 'pfs_event': 'PFI',
        'os': 'OS.time', 'os_event': 'OS',
        'ttf': 'no_info', 'ttf_event': 'no_info',
    },
    'profile': {
        'anno': profile_anno,
        'pfs': 'dfs_from_nephrectomy', 'pfs_event': 'dfs_event',
        'os': 'os_line_specific', 'os_event': 'os_event',
        'ttf': 'ttf', 'ttf_event': 'ttf_event',
    },
}

harmonized_survival_info = []
for cohort_name, spec in survival_mapper.items():
    source_anno = spec['anno']
    cohort_survival = pd.DataFrame()
    for target_col, source_col in spec.items():
        if target_col == 'anno':
            continue
        if source_col == 'no_info':
            cohort_survival[target_col] = np.nan
        else:
            cohort_survival[target_col] = source_anno[source_col]

    cohort_survival['cohort'] = cohort_name
    if cohort_name == 'kirc':
        # push to months
        for time_col in ('os', 'pfs'):
            cohort_survival[time_col] = cohort_survival[time_col]/(365/12)
    harmonized_survival_info.append(cohort_survival)

harmonized_survival_info = pd.concat(harmonized_survival_info)
harmonized_survival_info.index.name = 'unique_id'

#### Add age, gender info 

In [26]:
# KIRC: age and gender from the survival table, renamed to the shared column names
kirc_survival_reformat = kirc_survival.reset_index()[['bcr_patient_barcode','age_at_initial_pathologic_diagnosis','gender']]
kirc_survival_reformat = kirc_survival_reformat.rename(columns={'bcr_patient_barcode':'unique_id', 'age_at_initial_pathologic_diagnosis':'age_at_diagnosis'})
# kirc_survival_reformat['dataset'] = 'tcga'

kirc_survival_reformat = kirc_survival_reformat.set_index(['unique_id'])
kirc_survival_reformat['gender'] = kirc_survival_reformat['gender'].str.lower()
# Profile: 1 -> male, 0 -> female
profile_anno['gender'] = profile_anno['male_male__female'].map({1:'male',0:'female'})

profile_anno_reformat = profile_anno[['gender','age_at_diagnosis']]
# profile_anno_reformat['dataset'] = 'profile'
profile_anno_reformat = reset_set_idx(profile_anno_reformat, ['unique_id'])
# CM-025: 'M'/'F' -> 'male'/'female'
cm025_age_gender = cm025_anno[['Age','Sex']].rename(columns={'Age':'age_at_diagnosis', 'Sex':'gender'})
cm025_age_gender['gender'] = cm025_age_gender['gender'].map({'M':'male', 'F':'female'})
age_gender = pd.concat([kirc_survival_reformat, profile_anno_reformat,cm025_age_gender])
age_gender.index.name = 'unique_id'

# merge survival (without its own cohort column) and age/gender into the patient-level table
combined_metrics = get_merged_df(combined_metrics, harmonized_survival_info.drop(columns='cohort'))
combined_metrics = get_merged_df(combined_metrics, age_gender)
print(combined_metrics.shape)

# get_indices is not defined in this notebook; presumably it returns the index labels where the mask is True
cm025_multi_slide_ambiguous = get_indices(cm025_anno.subjid.duplicated())

print(f'dropping {len(cm025_multi_slide_ambiguous)} ambiguous CM025 cases with multiple slides for single SUBJID')
# NOTE(review): drop-by-label removes every row carrying a flagged label, so more rows than
# labels can disappear (the recorded run reports 4 cases but went from 1455 to 1439 rows)
combined_metrics = combined_metrics.drop(labels=cm025_multi_slide_ambiguous)
print(combined_metrics.shape)

(1455, 12)
dropping 4 ambiguous CM025 cases with multiple slides for single SUBJID
(1439, 12)


#### Add harmonized `biopsy_site_type`, `biopsy_location`, `assigned_grade`, etc

In [27]:
# assigned grade for all cohorts
combined_metrics = get_merged_df(combined_metrics,pd.concat([cm025_anno['grade'],kirc_anno['grade'], profile_anno['grade']]))
combined_metrics['grade'] = combined_metrics.grade.fillna('GX')

# stage for KIRC and Profile (unavail. for cm025)
combined_metrics = get_merged_df(combined_metrics, pd.concat([kirc_anno['stage'], profile_anno['stage']]))
combined_metrics['stage'] = combined_metrics.stage.fillna('no_anno')

# reformat biopsy site info
# the two cohorts encode the site differently ('PRIMARY'/'METASTASIS' vs 'Primary'/'Metastatic Recurrence')
cm025_anno['biopsy_site_type'] = cm025_anno['Tumor_Sample_Primary_or_Metastasis']
cm025_anno['primary_site'] = cm025_anno['biopsy_site_type'] == 'PRIMARY'
profile_anno['primary_site'] = profile_anno['biopsy_site_type'] == 'Primary'
cm025_anno['metastatic_site'] = cm025_anno['biopsy_site_type'] == 'METASTASIS'
profile_anno['metastatic_site'] = profile_anno['biopsy_site_type'] == 'Metastatic Recurrence'

# biopsy site (is_primary) for cm025 and Profile
combined_metrics = get_merged_df(combined_metrics, pd.concat([cm025_anno['primary_site'], profile_anno['primary_site']]))
combined_metrics['primary_site'] = combined_metrics.primary_site.fillna(False)
# every KIRC sample is flagged as a primary site
combined_metrics.loc[combined_metrics.cohort == 'kirc', 'primary_site'] = True

# biopsy site (is_metastatic) for cm025 and Profile
combined_metrics = get_merged_df(combined_metrics, pd.concat([cm025_anno['metastatic_site'], profile_anno['metastatic_site']]))
combined_metrics['metastatic_site'] = combined_metrics.metastatic_site.fillna(False)
# combined_metrics['percent_included'] = combined_metrics['tumor_tile_count'] / combined_metrics['tile_count']
print(combined_metrics.shape)

(1439, 16)


### KIRC RNA signature addition

In [28]:
# TP1/TP2 scores per patient (sheet index 3 of the supplement)
kbi_supp_tcga = pd.read_excel('/home/jupyter/TableS5_Submit.xlsx', sheet_name=3, header=1).set_index(['Patient'])

# expression table with a 5-level row index; the gene lists below select its columns
kirc_rna =  pd.read_csv('/home/jupyter/20200910_kirc_rna_matched_hne_stage_grade.csv', index_col=[0,1,2,3,4])

rna_scores = pd.read_csv('/home/jupyter/20200910_kirc_angio_rna_signature_scores.csv').set_index('tcga_id')

mcderm_angio_sig = ['VEGFA', 'KDR', 'ESM1', 'PECAM1', 'ANGPTL4', 'CD34']
mcderm_teff_sig = ['CD8A', 'EOMES', 'PRF1', 'IFNG', 'CD274']
mcderm_myeloid_sig = ['IL6', 'CXCL1', 'CXCL2', 'CXCL3', 'IL8','PTGS2']

sig_subset = ['javelin_angio_score','mcderm_angio_sig', 'mcderm_teff_sig', 'mcderm_myeloid_sig','TP1 Score', 'TP2 Score']

# flat list of the non-missing entries of the signature block file
javelin_angio_sig = pd.Series(pd.read_csv('/home/jupyter/javelin_angio_sig_block.csv', header=None).values.reshape(-1)).dropna().values
javelin_angio_sig = list(javelin_angio_sig)

# signature score = row-wise mean over the signature's gene columns; the extra index
# levels are dropped so the result aligns with rna_scores on the first level
rna_scores['mcderm_angio_sig'] = kirc_rna[mcderm_angio_sig].mean(axis=1).reset_index(level=[1,2,3,4], drop=True)
rna_scores['mcderm_teff_sig'] = kirc_rna[mcderm_teff_sig].mean(axis=1).reset_index(level=[1,2,3,4], drop=True)
rna_scores['mcderm_myeloid_sig'] = kirc_rna[mcderm_myeloid_sig].mean(axis=1).reset_index(level=[1,2,3,4], drop=True)

# axis is passed by keyword: pandas >= 2.0 no longer accepts it positionally in concat.
# Only patients with every score available are kept.
rna_scores_merge = pd.concat([rna_scores, kbi_supp_tcga[['TP1 Score', 'TP2 Score']]], axis=1).dropna()

kirc_metrics = combined_metrics.loc[combined_metrics.cohort =='kirc']
kirc_metrics = get_merged_df(kirc_metrics, rna_scores_merge[sig_subset])
kirc_metrics_rna = kirc_metrics.dropna(subset=['mcderm_angio_sig'])
kirc_metrics_rna['tp1_tp2_diff'] = kirc_metrics_rna['TP1 Score']- kirc_metrics_rna['TP2 Score']
kirc_metrics_rna.to_pickle('./rerun_kirc_rna_signature_info.pkl')

#### Add wGII and ITH scores where available

In [29]:
# KIRC scores (re-keyed by tcga_id) stacked with the CM-025 wgii/ith columns, then merged into the patient-level table
combined_metrics = get_merged_df(combined_metrics, pd.concat([kirc_scores.set_index('tcga_id'), cm025_anno[['wgii','ith']]]))

#### Add driver mutation status info

In [30]:
# load TCGA MAF
kidney_maf = pd.read_pickle('/home/jupyter/kidney_mc3.maf')

# driver genes in cm025 paper supplement
cm025_driver_genes = ['ARID1A', 'ATM', 'BAP1',
       'COL9A3', 'KDM5C', 'MTOR', 'NF2', 'PBRM1', 'PCK1', 'PIK3CA', 'PTEN',
       'S100B', 'SETD2', 'SMARCA4', 'TCEB1', 'TP53', 'TRMT2B', 'TSC1', 'USP32',
       'VHL', 'WNT8A', 'ZNF800']

# genes in both profile's panel and the CM-025 paper subset focus
shared_drivers = ['ARID1A', 'BAP1', 'KDM5C', 'MTOR', 'PBRM1', 'PTEN', 'SETD2', 'TP53', 'VHL']

# truncating = ['Nonsense_Mutation', 'Frame_Shift_Ins','Frame_Shift_Del', 'Splice_Site',
#               'Stop_Codon_Del','Start_Codon_Del','Nonstop_Mutation']
truncating = ['Nonsense_Mutation', 'Frame_Shift_Ins','Frame_Shift_Del', 'Splice_Site']

# add placeholders to profile's gene panel to match columns in cm025 anno
# (explicit column check instead of a bare `except:`, which would also hide unrelated errors)
for gene in cm025_driver_genes:
    if gene not in profile_anno.columns:
        print(f'adding {gene} placeholder...')
        profile_anno[gene] = 'no_info'

# 0/1 panel calls -> 'WT'/'MUT'; placeholders stay 'no_info'.
# axis is passed by keyword: pandas >= 2.0 no longer accepts it positionally in concat.
profile_drivers_harmonized = pd.concat([profile_anno[col].map({0.0:'WT', 1.0:'MUT','no_info':'no_info'}) for col in cm025_driver_genes], axis=1)
combined_metrics.shape

adding ATM placeholder...
adding COL9A3 placeholder...
adding NF2 placeholder...
adding PCK1 placeholder...
adding PIK3CA placeholder...
adding S100B placeholder...
adding SMARCA4 placeholder...
adding TCEB1 placeholder...
adding TRMT2B placeholder...
adding TSC1 placeholder...
adding USP32 placeholder...
adding WNT8A placeholder...
adding ZNF800 placeholder...


(1439, 18)

In [31]:
# add in profile and cm025 info
existing_mut_info = pd.concat([cm025_anno[cm025_driver_genes], profile_drivers_harmonized])
existing_mut_info.index.name = 'unique_id'

# both indexes as strings so the merge keys compare equal
existing_mut_info.index = existing_mut_info.index.astype(str)
combined_metrics.index = combined_metrics.index.astype(str)

combined_metrics = get_merged_df(combined_metrics, existing_mut_info.fillna('no_info'))


# use KIRC MAF to get truncating status for driver gene subset
kirc_driver_results = {}
# loop invariants computed once: the set of known ids (the index is not enlarged inside
# the loop) and the truncating-variant mask
known_ids = set(combined_metrics.index)
is_truncating = kidney_maf.Variant_Classification.isin(truncating)
for gene in cm025_driver_genes:
    temp_trunc = kidney_maf.loc[(kidney_maf.Hugo_Symbol == gene) & is_truncating]
    # keep only MAF row labels that are present in the patient-level table
    filtered_ids = [x for x in temp_trunc.index.values if x in known_ids]
    kirc_driver_results[gene] = filtered_ids
    combined_metrics.loc[filtered_ids, gene] = 'MUT'
    # NOTE(review): this fills every remaining missing value with 'WT', including any
    # non-KIRC row that had no entry in existing_mut_info -- confirm that is intended
    combined_metrics[gene] = combined_metrics[gene].fillna('WT')

#### Add Drug type, RECIST group, and CB/ICB/NCB binning to profile data based on Braun 
- CB: CRPR or SD with shrinkage AND PFS > 6mo
- NCB: PD AND PFS < 3mo
- ICB: not CB, not NCB
---
- Note that CM025 `PFS` is approximately `ttf` for Profile


In [32]:
cm025_anno['drug_type'] = cm025_anno['Arm'].map({'EVEROLIMUS':'Non-ICI', 'NIVOLUMAB':'ICI'})

# harmonize best response to the CRPR / SD / PD groups (anything else becomes missing)
profile_anno['recist'] = profile_anno['best_response_during_this_line_of_therapy']
profile_anno['recist'] = profile_anno.recist.map({'PR':'CRPR', 'CR':'CRPR','SD':'SD', 'PD':'PD'})

cm025_anno['recist'] = cm025_anno['ORR']
cm025_anno['recist'] = cm025_anno.recist.map({'CRPR':'CRPR','SD':'SD', 'PD':'PD'})

# Benefit bins for Profile, using ttf in place of PFS:
#   CB  - CRPR or SD with ttf > 6 months
#   NCB - PD with ttf < 3 months
#   ICB - any other case with a known ttf
profile_anno.loc[profile_anno['recist'].isin(['CRPR','SD']) & (profile_anno['ttf'] > 6),'benefit'] = 'CB'
# the cutoff was `< 6`, which contradicted the NCB definition stated for this section (PD AND PFS < 3mo)
profile_anno.loc[profile_anno['recist'].isin(['PD']) & (profile_anno['ttf'] < 3),'benefit'] = 'NCB'
profile_anno.loc[(~profile_anno.ttf.isna()) & profile_anno.benefit.isna(), 'benefit'] = 'ICB'
# CM-025 already carries the benefit label
cm025_anno['benefit'] = cm025_anno['Benefit']

# add in profile and cm025 info
treatment_info = pd.concat([cm025_anno[['recist','drug_type','benefit']], profile_anno[['recist','drug_type','benefit']]])
treatment_info.index = treatment_info.index.astype(str)
treatment_info.index.name = 'unique_id'
combined_metrics = get_merged_df(combined_metrics, treatment_info.fillna('no_info'))
combined_metrics.shape

(1439, 43)

In [33]:
# persist the harmonized patient-level annotation table next to the notebook
combined_metrics.to_csv('./rerun_patientlevel_harmonized_annotations.csv')