## Imports & Helpers

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
from checkmate_imports import *

# global variables 
# NOTE(review): within the cells visible in this notebook only MIN_SEGMENT_SIZE,
# GRADE_TC_CUTOFF and TIL_ISO_CUTOFF are read; the remaining constants are presumably
# kept for parity with earlier notebooks / helpers -- confirm before removing any.
HUE_ORDER = ['stroma','pred_g2','intermediate_grade','pred_g4']
MIN_SEGMENT_SIZE = 50  # minimum tile count for a segment to be counted as a node (see total_nodes)
GRADE_DIFF_THRESH = 0.35
TUMOR_DIFF_THRESH = 0.35
MIN_TUMOR_SEG_MEAN = 0.70
# invert_rag_weight comes from checkmate_imports; its exact transform is not visible here
NODE_DIFF_CUTOFF = invert_rag_weight(GRADE_DIFF_THRESH) 
MIN_TIL_COUNT = 10

# Earlier settings, kept for reference:
# GRADE_TC_CUTOFF = 0.4  # minimum segment mean grade score to consider in TIL processing 
# GRADE_TC_CUTOFF = 0.  # minimum segment mean grade score to consider in TIL processing 
GRADE_TC_CUTOFF = 0.8  # focusing on high grade foci only 

# printed so the notebook output records that the TIL cutoffs below differ from earlier runs
print('MODIFYING EARLIER TIL CUTOFFS')
TIL_ISO_CUTOFF = 14  # based on none vs any AUROC bootstrap on high grade foci + no hard cases
TIL_HIGH_CUTOFF = 48 # based on not-high vs high AUROC bootstrap on high grade foci + no hard cases
FRAC_CUTOFF = 0.25
TIL_AREA_CUTOFF = 10

EXCLUSION_RATIO_CUTOFF = 1.5  # margin vs center ratio threshold for "exclusion" call
# presumably a tile edge is 0.256 mm, giving tiles per square mm -- TODO confirm
TILES_PER_MM2 = 0.256**-2

# assume 7x7 minimum case for a square area focus
# going 2 tiles inner would result in a 5x5 inner cube and thus area cutoff of 25
# MIN_CENTER_AREA = 25
MIN_CENTER_AREA = 10  # relaxing from 25 to try to recover possible interesting foci

MODIFYING EARLIER TIL CUTOFFS


### Load saved files 

In [2]:
# Load the intermediate tables written by the earlier processing notebooks.
# NOTE(review): `metrics` and `feature_subset` are read from the same csv, and so are
# `tilewise_nontil_info` and `seg_agg`; each pair starts out as two independent copies
# of identical data -- confirm that this duplication is intended.
metrics = pd.read_csv('./rerun_additional_feature_subset.csv', index_col=0)

tilewise_anno = pd.read_csv('./rerun_tilewise_grade_til_annotations.csv', index_col=0) # TIL specific CM025 subset
tilewise_nontil_info = pd.read_csv('./rerun_tilewise_grade_nontil_annotations.csv', index_col=0)

# read in non-TIL tilewise info 
seg_agg = pd.read_csv('./rerun_tilewise_grade_nontil_annotations.csv', index_col=0)

node_descriptions = pd.read_csv('./rerun_node_descriptions.csv', index_col=0)
feature_subset = pd.read_csv('./rerun_additional_feature_subset.csv', index_col=0)

# the first four csv columns form the row MultiIndex
edge_info = pd.read_csv('./rerun_base_rag_edge_info_annotation.csv', index_col=[0,1,2,3])
# restore set objects (stored as string when saved to csv earlier)
# rebuilt row by row from the edge0/edge1 index levels; `.values` drops the index so the
# assignment is positional rather than label-aligned
edge_info['edge_set'] = edge_info.reset_index()[['edge0','edge1']].apply(lambda x: set(x),1).values

#### Add average TIL counts info as well

In [3]:
# Manual immunophenotype annotations.
# NOTE(review): absolute path here, while the later reload of the same file name uses a
# relative path -- confirm both point at the same file.
anno = pd.read_csv('/home/jupyter/manual_cm025_merged_braunsupp_annotations.csv') 
# build the slide/case key used as the index everywhere else in this notebook
anno['unique_id'] = 'cm025_' + anno.subjid.astype(str)
# case id arrays per annotated ImmunoPhenotype value
paper_desert = anno.loc[anno.ImmunoPhenotype == 'Desert','unique_id'].values
paper_infl = anno.loc[anno.ImmunoPhenotype == 'Infiltrated','unique_id'].values
paper_excl = anno.loc[anno.ImmunoPhenotype == 'Excluded','unique_id'].values
anno = anno.set_index('unique_id')

In [4]:
# Per-edge annotations (three-level row index) and per-case edge class sums.
edge_agg_pivot = pd.read_csv('./rerun_base_rag_edge_info_annotation_processed.csv', index_col=[0,1,2])
edge_class_sum = pd.read_csv('./rerun_base_rag_edge_info_annotation_processed_sum.csv', index_col=0)
# True wherever a summed value is positive
edge_binary = edge_class_sum > 0

# values of the first index level, i.e. every case that has at least one edge row
cases_with_edges = list(edge_agg_pivot.index.levels[0].unique())
# get_indices comes from checkmate_imports; presumably it returns the index labels where
# the boolean Series is True -- TODO confirm
cases_with_passing_edges = get_indices((edge_binary['edge_class_distal'] | edge_binary['edge_class_proximal']))

# cell output: number of entries get_indices returns for the proximal flag
len(get_indices(edge_binary['edge_class_proximal']))

398

In [5]:
def get_til_infiltration_fraction_difference(row):
    """
    Signed infiltration fraction difference across one edge:
    (higher grade node) minus (lower grade node).

    row must provide `node0_grade_score`, `node1_grade_score`,
    `node0_infiltration_frac` and `node1_infiltration_frac`.
    On a grade tie node0 counts as the lower grade node, because
    np.argmin returns the first minimum.
    """
    grade_scores = [row['node0_grade_score'], row['node1_grade_score']]
    low_idx = np.argmin(grade_scores)
    high_idx = 1 - low_idx

    low_frac = row[f'node{low_idx}_infiltration_frac']
    high_frac = row[f'node{high_idx}_infiltration_frac']
    return high_frac - low_frac

def get_til_infiltration_fraction_ratio(row, eps=1e-3):
    """
    Infiltration fraction ratio across one edge:
    (higher grade node + eps) / (lower grade node + eps).

    eps is added to both fractions so a zero fraction in the lower grade
    node does not produce a division by zero. On a grade tie node0 counts
    as the lower grade node, because np.argmin returns the first minimum.
    """
    grade_scores = [row['node0_grade_score'], row['node1_grade_score']]
    low_idx = np.argmin(grade_scores)
    high_idx = 1 - low_idx

    denominator = row[f'node{low_idx}_infiltration_frac'] + eps
    numerator = row[f'node{high_idx}_infiltration_frac'] + eps
    return numerator / denominator

def find_disconnected_nodes(edge_df, all_nodes):
    """
    Return the set of nodes in all_nodes that appear in no edge of edge_df.

    edge_df only has proximal and distal edges in it and is node0,node1 indexed
    all_nodes is an iterable of the whole set of nodes to consider
    """
    # every index entry is a (node0, node1) pair; collect both endpoints
    connected = {node for a, b in edge_df.index.values for node in (a, b)}
    return set(all_nodes) - connected

In [6]:
def get_higher_grade_infiltration(row):
    """
    Return the infiltration fraction of the higher grade node of one edge.

    row must provide `node0_grade_score`, `node1_grade_score` and the
    matching `node{0,1}_infiltration_frac` entries. On a grade tie node1 is
    reported, because np.argmin picks node0 as the lower grade node.
    """
    grade_scores = [row['node0_grade_score'], row['node1_grade_score']]
    high_idx = 1 - np.argmin(grade_scores)
    return row[f'node{high_idx}_infiltration_frac']

In [7]:
# Keep only classified edges (anything other than 'not_eligible') that have TIL tile
# counts for both of their nodes.
til_count_cols = ['node0_tiles_above_til_cutoff', 'node1_tiles_above_til_cutoff']
prox_dist_edges = edge_agg_pivot.loc[edge_agg_pivot['edge_class'] != 'not_eligible'].dropna(subset=til_count_cols)
# snapshot taken before the high grade filter; it is written to csv further down
unfiltered_prox_dist_edges = prox_dist_edges.copy()

# An edge has high grade involvement when at least one of its nodes reaches
# GRADE_TC_CUTOFF. The mask is assigned directly so the column is a genuine bool
# column (no object dtype, no fillna); comparisons against NaN are already False.
prox_dist_edges['high_grade_involvement'] = (
    (prox_dist_edges['node0_grade_score'] >= GRADE_TC_CUTOFF)
    | (prox_dist_edges['node1_grade_score'] >= GRADE_TC_CUTOFF)
)
print('LIMITING PROXIMAL/DISTAL EDGE DESCRIPTION DF TO HIGH GRADE INVOLVEMENT')
# .copy() so the column assignments below act on an independent frame, not on a slice
prox_dist_edges = prox_dist_edges.loc[prox_dist_edges['high_grade_involvement']].copy()

# fraction of each node's tiles that are above the TIL cutoff
prox_dist_edges['node0_infiltration_frac'] = prox_dist_edges['node0_tiles_above_til_cutoff']/prox_dist_edges['node0_tile_count']
prox_dist_edges['node1_infiltration_frac'] = prox_dist_edges['node1_tiles_above_til_cutoff']/prox_dist_edges['node1_tile_count']
prox_dist_edges['edge_mean_infiltration_frac'] = (prox_dist_edges['node0_infiltration_frac'] + prox_dist_edges['node1_infiltration_frac'])/2
# prox_dist_edges['normalized_area_fmeasure'] = prox_dist_edges.groupby('unique_id')['area_fmeasure'].apply(lambda x: x/x.sum())

# infiltration contrast between the higher and the lower grade node of every edge
prox_dist_edges['til_infiltration_frac_diff_high_minus_low'] = prox_dist_edges.apply(get_til_infiltration_fraction_difference, axis=1)
prox_dist_edges['til_infiltration_frac_ratio_high_vs_low'] = prox_dist_edges.apply(get_til_infiltration_fraction_ratio, axis=1)

prox_dist_edges['grade_abs_diff'] = (prox_dist_edges['node0_grade_score'] - prox_dist_edges['node1_grade_score']).abs()
# True when neither node has any tile above the TIL cutoff
prox_dist_edges['both_zero_area_above_cutoff'] = (prox_dist_edges['node0_tiles_above_til_cutoff'] == 0) & (prox_dist_edges['node1_tiles_above_til_cutoff'] == 0)

LIMITING PROXIMAL/DISTAL EDGE DESCRIPTION DF TO HIGH GRADE INVOLVEMENT


In [8]:
# infiltration fraction of whichever node of each edge has the higher grade score
prox_dist_edges['til_infiltration_high_grade_node'] = prox_dist_edges.apply(get_higher_grade_infiltration, axis=1)

In [9]:
# clean out edge case where both proximal and distal edges called for same edge
# (the same node pair can show up as (a, b) and (b, a); keep only the proximal call)
df = prox_dist_edges.reset_index()

# orientation-independent key for every node pair
sorted_pairs = np.sort(df[['edge0','edge1']].values, axis=1)
df['sort_n0'] = sorted_pairs[:, 0]
df['sort_n1'] = sorted_pairs[:, 1]

pair_key = ['unique_id','sort_n0','sort_n1']
is_dup = df.duplicated(subset=pair_key, keep=False)  # True for every member of a repeated pair

dups = df.loc[is_dup].set_index(pair_key)
print(dups.shape)
df_filtered = df.loc[~is_dup].set_index(pair_key)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df_filtered = pd.concat([df_filtered, dups.loc[dups['edge_class'] == 'proximal']])

# sanity check: expected to print 0 (no node pair left more than once)
print(df_filtered.reset_index().duplicated(subset=pair_key).sum())

# reset_set_idx comes from checkmate_imports; presumably it swaps the index back to the
# listed columns -- TODO confirm
prox_dist_edges = reset_set_idx(df_filtered, ['unique_id','edge0','edge1'])
prox_dist_edges.head()

(12, 43)
0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sort_n0,sort_n1,seg_size,node1_grade_score,node1_area_frac,node1_tiles_above_til_cutoff,node1_tile_count,node1_til_status_label_exp,node1_til_status_basic,node1_til_status_combined,...,edge_class_proximal_f_weighted,high_grade_involvement,node0_infiltration_frac,node1_infiltration_frac,edge_mean_infiltration_frac,til_infiltration_frac_diff_high_minus_low,til_infiltration_frac_ratio_high_vs_low,grade_abs_diff,both_zero_area_above_cutoff,til_infiltration_high_grade_node
unique_id,edge0,edge1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
cm025_1001,1,5,1,5,91,0.482654,0.060707,0.0,91.0,non_infiltrated,non_infiltrated,non_infiltrated,...,0.0,True,0.0,0.0,0.0,0.0,1.0,0.325642,True,0.0
cm025_1037,4,3,3,4,603,0.800765,0.269919,114.0,603.0,localized_infiltration,non_infiltrated,localized_infiltration,...,0.569454,True,0.673105,0.189055,0.43108,-0.484051,0.281936,0.32233,False,0.189055
cm025_1042,4,5,4,5,154,0.685652,0.054882,4.0,154.0,non_infiltrated,non_infiltrated,non_infiltrated,...,0.0,True,0.065421,0.025974,0.045697,0.039447,2.46239,0.208274,False,0.065421
cm025_1066,1,2,1,2,845,0.529441,0.354298,19.0,845.0,localized_infiltration,non_infiltrated,localized_infiltration,...,0.0,True,0.110876,0.022485,0.066681,0.088391,4.763699,0.276598,False,0.110876
cm025_1069,1,3,1,3,1603,0.884696,0.483122,948.0,1603.0,dispersed_infiltration,intermed_infiltrated,intermed_infiltrated_dispersed,...,0.848212,True,0.461722,0.591391,0.526557,0.129669,1.28023,0.347511,False,0.591391


In [10]:
# only do area weight normalization relative to kept high grade foci
# transform() returns a result indexed exactly like its input, so the assignment aligns
# row by row on every pandas version (groupby.apply prepends the group key on pandas >= 2.0)
hg_edges = prox_dist_edges.loc[prox_dist_edges['high_grade_involvement']]
prox_dist_edges['normalized_area_fmeasure'] = hg_edges['area_fmeasure'] / hg_edges.groupby('unique_id')['area_fmeasure'].transform('sum')

In [11]:
node_descriptions = node_descriptions.set_index(['merged_labels'],append=True)

# first need to flag whether a node isn't involved in any edges
# Start from False so the column always exists, even if every node has an edge.
node_descriptions['no_prox_dist_involvement'] = False
for uid in node_descriptions.index.levels[0]:
    all_nodes = node_descriptions.loc[uid].reset_index()['merged_labels']
    # only the edge lookup may legitimately miss: a case without any kept edge
    try:
        edge_df = prox_dist_edges.loc[uid]
    except KeyError:
        disconnected = all_nodes
    else:
        disconnected = find_disconnected_nodes(edge_df, all_nodes)
    for node in disconnected:
        node_descriptions.loc[(uid,node),'no_prox_dist_involvement'] = True

node_descriptions['high_grade_involvement'] = node_descriptions['smoothed_prob_g4_not_g2'] >= GRADE_TC_CUTOFF
print('LIMITING NODE DESCRIPTION DF TO HIGH GRADE')
# .copy() so the columns added below are written to an independent frame, not a slice
node_descriptions = node_descriptions.loc[node_descriptions['high_grade_involvement']].copy()

node_descriptions['infiltration_frac'] = node_descriptions['tiles_above_til_cutoff']/node_descriptions['tile_count']

# Per-case normalisations over the remaining (high grade) nodes. transform() keeps the
# original row index, so the results align on every pandas version.
per_case = node_descriptions.groupby('unique_id')
node_descriptions['segment_area_frac'] = node_descriptions['tile_count'] / per_case['tile_count'].transform('sum')
node_descriptions['grade_score_rel_ratio'] = node_descriptions['smoothed_prob_g4_not_g2'] / per_case['smoothed_prob_g4_not_g2'].transform('mean')
# distance of each node's relative grade score from the case mean (ratio of 1)
node_descriptions['grade_score_rel_ratio_diff'] = (1 - node_descriptions['grade_score_rel_ratio']).abs()

LIMITING NODE DESCRIPTION DF TO HIGH GRADE


In [12]:
# display the node table after the high grade filter (cell output only)
node_descriptions

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,prob_tumor,prob_g4_not_g2,smoothed_prob_tumor,smoothed_prob_g4_not_g2,tumor_seg_label,merge_thresh,putative_tumor,labels,...,til_status_basic,til_status_combined,til_counts_tile_avg,til_cutoff,no_prox_dist_involvement,high_grade_involvement,infiltration_frac,segment_area_frac,grade_score_rel_ratio,grade_score_rel_ratio_diff
unique_id,merged_labels,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
cm025_1,1.0,58.347277,29.844418,0.763328,0.922870,0.761946,0.928682,4.0,0.35,1.0,103.485706,...,non_infiltrated,localized_infiltration,5.169425,14.0,True,True,0.067108,1.000000,1.000000,0.000000
cm025_1001,1.0,30.095575,33.300885,0.906519,0.804434,0.905916,0.808296,12.0,0.35,1.0,44.734513,...,non_infiltrated,non_infiltrated,0.826549,14.0,False,True,0.000000,1.000000,1.000000,0.000000
cm025_1002,3.0,70.778689,12.155738,0.864145,0.912478,0.867090,0.914939,6.0,0.35,1.0,11.377049,...,highly_infiltrated,highly_infiltrated_dispersed,51.053279,14.0,True,True,0.971311,1.000000,1.000000,0.000000
cm025_1006,1.0,81.385787,25.035533,0.766975,0.866643,0.755270,0.868300,7.0,0.35,1.0,24.238579,...,intermed_infiltrated,intermed_infiltrated_dispersed,14.526227,14.0,True,True,0.350254,0.395053,0.953178,0.046822
cm025_1006,2.0,47.716667,33.283333,0.901317,0.932445,0.886053,0.933048,9.0,0.35,1.0,47.583333,...,non_infiltrated,low_infiltrated_dispersed,13.016667,14.0,True,True,0.333333,0.040107,1.024256,0.024256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cm025_995,1.0,114.279762,23.635417,0.810584,0.950981,0.801733,0.955915,1.0,0.35,1.0,23.116071,...,non_infiltrated,localized_infiltration,5.092262,14.0,True,True,0.032738,0.929461,0.998800,0.001200
cm025_995,2.0,127.803922,40.705882,0.749716,0.957535,0.731375,0.958212,15.0,0.35,1.0,43.823529,...,non_infiltrated,non_infiltrated,3.156863,14.0,True,True,0.000000,0.070539,1.001200,0.001200
cm025_996,1.0,49.448649,50.435135,0.901608,0.893623,0.891644,0.897006,10.0,0.35,1.0,25.141892,...,intermed_infiltrated,intermed_infiltrated_dispersed,28.522973,14.0,True,True,0.778378,1.000000,1.000000,0.000000
cm025_997,1.0,83.121431,46.752118,0.888757,0.889300,0.888055,0.890834,14.0,0.35,1.0,212.887982,...,intermed_infiltrated,intermed_infiltrated_dispersed,20.878255,14.0,True,True,0.339504,0.568397,0.997376,0.002624


### Define a simplified score scheme that only considers infiltration for high grade foci

In [13]:
# Slide-level scores, keyed by score name; every value is a Series indexed by unique_id.
score_agg = {}

# --- edge based scores: infiltration of the higher grade node, weighted per edge ---
for edge_type in ['proximal','distal','proxdist']:
    for grade_diff_flag in [True, False]:
        if edge_type != 'proxdist':
            temp_edges = prox_dist_edges.loc[prox_dist_edges['edge_class'] == edge_type]
        else:
            # 'proxdist' pools both edge classes
            temp_edges =  prox_dist_edges.loc[prox_dist_edges['edge_class'].isin(['proximal','distal'])]
            
        name_map = 'grade_diff_and_area_weighted' if grade_diff_flag else 'area_weighted'
        if grade_diff_flag:
            scores = temp_edges.groupby('unique_id').apply(lambda x: (x['grade_abs_diff'] * x['normalized_area_fmeasure'] * x['til_infiltration_high_grade_node']).sum())
        else:
            scores = temp_edges.groupby('unique_id').apply(lambda x: (x['normalized_area_fmeasure'] * x['til_infiltration_high_grade_node']).sum())
        print(len(scores))
        score_agg[f'{edge_type}_{name_map}'] = scores
        
### omit since we're not going to compare along edges
#     if edge_type != 'proxdist': 
#         cat_group= pd.get_dummies(prox_dist_edges.loc[prox_dist_edges.edge_class == edge_type], columns=['hg_til_cat'], prefix_sep='', prefix='')
#         cat_cols = ['higher_hg_infiltration','lower_hg_infiltration','no_hg_infiltration']
#         for col in cat_cols:
#             score_agg[f'{edge_type}_{col}'] = (cat_group['normalized_area_fmeasure'] * cat_group[col]).groupby('unique_id').sum()
#     else:
#         cat_group= pd.get_dummies(prox_dist_edges, columns=['hg_til_cat'], prefix_sep='', prefix='')
#         cat_cols = ['higher_hg_infiltration','lower_hg_infiltration','no_hg_infiltration']
#         for col in cat_cols:
#             score_agg[f'{edge_type}_{col}'] = (cat_group['normalized_area_fmeasure'] * cat_group[col]).groupby('unique_id').sum() 
       
        
# --- node based scores: area weighted infiltration over high grade nodes ---
# These scores are area weighted only. The suffix is spelled out here instead of reusing
# whatever value `name_map` happened to hold when the edge loop above finished; the
# resulting keys ('isolated_only_area_weighted', 'all_nodes_area_weighted') are unchanged.
node_score_suffix = 'area_weighted'
for til_type in ['infiltration_frac']:
    for isolated_only_flag in [True, False]:
        iso_map = 'isolated_only' if isolated_only_flag else 'all_nodes'
        if isolated_only_flag:
            # nodes that take part in no proximal/distal edge
            temp_nodes = node_descriptions.loc[node_descriptions['no_prox_dist_involvement']]
        else:
            temp_nodes = node_descriptions.copy()
        temp_nodes = temp_nodes.loc[temp_nodes['high_grade_involvement']]
        scores = temp_nodes.groupby('unique_id').apply(lambda x: (x['segment_area_frac'] * x[til_type]).sum())
        score_agg[f'{iso_map}_{node_score_suffix}'] = scores

65
65
39
39
86
86


In [14]:
# one column per score name; rows are the union of the per-score indices
score_agg = pd.DataFrame.from_dict(score_agg)

In [15]:
# Persist intermediate tables; the second half of this notebook reloads all four files.
# edge table as it was before the high grade filter and before de-duplication
unfiltered_prox_dist_edges.to_csv('./rerun_unfiltered_proximal_and_distal_edge_annotations_with_score_components.csv')

score_agg.to_csv('./rerun_slidewise_grade_til_scores.csv')
node_descriptions.to_csv('./rerun_node_descriptions_with_score_components.csv')
prox_dist_edges.to_csv('./rerun_proximal_and_distal_edge_annotations_with_score_components.csv')

#### Add additional info to edge descriptions


In [16]:
# move the second index level to the front and keep only the rows where it equals 0
# NOTE(review): the meaning of that level is not visible in this notebook -- confirm
base_edge_info = edge_info.reorder_levels([1,0,2,3]).loc[0]
# index-aligned join; assumes the remaining levels match edge_agg_pivot's index names
edge_agg_pivot = edge_agg_pivot.join(base_edge_info)

#### Add Distal/Proximal RAG annotation count for all cases

In [17]:
feature_subset = feature_subset.join(edge_class_sum)  # add ALL updated edge info, incl weighted edges

# cases without entries in `edge_class_sum` are presumed to have no RAG edges, proximal or distal 
edge_count_cols = ['edge_class_distal','edge_class_proximal']
feature_subset[edge_count_cols] = feature_subset[edge_count_cols].fillna(0)
feature_subset['any_distal_edge'] = feature_subset['edge_class_distal'] > 0 
feature_subset['any_proximal_edge'] = feature_subset['edge_class_proximal'] > 0 
feature_subset['any_diff_edge'] = feature_subset['any_distal_edge'] | feature_subset['any_proximal_edge']
# Scalar column key: assigning a Series to a one-element list key
# (feature_subset[['all_rag_edge_total']] = ...) raises
# "Columns must be same length as key" on current pandas.
feature_subset['all_rag_edge_total'] = feature_subset[edge_count_cols].sum(axis=1)


#### omit since not focusing on DIE/TCTM
# # indicate whether we see differential DIE status for cases where we DO see general RAG any_diff_edge
# z = feature_subset.loc[(feature_subset['cohort'] == 'cm025') & (feature_subset['any_diff_edge']), 'diff_contact_infiltrated_excluded_or_desert'] > 0 
# feature_subset.loc[(feature_subset['cohort'] == 'cm025') & (feature_subset['any_diff_edge']), 'any_diff_DIE_rag_edge'] = z

# # annotate when category not applicable
# feature_subset.loc[(feature_subset['cohort'] == 'cm025') & (~feature_subset['any_diff_edge']), 'any_diff_DIE_rag_edge'] = 'not_applicable_no_edges'

# for col in edge_class_sum_with_DIE.columns:
#     feature_subset.loc[(feature_subset['cohort'] == 'cm025') & (~feature_subset['any_diff_edge']), col] = 'not_applicable_no_edges'

#### Recalculate `nonstroma_grade_mean` based on updated segmentation scheme 

In [18]:
# per-case mean of the smoothed grade score over tiles flagged as putative tumor
# NOTE(review): this relies on `putative_tumor` being a boolean column after read_csv;
# if it is stored as 0/1 numbers, .loc would treat it as labels rather than a mask -- confirm
updated_slide_grade_scores = tilewise_nontil_info.loc[tilewise_nontil_info.putative_tumor,'smoothed_prob_g4_not_g2'].groupby('unique_id').mean()
feature_subset['updated_nonstroma_grade_mean'] = updated_slide_grade_scores

# override previous slide grade score
# (index-aligned assignment: cases missing from updated_slide_grade_scores become NaN)
feature_subset['nonstroma_grade_mean'] = updated_slide_grade_scores

#### [omitted] Annotate whether TIL info available for TCTM based analysis

In [19]:
# cm025_cases_with_usable_til_data = get_indices(~tctm_segment_mean_pivot['til_count_mean']['center'].isna().groupby('unique_id').apply(np.all))

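- Note: this is where results may diverge from the earlier TCTM-based analysis, because "usable TIL data" is now defined by the presence of high grade foci rather than by the TCTM segment means.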


In [20]:
# cases with at least one node flagged high grade; node_descriptions was already
# limited to high grade nodes above, so this is every case still present in it
cm025_cases_with_usable_til_data = get_indices(node_descriptions['high_grade_involvement'].groupby('unique_id').apply(np.any))

In [21]:
# cell output: number of cases with usable TIL data
len(cm025_cases_with_usable_til_data)

327

In [22]:
# mark the listed cases True, everything else False
# NOTE(review): the .loc assignment expects every listed case to be present in
# feature_subset's index -- confirm
feature_subset.loc[cm025_cases_with_usable_til_data, 'usable_til_data'] = True
feature_subset['usable_til_data'] = feature_subset['usable_til_data'].fillna(False)

#### Annotate whether Non-TIL info available for TCTM based analysis

In [23]:
# every case that has at least one row in seg_agg counts as having usable non-TIL data
# (assumes seg_agg is indexed by the same case ids as feature_subset -- TODO confirm)
feature_subset.loc[seg_agg.index.unique(), 'usable_nontil_data'] = True
feature_subset['usable_nontil_data'] = feature_subset['usable_nontil_data'].fillna(False)

#### Add scores to feature DF

In [24]:
# index-aligned left join: cases without a given score get NaN in that score column
feature_subset = feature_subset.join(score_agg)

#### Add additional slide-level summary metrics for TIL status

In [25]:
# upper bound used when clipping the per-slide TIL averages in the next cell
clip_max = 50

# tiles whose `meta` label is 'tumor'; filtered once and reused below
tumor_tiles = tilewise_anno.loc[tilewise_anno.meta == 'tumor']
slidewise_til_mean = tumor_tiles.groupby('unique_id').til_counts.mean()
# segmentwise_til_mean = tilewise_anno.loc[tilewise_anno.meta == 'tumor'].groupby(['unique_id','merged_labels']).til_counts.mean()

# clipped_slidewise_til_mean = slidewise_til_mean.apply(lambda x: np.clip(x, 0, clip_max))
# clipnorm_slidewise_til_mean = clipped_slidewise_til_mean/clip_max
# exp_slidewise_til_mean_scale = np.e ** clipnorm_slidewise_til_mean

# per-slide fraction of tiles whose TIL count is strictly above TIL_ISO_CUTOFF,
# first over tumor tiles only and then over every tile
slidewise_frac_infiltrated_tumor = tumor_tiles.groupby('unique_id').til_counts.apply(
    lambda counts: (counts > TIL_ISO_CUTOFF).mean()
)
slidewise_frac_infiltrated_all = tilewise_anno.groupby('unique_id').til_counts.apply(
    lambda counts: (counts > TIL_ISO_CUTOFF).mean()
)

In [26]:
# slide-level TIL averages (index-aligned on unique_id)
feature_subset['avg_til_counts_tumor_area'] = slidewise_til_mean
feature_subset['avg_til_counts_all_area'] = tilewise_anno.groupby('unique_id').til_counts.mean()

# slide-level fraction of tiles above the TIL cutoff
feature_subset['avg_frac_infiltrated_tumor_area'] = slidewise_frac_infiltrated_tumor
feature_subset['avg_frac_infiltrated_all_area'] = slidewise_frac_infiltrated_all

# For both averages add three derived columns: the value clipped to [0, clip_max],
# the clipped value rescaled to [0, 1], and e raised to that rescaled value.
for base_col in ('avg_til_counts_tumor_area', 'avg_til_counts_all_area'):
    clipped = feature_subset[base_col].clip(lower=0, upper=clip_max)
    clipnorm = clipped / clip_max

    feature_subset[f'{base_col}_clipped'] = clipped
    feature_subset[f'{base_col}_clipped_normed'] = clipnorm
    feature_subset[f'{base_col}_exp_clipnorm'] = np.e ** clipnorm

### Save updated post-processing features to file

In [27]:
# checkpoint: the second half of this notebook starts by reloading this file
feature_subset.to_csv('./rerun_additional_feature_subset_expanded_postprocessing.csv')

---
## Capture tertiary processing that was happening in main figure NB

### Load saved files 

In [28]:
# Reload the tables written by the first half of the notebook, so the cells from here
# on work from the files on disk.
feature_subset = pd.read_csv('./rerun_additional_feature_subset_expanded_postprocessing.csv', index_col=0)

# NOTE(review): relative path here, while the first load of this annotation file used
# '/home/jupyter/...' -- confirm both resolve to the same file
anno = pd.read_csv('manual_cm025_merged_braunsupp_annotations.csv') 
anno['unique_id'] = 'cm025_' + anno.subjid.astype(str)
paper_desert = anno.loc[anno.ImmunoPhenotype == 'Desert','unique_id'].values
paper_infl = anno.loc[anno.ImmunoPhenotype == 'Infiltrated','unique_id'].values
paper_excl = anno.loc[anno.ImmunoPhenotype == 'Excluded','unique_id'].values
anno = anno.set_index('unique_id')

# only the first csv column becomes the index here, so merged_labels stays a regular column
node_descriptions = pd.read_csv('./rerun_node_descriptions_with_score_components.csv',index_col=[0])
node_descriptions['merged_labels'] = node_descriptions['merged_labels'].astype(int)

# indexed by the first csv column only; the remaining former index levels load as columns
prox_dist_edges = pd.read_csv('./rerun_proximal_and_distal_edge_annotations_with_score_components.csv',index_col=0)

scores = pd.read_csv('./rerun_slidewise_grade_til_scores.csv', index_col=0)

# switch for the optional tile-level / edge-level tables loaded below
load_tile_info = False

if load_tile_info:
    tilewise_anno = pd.read_csv('./rerun_tilewise_grade_til_annotations.csv', index_col=0) # TIL specific CM025 subset

    edge_info = pd.read_csv('./rerun_base_rag_edge_info_annotation.csv', index_col=[0,1,2,3])
    # restore set objects (stored as string when saved to csv earlier)
    edge_info['edge_set'] = edge_info.reset_index()[['edge0','edge1']].apply(lambda x: set(x),1).values

    # read in non-TIL tilewise info 
    seg_agg = pd.read_csv('./rerun_tilewise_grade_nontil_annotations.csv', index_col=0)
    
    edge_agg_pivot = pd.read_csv('./rerun_base_rag_edge_info_annotation_processed.csv', index_col=[0,1,2])
    edge_class_sum = pd.read_csv('./rerun_base_rag_edge_info_annotation_processed_sum.csv', index_col=0)
    edge_binary = edge_class_sum > 0

- Clean up some features that were distorted by upstream subsetting: the edge flags are recomputed below from the unfiltered edge table.

In [29]:
unfiltered_prox_dist_edges = pd.read_csv('./rerun_unfiltered_proximal_and_distal_edge_annotations_with_score_components.csv', index_col=0)

unfiltered_prox_dist_edges['grade_abs_diff'] = (unfiltered_prox_dist_edges['node0_grade_score'] - unfiltered_prox_dist_edges['node1_grade_score']).abs()
unfiltered_prox_dist_edges['both_zero_area_above_cutoff'] = (unfiltered_prox_dist_edges['node0_tiles_above_til_cutoff'] == 0) & (unfiltered_prox_dist_edges['node1_tiles_above_til_cutoff'] == 0)

# clean out edge case where both proximal and distal edges called for same edge
# (the same node pair can show up as (a, b) and (b, a); keep only the proximal call)
df = unfiltered_prox_dist_edges.reset_index()

# orientation-independent key for every node pair
sorted_pairs = np.sort(df[['edge0','edge1']].values, axis=1)
df['sort_n0'] = sorted_pairs[:, 0]
df['sort_n1'] = sorted_pairs[:, 1]

pair_key = ['unique_id','sort_n0','sort_n1']
is_dup = df.duplicated(subset=pair_key, keep=False)  # True for every member of a repeated pair

dups = df.loc[is_dup].set_index(pair_key)
print(dups.shape)
df_filtered = df.loc[~is_dup].set_index(pair_key)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df_filtered = pd.concat([df_filtered, dups.loc[dups['edge_class'] == 'proximal']])

# sanity check: expected to print 0 (no node pair left more than once)
print(df_filtered.reset_index().duplicated(subset=pair_key).sum())

unfiltered_prox_dist_edges = reset_set_idx(df_filtered, ['unique_id','edge0','edge1'])

# Recompute the per-case edge flags from the unfiltered (not high grade limited) edges.
# Columns are selected before summing so unrelated text columns are never aggregated.
edge_counts = unfiltered_prox_dist_edges.groupby('unique_id')[['edge_class_proximal','edge_class_distal']].sum()
edge_flag = edge_counts > 0 

feature_subset.loc[get_indices(edge_flag['edge_class_proximal']), 'any_proximal_edge'] = True
feature_subset.loc[get_indices(edge_flag['edge_class_distal']), 'any_distal_edge'] = True
feature_subset['any_diff_edge'] = feature_subset['any_proximal_edge'] | feature_subset['any_distal_edge']

# NOTE(review): every other table in this notebook is read from a 'rerun_' prefixed file
# ('./rerun_tilewise_grade_til_annotations.csv'); confirm this unprefixed path is intended
# and not a stale copy from before the rerun.
tilewise_anno = pd.read_csv('./tilewise_grade_til_annotations.csv', index_col=0) # TIL specific CM025 subset
nonstroma_tilewise_anno = tilewise_anno.loc[tilewise_anno.meta !='stroma']
per_segment = nonstroma_tilewise_anno.groupby(['unique_id','merged_labels'])
# numeric_only=True spells out the old implicit behaviour (text columns skipped) and
# avoids the TypeError pandas >= 2.0 raises when asked to average text columns
segmentmean = per_segment.mean(numeric_only=True)

# tile count and mean grade score per segment; only this one column is aggregated
seg_counts = per_segment['smoothed_prob_g4_not_g2'].aggregate(['count','mean'])
seg_counts_filtered = seg_counts.loc[seg_counts['count'] >= MIN_SEGMENT_SIZE]
# number of segments per case that reach MIN_SEGMENT_SIZE tiles
node_count = seg_counts_filtered.value_counts('unique_id')
node_count.name = 'total_nodes'
# node_count.index= node_count.index.levels[0]

feature_subset = feature_subset.join(node_count)

# cases without any qualifying segment have NaN total_nodes and therefore evaluate to False
feature_subset['single_node_case'] = feature_subset['total_nodes'] == 1

(38, 36)
0


### HG Foci + TIL calling subset features

In [30]:
# per case: does any node carry the high grade flag?
hg_passing = node_descriptions.groupby('unique_id')['high_grade_involvement'].apply(np.any).map({True:'high_grade_present',False:'no_high_grade_present'})
feature_subset['high_grade_passing'] = hg_passing
# cases that do not appear in node_descriptions at all are treated as having no high grade foci
feature_subset['high_grade_passing'] = feature_subset['high_grade_passing'].fillna('no_high_grade_present')

In [31]:
# .copy() so the label columns below are written to an independent frame rather than to
# a slice of feature_subset (avoids SettingWithCopy ambiguity)
subset = feature_subset.loc[feature_subset['cohort'] == 'cm025'].copy()

hg_inf_cutoff = 0.0556179


def _assign_hg_edge_bin(frame, edge_kind, inf_cutoff):
    """
    Write the `{edge_kind}_hg_bin` label column into frame (in place).

    frame must provide `any_{edge_kind}_edge` (bool), `high_grade_passing` and
    `{edge_kind}_area_weighted`. edge_kind is 'proximal' or 'distal'.
    The five conditions are mutually exclusive; rows matching none stay NaN.
    """
    has_edge = frame[f'any_{edge_kind}_edge']
    hg_present = frame['high_grade_passing'] == 'high_grade_present'
    hg_absent = frame['high_grade_passing'] == 'no_high_grade_present'
    score = frame[f'{edge_kind}_area_weighted']
    bin_col = f'{edge_kind}_hg_bin'

    # no edges at all to consider
    frame.loc[~has_edge, bin_col] = f'no_{edge_kind}_edge'

    # edge present but the slide lacks any HG foci
    frame.loc[has_edge & hg_absent, bin_col] = f'nonpassing_{edge_kind}_edge'

    # edge present and the slide has HG foci, but no edge passed the grade score cutoff
    # (the area weighted score is missing)
    frame.loc[has_edge & hg_present & score.isna(), bin_col] = f'nonpassing_{edge_kind}_edge'

    # passing edge whose higher grade node is LOW infiltration overall
    frame.loc[has_edge & hg_present & (score < inf_cutoff), bin_col] = f'{edge_kind}_hg_noninfiltrated'

    # passing edge whose higher grade node is HIGH infiltration overall
    frame.loc[has_edge & hg_present & (score >= inf_cutoff), bin_col] = f'{edge_kind}_hg_infiltrated'


_assign_hg_edge_bin(subset, 'proximal', hg_inf_cutoff)
# merge_nonoverlapping comes from checkmate_imports; presumably it adds the columns of
# `subset` that feature_subset does not have yet -- TODO confirm
feature_subset = merge_nonoverlapping(feature_subset, subset)

_assign_hg_edge_bin(subset, 'distal', hg_inf_cutoff)
feature_subset = merge_nonoverlapping(feature_subset, subset)
# cell output: size of every distal bin ('MISSING' would reveal unlabelled cases)
subset['distal_hg_bin'].fillna('MISSING').value_counts()

no_distal_edge              533
nonpassing_distal_edge       51
distal_hg_infiltrated        20
distal_hg_noninfiltrated     19
Name: distal_hg_bin, dtype: int64

In [32]:
# annotate which cases have passing edges (higher grade node above 0.80 GS)
# prox_dist_edges was reloaded with its first csv column as the index (presumably
# unique_id -- confirm), so .index.unique() yields case ids
prox_hg_cases = prox_dist_edges.loc[prox_dist_edges['edge_class'] == 'proximal'].index.unique()
dist_hg_cases = prox_dist_edges.loc[prox_dist_edges['edge_class'] == 'distal'].index.unique()

# listed cases become True, all remaining rows False
subset.loc[prox_hg_cases,'any_hg_proximal_edge'] = True
subset['any_hg_proximal_edge'] = subset['any_hg_proximal_edge'].fillna(False)

subset.loc[dist_hg_cases,'any_hg_distal_edge'] = True
subset['any_hg_distal_edge'] = subset['any_hg_distal_edge'].fillna(False)

In [33]:
# slide lacking any HG foci
crit = (subset['high_grade_passing'] == 'no_high_grade_present')
subset.loc[crit, 'global_infiltration_bin'] = 'no_hg_present'

# slide does have HG foci and is lower infiltration overall
crit = (subset['high_grade_passing'] == 'high_grade_present') & (subset['all_nodes_area_weighted'] < hg_inf_cutoff)
subset.loc[crit, 'global_infiltration_bin'] = 'low_hg_global_infiltration'

# slide does have HG foci and is higher infiltration overall (score at or above the cutoff)
crit = (subset['high_grade_passing'] == 'high_grade_present') & (subset['all_nodes_area_weighted'] >= hg_inf_cutoff)
subset.loc[crit, 'global_infiltration_bin'] = 'high_hg_global_infiltration'

# not the last statement of the cell, so this value is computed but never displayed
subset['global_infiltration_bin'].fillna('MISSING').value_counts()

######### add detailed HG edge categories
### proximal, binary version
# has proximal edges, but none of them involves a high grade node
crit = ~subset['any_hg_proximal_edge'] & subset['any_proximal_edge']
subset.loc[crit, 'proximal_edge_detailed'] = 'no_high_grade_proximal_edge'

crit = subset['any_hg_proximal_edge']
subset.loc[crit, 'proximal_edge_detailed'] = 'high_grade_proximal_edge'

# no proximal edge at all: same label as above; assigned last, so it overrides any earlier label
crit = ~subset['any_proximal_edge']
subset.loc[crit, 'proximal_edge_detailed'] = 'no_high_grade_proximal_edge'

### distal, binary version
# has distal edges, but none of them involves a high grade node
crit = ~subset['any_hg_distal_edge'] & subset['any_distal_edge']
subset.loc[crit, 'distal_edge_detailed'] = 'no_high_grade_distal_edge'

crit = subset['any_hg_distal_edge']
subset.loc[crit, 'distal_edge_detailed'] = 'high_grade_distal_edge'

# no distal edge at all: same label as above; assigned last, so it overrides any earlier label
crit = ~subset['any_distal_edge']
subset.loc[crit, 'distal_edge_detailed'] = 'no_high_grade_distal_edge'

### any diff, binary version
crit_a = subset['any_hg_proximal_edge'] | subset['any_hg_distal_edge']
subset.loc[crit_a, 'any_hg_diff_edge'] = 'high_grade_diff_edge'

crit_b = (subset['any_proximal_edge'] | subset['any_distal_edge']) & ~crit_a
subset.loc[crit_b, 'any_hg_diff_edge'] = 'no_high_grade_diff_edge'  # lg diff edge only produces 6 cases and causes overfit problems

# no proximal and no distal edge; assigned last, so it overrides any earlier label
crit_c = (~subset['any_proximal_edge'] & ~subset['any_distal_edge'])
subset.loc[crit_c, 'any_hg_diff_edge'] = 'no_high_grade_diff_edge'

# add to main df
feature_subset = merge_nonoverlapping(feature_subset, subset)

In [35]:
# compatibility addition
# 'passing' when the slide has at least 200 tumor tiles, otherwise 'other'
# (a missing tumor_tile_count compares False and therefore maps to 'other')
feature_subset['candidate_category'] = (feature_subset['tumor_tile_count'] >= 200).map({False:'other',True:'passing'})

In [36]:
# final per-case feature table produced by this notebook
feature_subset.to_csv('./rerun_final_patient_features.csv')