## Imports & Helpers

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

from checkmate_imports import *

# Global settings shared by the cells below.
# presumably the category order used for plot hues -- TODO confirm (not referenced in this section)
HUE_ORDER = ['stroma','pred_g2','intermediate_grade','pred_g4']
# Passed to post_process_seg_graph_simplified and reused as the per-segment tile-count floor.
MIN_SEGMENT_SIZE = 50
GRADE_DIFF_THRESH = 0.35
TUMOR_DIFF_THRESH = 0.35
MIN_TUMOR_SEG_MEAN = 0.70
# GRADE_DIFF_THRESH mapped through invert_rag_weight; passed to post_process_seg_graph_simplified.
NODE_DIFF_CUTOFF = invert_rag_weight(GRADE_DIFF_THRESH)
# presumably tiles are 0.256 mm per side, making this tiles per mm^2 -- TODO confirm
TILES_PER_MM2 = 0.256**-2

MIN_TIL_COUNT = 10
# Per-tile TIL count thresholds.
TIL_ISO_CUTOFF = 14  # based on none vs any AUROC bootstrap on high grade foci + no hard cases
TIL_HIGH_CUTOFF = 48 # based on not-high vs high AUROC bootstrap on high grade foci + no hard cases
# Passed to assign_til_status_label_exp as frac_cutoff / tile_area_cutoff.
FRAC_CUTOFF = 0.25
TIL_AREA_CUTOFF = 10

# assume 7x7 minimum case for a square area focus
# going 2 tiles inner would result in a 5x5 inner square and thus area cutoff of 25
# MIN_CENTER_AREA = 25
MIN_CENTER_AREA = 10  # relaxing from 25 to try to recover possible interesting foci

### Load saved files 

In [2]:
# Patient-level harmonized annotations; the first CSV column becomes the index.
metrics = pd.read_csv('./rerun_patientlevel_harmonized_annotations.csv', index_col=0)

# tile level smoothed p(tumor) and p(g4 not g2) for all cohorts
assigned_df = pd.read_pickle('./rerun_smoothed_tile_level_info.pkl')
# Plain substring swap; regex=False makes the literal match explicit instead of
# relying on the pandas default (which differs between pandas 1.x and 2.x).
assigned_df['full_path'] = assigned_df['full_path'].str.replace('tcga-kidney-tiles', 'kirc', regex=False)

# 20210624NB run on GPU VM using dask `summarize_nucleus_calls_dask_manual_intensity_til_only`
summaries = pd.read_csv('/home/jupyter/20210624NB_aggregating_cm025_nuclei_summaries_both_arms.csv')

# this is the full cm025 annotation set
anno = pd.read_csv('/home/jupyter/manual_cm025_merged_braunsupp_annotations.csv')
anno['unique_id'] = 'cm025_' + anno.subjid.astype(str)

# unique_id arrays for each ImmunoPhenotype value recorded in the annotation table
paper_desert = anno.loc[anno.ImmunoPhenotype == 'Desert','unique_id'].values
paper_infl = anno.loc[anno.ImmunoPhenotype == 'Infiltrated','unique_id'].values
paper_excl = anno.loc[anno.ImmunoPhenotype == 'Excluded','unique_id'].values
anno = anno.set_index('unique_id')

# working copy that later cells extend with derived features
combined_metrics = metrics.copy()

#### Gather two-stage watershed outputs from generic runner
- `full_rerun_all_cohorts_seg_rerun_070_tumor_segmean.py`: Generation of two-stage segmentation labels

In [3]:
# Load the per-cohort two-stage watershed outputs.
# NOTE(review): torch.load unpickles arbitrary objects -- only use on trusted, locally generated files.
seg_outs = {}
for cohort in ['cm025','kirc','profile']:
    seg_outs[cohort] = torch.load(f'rerun_{cohort}_twostage_watershed_out_rerun.pkl')

# Split entries into those carrying a 'unique_id' key (kept whole, keyed by that id)
# and those without one (only their seg_df is kept, keyed by the id found inside it).
seg_out_agg = {}
failed_ids = []
failed_data_agg = {}
for seg_out_data in seg_outs.values():
    for entry in seg_out_data:
        try:
            unique_id = entry['unique_id']
        except KeyError:
            # entry has no top-level id: recover it from the seg_df itself
            uid = entry['seg_df']['unique_id'].unique()[0]
            failed_ids.append(uid)
            failed_data_agg[uid] = entry['seg_df']
        else:
            seg_out_agg[unique_id] = entry

In [4]:
# Number of ids recorded in failed_ids.
len(failed_ids)

82

In [5]:
# Number of entries held in seg_out_agg.
len(seg_out_agg)

1297

#### Save tile level info DF 

In [6]:
# Gather a copy of every entry's seg_df; an entry that raises is printed and skipped.
tilewise_nontil_info = []
for unique_id, outs in seg_out_agg.items():
    try:
        tilewise_nontil_info.append(outs['seg_df'].copy())
    except Exception as e:
        print(e)

# Stack all tiles, keep one row per (unique_id, x, y), and index by unique_id.
# (TCTM legacy filter on seg_label != 'all_tumor' is intentionally not applied.)
tilewise_nontil_info = (
    pd.concat(tilewise_nontil_info)
    .reset_index()
    .drop_duplicates(['unique_id', 'x', 'y'])
    .set_index(['unique_id'])
)
tilewise_nontil_info.to_csv('./rerun_all_cohorts_passing_twostage_segmentation_tile_level_info.csv')

Omitted: `20210826_cm025_tctm_calling_all_cohorts.py`

#### Aggregate RAG edge information 

In [7]:
rag_type = 'premerge_expansion_rags'

# initially based on 20210513 NB on pivoting to get counts of edge types
count_store = []
weighted_edge_info_store = []
merged_descriptions = []
node_summaries = {}
processed_graphs = {}

for uid, outs in seg_out_agg.items():
    # graphs keyed by expansion distance, with the base graph stored under key 0
    rags = deepcopy(outs[rag_type])
    rags[0] = outs['rag']

    for dist, entry in rags.items():
        g = post_process_seg_graph_simplified(entry.copy(), uid, MIN_SEGMENT_SIZE, NODE_DIFF_CUTOFF)

        # node-level info comes from the base graph only; other distances contribute edges
        if dist == 0:
            processed_graphs[uid] = g
            node_summaries[uid] = summarize_nodes(g, outs['seg_df'])

        if len(g.edges) == 0:
            continue

        # one row per edge, columns are the edge attributes
        edge_desc = pd.DataFrame(dict(g.edges.items())).T
        edge_desc.index = edge_desc.index.set_names(['edge0', 'edge1'])
        edge_desc['unique_id'] = uid
        edge_desc['expansion_dist'] = dist
        weighted_edge_info_store.append(
            reset_set_idx(edge_desc, ['unique_id', 'expansion_dist', 'edge0', 'edge1'])
        )

weighted_edge_info_store = pd.concat(weighted_edge_info_store)
weighted_edge_info_store['min_node_area'] = weighted_edge_info_store[
    ['node0_tumor_area_frac', 'node1_tumor_area_frac']
].min(axis=1)
# unordered {edge0, edge1} pair for each row
weighted_edge_info_store['edge_set'] = (
    weighted_edge_info_store.reset_index()[['edge0', 'edge1']]
    .apply(lambda pair: set(pair), axis=1)
    .values
)

In [8]:
# (rows, columns) of the stacked edge table.
weighted_edge_info_store.shape

(9685, 8)

#### Aggregate RAG edges into "proximal" vs "distal" categories
- Collapse default and expansion distance 1 into "proximal"
- Collapse expansion distance 10 & 25 into "distal"

In [9]:
# Run check_label_set_df on each slide's edges against that slide's seg_df.
edge_class_agg = []
for uid in weighted_edge_info_store.index.levels[0]:
    ex_out = seg_out_agg[uid]
    seg_df = ex_out['seg_df']
    edge_subset = weighted_edge_info_store.loc[[uid]]
    edge_subset = check_label_set_df(seg_df, edge_subset)
    edge_class_agg.append(edge_subset)

edge_class_agg = pd.concat(edge_class_agg)

# fix object dtype assignment: cast every column that can be cast to float and
# leave the rest (e.g. strings, sets) untouched
for col in edge_class_agg.columns:
    try:
        edge_class_agg[col] = edge_class_agg[col].astype(float)
    except (ValueError, TypeError):
        # column is not float-convertible; keep its original dtype
        pass

# One row per edge, one column per expansion distance, holding the edge 'diff'.
edge_class_agg_pivot = edge_class_agg.reset_index().pivot_table(index=['unique_id','edge0','edge1','passing_edge'], columns='expansion_dist', values='diff')
edge_class_agg_pivot = edge_class_agg_pivot.reset_index(level=3)
edge_class_agg_pivot['passing_edge'] = edge_class_agg_pivot['passing_edge'].astype(bool)
# Only passing edges are classified; everything else is labelled 'not_eligible'.
edge_class_agg_pivot.loc[edge_class_agg_pivot.passing_edge, 'edge_class'] = edge_class_agg_pivot.loc[edge_class_agg_pivot.passing_edge].apply(classify_distal_vs_proximal_edge, axis=1)
edge_class_agg_pivot['edge_class'] = edge_class_agg_pivot['edge_class'].fillna('not_eligible')

# Per-slide sums, with one indicator column per edge class.
edge_class_sum = pd.get_dummies(edge_class_agg_pivot, columns=['edge_class']).groupby('unique_id').sum()

In [10]:
# (rows, columns) of the per-slide edge-class sums.
edge_class_sum.shape

(1001, 8)

#### Integrate previous 20210708 TIL info upstream of iterating through RAGs for further annotation

In [11]:
count_cols = ['tumor_counts', 'til_counts', 'stroma_counts']
summary_derived_cols = [
    'stroma_tumor_ratio', 'stroma_til_ratio',
    'tumor_counts', 'til_counts', 'stroma_counts',
]

# Combine the tile table with the nucleus summaries on (unique_id, x, y).
tilewise_til_info = merge_nonoverlapping(
    tilewise_nontil_info.set_index(['x', 'y'], append=True),
    summaries.rename(columns={'tx': 'x', 'ty': 'y'}).set_index(['unique_id', 'x', 'y']),
)
# Keep only slides that appear in the summaries.
tilewise_til_info = tilewise_til_info.loc[
    tilewise_til_info.index.levels[0].intersection(summaries.unique_id.unique())
]
# tiles with 0 TIL counts need to be filled
tilewise_til_info[count_cols] = tilewise_til_info[count_cols].fillna(0)

# Per-segment mean/std/count of every column, then drop segments whose
# til_counts tile count does not exceed MIN_SEGMENT_SIZE.
tilewise_nontil_inforegation = tilewise_til_info.groupby(['unique_id', 'merged_labels']).agg(['mean', 'std', 'count'])
tilewise_nontil_info_filtered = tilewise_nontil_inforegation.loc[
    tilewise_nontil_inforegation[('til_counts', 'count')] > MIN_SEGMENT_SIZE
]

In [12]:
# key: per segment, count the tiles whose TIL count is above TIL_ISO_CUTOFF
til_context_agg = (
    tilewise_til_info.groupby(['unique_id', 'merged_labels'])[['til_counts']]
    .apply(lambda x: (x > TIL_ISO_CUTOFF).sum())
)
til_context_agg = til_context_agg.rename(columns={'til_counts': 'tiles_above_til_cutoff'})
til_context_agg = til_context_agg.join(tilewise_nontil_info_filtered['til_counts']['count'])
til_context_agg = til_context_agg.dropna()
til_context_agg['til_status_label_exp'] = til_context_agg.apply(
    lambda x: assign_til_status_label_exp(
        x['tiles_above_til_cutoff'],
        x['count'],
        frac_cutoff=FRAC_CUTOFF,
        tile_area_cutoff=TIL_AREA_CUTOFF,
    ),
    axis=1,
)

# assign low vs high (can be redundant if only using a single cutoff)
til_status = tilewise_nontil_info_filtered['til_counts']['mean'].apply(
    lambda x: assign_til_status_label(x, TIL_ISO_CUTOFF, TIL_HIGH_CUTOFF)
)
til_status.name = 'til_status_basic'
til_context_agg = til_context_agg.join(til_status)

# again, redundant if we only use one cutoff:
# dispersed segments are split by their basic status ...
is_dispersed = til_context_agg['til_status_label_exp'] == 'dispersed_infiltration'
for basic_label, combined_label in [
    ('highly_infiltrated', 'highly_infiltrated_dispersed'),
    ('intermed_infiltrated', 'intermed_infiltrated_dispersed'),
    ('non_infiltrated', 'low_infiltrated_dispersed'),
]:
    til_context_agg.loc[
        is_dispersed & (til_context_agg['til_status_basic'] == basic_label),
        'til_status_combined',
    ] = combined_label
# ... while localized / non-infiltrated segments keep their label unchanged
for exp_label in ['localized_infiltration', 'non_infiltrated']:
    til_context_agg.loc[
        til_context_agg['til_status_label_exp'] == exp_label, 'til_status_combined'
    ] = exp_label

# add actual TIL counts/avgs back
segment_til_mean = tilewise_til_info.groupby(['unique_id', 'merged_labels'])['til_counts'].mean()
til_context_agg = til_context_agg.join(segment_til_mean).rename(
    columns={'til_counts': 'til_counts_tile_avg', 'count': 'tile_count'}
)

# per-slide number of segments in each combined TIL status
til_context_agg_piv = til_context_agg.reset_index().value_counts(subset=['unique_id', 'til_status_combined'])
til_context_agg_piv.name = 'cat_counts'
til_context_agg_piv = pd.DataFrame(til_context_agg_piv.reset_index())
til_context_agg_piv = til_context_agg_piv.pivot_table(
    index='unique_id', columns=['til_status_combined'], values='cat_counts'
).fillna(0)
dispersed_cols = [x for x in til_context_agg_piv.columns if 'dispersed' in x]
til_context_agg_piv['any_dispersed'] = til_context_agg_piv[dispersed_cols].sum(axis=1)
til_context_agg['til_cutoff'] = TIL_ISO_CUTOFF

# slides holding both a non-infiltrated segment and an intermed/highly infiltrated dispersed one
contrast_candidates = get_indices(
    til_context_agg.groupby('unique_id').til_status_combined.apply(
        lambda x: np.any(x == 'non_infiltrated')
        & np.any(np.isin(x, ['intermed_infiltrated_dispersed', 'highly_infiltrated_dispersed']))
    )
)

context_mapper = {
    'til_contrast_category': [
        'non_infiltrated_bordering_localized',
        'contrasting_dispersed_highly_bordering_intermed',
        'contrasting_dispersed_intermed_bordering_highly',
        'non_infiltrated_bordering_dispersed',
        'localized_bordering_dispersed',
    ],
    'combined_edge_context': [
        'lower_grade_non_infiltrated_higher_grade_localized',
        'higher_grade_non_infiltrated_lower_grade_localized',
        'lower_grade_non_infiltrated_higher_grade_dispersed',
        'higher_grade_non_infiltrated_lower_grade_dispersed',
        'lower_grade_localized_higher_grade_dispersed',
        'higher_grade_localized_lower_grade_dispersed',
        'lower_grade_more_infiltrated_both_dispersed',
        'higher_grade_more_infiltrated_both_dispersed',
    ],
    'general_edge_context': [
        'lower_grade_more_infiltrated',
        'higher_grade_more_infiltrated',
    ],
}

#### Add additional post-processing (TIL, etc) info, aggregate counts for discrete descriptions

In [13]:
%%capture
# 20210513 NB on pivoting to get counts of edge types
# For every slide with TIL context info: annotate its base graph with TIL info,
# count each discrete edge description, and collect the merged node descriptions.
count_store = []
merged_descriptions = []
# tilewise_nontil_info = tilewise_nontil_info.set_index('unique_id')

# for uid in til_context_agg.index.levels[0]: # one case doesn't overlap
for uid in til_context_agg.index.get_level_values(0).unique():
    g = processed_graphs[uid].copy()
    g = add_TIL_info(g, til_context_agg.loc[uid])
    summary = summarize_nodes(g, tilewise_nontil_info.loc[uid])

    counts = pd.DataFrame()
    for col in ['til_contrast_category','general_edge_context','combined_edge_context']:
        # values of this attribute over the edges that carry it
        temp_agg = [v[col] for k,v in g.edges.items() if col in v.keys()]
        temp_agg = pd.Series(temp_agg, name=col).astype('category')
        temp_agg = temp_agg.value_counts()
        temp_agg.name = col+'_count'
        temp_agg = pd.DataFrame(temp_agg)
        temp_agg['unique_id'] = uid
        temp_agg.index.name = col
        temp_agg = temp_agg.pivot_table(columns=col, index='unique_id')
        # axis must be passed by keyword: the positional form was removed in pandas 2.0
        counts = pd.concat([counts, temp_agg], axis=1)
    count_store.append(counts)

    merged_node_desc = summary['seg_df'].join(til_context_agg.loc[uid])
    merged_node_desc['unique_id'] = uid
    merged_descriptions.append(merged_node_desc.set_index(['unique_id'], append=True).reorder_levels([1,0]))

# categories absent from a slide count as 0
count_store = pd.concat(count_store).fillna(0)
merged_descriptions = pd.concat(merged_descriptions)

#### Describe stromal infiltration [basic]

In [14]:
# Collapse the three dispersed variants into one label; the other two map to themselves.
stroma_til_mapper = {
    **dict.fromkeys(
        [
            'low_infiltrated_dispersed',
            'intermed_infiltrated_dispersed',
            'highly_infiltrated_dispersed',
        ],
        'dispersed_infiltration',
    ),
    'non_infiltrated': 'non_infiltrated',
    'localized_infiltration': 'localized_infiltration',
}

# Rows whose second index level (merged_labels) is 0, relabelled and indexed by unique_id only.
stromal_til_description_basic = (
    til_context_agg.loc[pd.IndexSlice[:, 0], :]['til_status_combined']
    .map(stroma_til_mapper)
    .rename('stroma_til_status')
    .reset_index(level=1, drop=True)
)
stromal_til_description_basic.value_counts()

localized_infiltration    383
dispersed_infiltration    122
non_infiltrated            50
Name: stroma_til_status, dtype: int64

#### Define general isolation+infiltration status labels

In [15]:
# Slide-level mean over non-stroma tiles. numeric_only=True keeps the historical
# behaviour (non-numeric columns dropped) and avoids a TypeError on pandas >= 2.0.
slidemean = tilewise_til_info.loc[tilewise_til_info.meta !='stroma'].groupby('unique_id').mean(numeric_only=True)

# Per slide: selected node columns joined with the compare_nodes output
# (segment vs slide mean on 'smoothed_prob_g4_not_g2').
iso_desc = []
for uid in merged_descriptions.index.levels[0]:
    x = merged_descriptions.loc[uid,['smoothed_prob_g4_not_g2', 'connection_category', 'degree','tiles_above_til_cutoff','tile_count','til_counts_tile_avg','til_status_combined']]
    x = x.join(compare_nodes(merged_descriptions.loc[uid], slidemean.loc[uid],'smoothed_prob_g4_not_g2'))
    x['unique_id'] = uid
    iso_desc.append(x.set_index(['unique_id'], append=True).reorder_levels([1,0]))
iso_desc = pd.concat(iso_desc)

# Isolated nodes get a combined "<connection>_<til status>_<relative grade>" label.
iso_combined_labels = iso_desc.loc[iso_desc.connection_category=='isolated'][['connection_category','til_status_combined','rel_grade_label']].apply(lambda x: '_'.join(x), 1)
iso_desc.loc[iso_desc.connection_category=='isolated', 'isolation_til_status'] = iso_combined_labels
isolation_til_status_cats = iso_desc.isolation_til_status.dropna().unique()
iso_desc['isolation_til_status'] = iso_desc['isolation_til_status'].astype('category')

# Per-slide counts of each isolation label (one row per slide).
iso_counts = pd.DataFrame()
for uid, df in iso_desc.groupby('unique_id'):
    y = df['isolation_til_status'].value_counts()
    y.name = uid
    y = pd.DataFrame(y)
    iso_counts = pd.concat([iso_counts, y.transpose()])

# axis is passed by keyword throughout: the positional form was removed in pandas 2.0
updated_count_stats = pd.concat([count_store['combined_edge_context_count'], iso_counts], axis=1)

# updated_count_stats = pd.concat([pd.concat([count_store[x] for x in count_store.columns.levels[0]], 1), iso_counts], 1)
updated_count_stats['isolated_infiltrated_total'] = iso_counts[[x for x in iso_counts.columns if 'non_infiltrated' not in x]].sum(1)
updated_count_stats['isolated_non_infiltrated_total'] = iso_counts[[x for x in iso_counts.columns if 'non_infiltrated' in x]].sum(1)

# Coarse 2x2 status: (isolated | nonisolated) x (infiltrated | noninfiltrated).
iso_desc['is_isolated'] = iso_desc['connection_category'] == 'isolated'
iso_desc['is_infiltrated'] = iso_desc['til_status_combined'] != 'non_infiltrated'
iso_desc['general_iso_inf_status'] = iso_desc['is_isolated'].map({True:'isolated',False:'nonisolated'}) + '_' + iso_desc['is_infiltrated'].map({True:'infiltrated',False:'noninfiltrated'})
iso_desc['general_iso_inf_status'] = iso_desc['general_iso_inf_status'].astype('category')

updated_count_stats = pd.concat([updated_count_stats, aggregate_counts(iso_desc, 'general_iso_inf_status')], axis=1)

# Node-level "<til status>_<relative grade>" label; every infiltrated variant is
# collapsed to 'infiltrated_' + the last three '_'-separated tokens of the label.
generic_combined_labels = iso_desc[['til_status_combined','rel_grade_label']].fillna('no_info').apply(lambda x: '_'.join(x), 1)
infiltration_cats = generic_combined_labels.loc[(~generic_combined_labels.str.startswith('non_infiltrated')) & (~generic_combined_labels.str.startswith('no_info'))].unique()
non_infiltration_cats = generic_combined_labels.loc[(generic_combined_labels.str.startswith('non_infiltrated'))].unique()
infiltration_cats_mapper = {x: 'infiltrated_' + '_'.join(x.split('_')[-3:]) for x in infiltration_cats}

# non-infiltrated labels map to themselves; 'no_info' labels are unmapped and become NaN
for entry in non_infiltration_cats:
    infiltration_cats_mapper[entry] = entry

iso_desc['general_node_infiltration_rel_grade'] = generic_combined_labels.map(infiltration_cats_mapper).astype('category')
general_counts = aggregate_counts(iso_desc, 'general_node_infiltration_rel_grade')
updated_count_stats = pd.concat([updated_count_stats, general_counts], axis=1)
simple_iso_inf_cats = ['nonisolated_noninfiltrated', 'isolated_infiltrated', 'isolated_noninfiltrated', 'nonisolated_infiltrated']

#### Define groups of features and create formulas 

In [16]:
# Column groups used to build the additive formula strings below.
til_semantic_cols = [
    'highly_infiltrated_dispersed',
    'intermed_infiltrated_dispersed',
    'localized_infiltration',
    'low_infiltrated_dispersed',
    'non_infiltrated',
]

rag_feature_cols = [
    'major_diff_edge_count',
    'minor_diff_edge_count',
    'largest_cc_size',
    'avg_degree',
    'high_amidst_low_count',
    'high_bordering_low_count',
    'isolated_count',
    'low_amidst_high_count',
    'low_bordering_high_count',
]

combined_info_cols = [
    'combined_rag_differential_label',
    'total_foci_infiltration_label',
]

# "a + b + c" style right-hand sides, one per column group
rag_feature_formula, til_semantic_feature_formula, combined_feature_formula = (
    ' + '.join(cols) for cols in (rag_feature_cols, til_semantic_cols, combined_info_cols)
)
general_iso_inf_formula = ' + '.join(iso_desc['general_iso_inf_status'].unique())

In [17]:
# Per-slide fraction of tiles above the TIL cutoff, over all tiles and over non-stroma tiles.
iso_comparison = []
for iso_cutoff in [TIL_ISO_CUTOFF]:  # alternative sweep: [5, 10, 25]
    temp_overall = (tilewise_til_info['til_counts'] > iso_cutoff).groupby('unique_id').mean()
    temp_overall.name = 'overall_til_fraction'

    non_stroma_tiles = tilewise_til_info.loc[tilewise_til_info['meta'] != 'stroma']
    temp_tumor = (non_stroma_tiles['til_counts'] > iso_cutoff).groupby('unique_id').mean()
    temp_tumor.name = 'tumor_til_fraction'

    df = pd.DataFrame()
    df['overall_til_fraction'] = temp_overall
    df['tumor_til_fraction'] = temp_tumor
    df['til_iso_cutoff'] = iso_cutoff
    iso_comparison.append(df)

iso_comparison = pd.concat(iso_comparison)

#### Combine info to synthesize further labels

In [18]:
# Per-slide number of edges in each diff_category (all expansion distances pooled).
# The count column is named explicitly so the pivot does not depend on the default
# name of the value_counts result (unnamed -> 0 in pandas 1.x, 'count' in pandas 2.x).
(
    weighted_edge_info_store.value_counts(['unique_id', 'diff_category'])
    .rename('edge_count')
    .reset_index()
    .pivot_table(index='unique_id', columns='diff_category', values='edge_count')
    .fillna(0)
)

diff_category,major,minor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-3Z-A93Z,0.0,2.0
TCGA-A3-3306,0.0,20.0
TCGA-A3-3308,8.0,5.0
TCGA-A3-3311,1.0,0.0
TCGA-A3-3316,0.0,6.0
...,...,...
profile_1096511,15.0,24.0
profile_1096516,1.0,4.0
profile_1096695,12.0,10.0
profile_1096876,0.0,2.0


In [19]:
# Number of edges per (slide, expansion distance) in each diff_category.
# The count column is named explicitly so the pivot does not depend on the default
# name of the value_counts result (unnamed -> 0 in pandas 1.x, 'count' in pandas 2.x).
major_minor_edge_agg = (
    weighted_edge_info_store.value_counts(['unique_id', 'expansion_dist', 'diff_category'])
    .rename('edge_count')
    .reset_index()
    .pivot_table(index=['unique_id', 'expansion_dist'], columns='diff_category', values='edge_count')
    .fillna(0)
)

In [20]:
# Collapse to one row per slide, keeping the column-wise maximum.
major_minor_edge_agg = major_minor_edge_agg.groupby(level='unique_id').max()

In [21]:
# Preview the first rows of the per-slide major/minor edge counts.
major_minor_edge_agg.head()

diff_category,major,minor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-3Z-A93Z,0.0,1.0
TCGA-A3-3306,0.0,10.0
TCGA-A3-3308,2.0,2.0
TCGA-A3-3311,1.0,0.0
TCGA-A3-3316,0.0,4.0


In [22]:
# Build `combined_metrics` (patient metrics + TIL fraction + count features) and
# `subset` (slides with TIL context info) carrying the synthesized categorical labels.
# The .loc assignments below are order-dependent: later assignments overwrite earlier ones.
binarize_count_info = False

# combined_metrics = metrics.copy()
# combined_metrics = feature_subset.drop(columns=['tumor_til_fraction']).join(iso_comparison.loc[iso_comparison.til_iso_cutoff == TIL_ISO_CUTOFF,'tumor_til_fraction'])
combined_metrics = metrics.copy()
combined_metrics = combined_metrics.join(iso_comparison.loc[iso_comparison.til_iso_cutoff == TIL_ISO_CUTOFF,'tumor_til_fraction'])

if binarize_count_info:
    print('using binarized count info')
    raise NotImplementedError
else:
    print('using native count format data')
#     combined_metrics = combined_metrics.drop(columns=updated_count_stats.columns).join(updated_count_stats)
    combined_metrics = combined_metrics.join(updated_count_stats)

# new / reworked
# slides without any edge rows get a count of 0
combined_metrics['major_diff_edge_count'] = major_minor_edge_agg['major']
combined_metrics['minor_diff_edge_count'] = major_minor_edge_agg['minor']
combined_metrics['major_diff_edge_count'] = combined_metrics['major_diff_edge_count'].fillna(0)
combined_metrics['minor_diff_edge_count'] = combined_metrics['minor_diff_edge_count'].fillna(0)

combined_metrics['any_major_edge'] = (combined_metrics['major_diff_edge_count'] > 0).map({False:'not_present', True:'present'})
combined_metrics['any_minor_edge'] = (combined_metrics['minor_diff_edge_count'] > 0).map({False:'not_present', True:'present'})
combined_metrics['any_rag_edge'] = ((combined_metrics['major_diff_edge_count'] + combined_metrics['minor_diff_edge_count']) > 0).map({False:'not_present', True:'present'})

# "similar" edge contexts: name contains 'both' or 'same' but not 'non'; everything else is "diff"
combined_edge_context_similar_cols = [x for x in count_store['combined_edge_context_count'].columns if (('both' in x) | ('same' in x)) & ('non' not in x)]
combined_edge_context_dif_cols = count_store['combined_edge_context_count'].columns.difference(combined_edge_context_similar_cols)

subset = combined_metrics.reindex(list(til_context_agg.index.levels[0]))
subset['similar_infiltration_along_rag_count'] = subset[combined_edge_context_similar_cols].sum(1)
subset['diff_infiltration_along_rag_count'] = subset[combined_edge_context_dif_cols].sum(1)

subset.loc[(subset['similar_infiltration_along_rag_count'] > 0) & (subset['diff_infiltration_along_rag_count'] == 0), 'rag_differential_infiltration_label'] = 'all_similar_any_infiltration_along_rag_edges'
# NOTE(review): if 'same_non_bordering_non' is one of the combined_edge_context_count columns it
# falls in the "diff" group (its name contains 'non'), so this label would always be overwritten
# by the assignments below -- confirm intent.
subset.loc[subset['same_non_bordering_non'] > 0, 'rag_differential_infiltration_label'] = 'all_similar_noninfiltration_along_rag_edges'

subset.loc[(subset['similar_infiltration_along_rag_count'] == 0) & (subset['diff_infiltration_along_rag_count'] > 0), 'rag_differential_infiltration_label'] = 'all_differential_along_rag_edges'
subset.loc[(subset['similar_infiltration_along_rag_count'] > 0) & (subset['diff_infiltration_along_rag_count'] > 0), 'rag_differential_infiltration_label'] = 'mixed_differential_along_rag_edges'
subset.loc[(subset['similar_infiltration_along_rag_count'] == 0) & (subset['diff_infiltration_along_rag_count'] == 0), 'rag_differential_infiltration_label'] = 'other'

##########
# broad variant: all-differential and mixed-differential are merged into one label
subset['rag_differential_infiltration_label_broad'] = subset['rag_differential_infiltration_label']
crit = (subset['rag_differential_infiltration_label'] == 'all_differential_along_rag_edges') | (subset['rag_differential_infiltration_label'] == 'mixed_differential_along_rag_edges')
subset.loc[crit, 'rag_differential_infiltration_label_broad'] = 'any_differential_along_rag_edges'

##########
# slides without any RAG edge: summarize infiltration of their isolated foci
crit = (subset['any_rag_edge'] == 'not_present') & (subset['isolated_infiltrated'] == 0) & (subset['isolated_noninfiltrated'] > 0)
subset.loc[crit, 'isolation_infiltration_summary'] = 'all_isolated_noninfiltrated'

crit = (subset['any_rag_edge'] == 'not_present') & (subset['isolated_infiltrated'] > 0) & (subset['isolated_noninfiltrated'] == 0)
subset.loc[crit, 'isolation_infiltration_summary'] = 'all_isolated_infiltrated'

crit = (subset['any_rag_edge'] == 'not_present') & (subset['isolated_infiltrated'] > 0) & (subset['isolated_noninfiltrated'] > 0)
subset.loc[crit, 'isolation_infiltration_summary'] = 'isolated_mixed_infiltration'

##########
# two-level variant: all isolated foci infiltrated vs. any non-infiltrated isolated focus
crit = (subset['any_rag_edge'] == 'not_present') & (subset['isolated_infiltrated'] > 0) & (subset['isolated_noninfiltrated'] == 0)
subset.loc[crit, 'isolation_all_infiltrated'] = 'all_isolated_infiltrated'

crit = (subset['any_rag_edge'] == 'not_present') & (subset['isolated_noninfiltrated'] > 0)
subset.loc[crit, 'isolation_all_infiltrated'] = 'isolated_mixed_or_noninfiltrated'

##########
subset['isolation_all_infiltrated'] = subset['isolation_all_infiltrated'].fillna('other')

##########
# combined label: edge-based label where edges exist, isolation-based label otherwise
subset.loc[(subset['any_rag_edge'] == 'present'), 'combined_rag_differential_label'] = subset.loc[(subset['any_rag_edge'] == 'present'), 'rag_differential_infiltration_label_broad']
subset.loc[(subset['any_rag_edge'] == 'not_present'), 'combined_rag_differential_label'] = subset.loc[(subset['any_rag_edge'] == 'not_present'), 'isolation_all_infiltrated']

subset['total_noninfiltrated_foci'] = updated_count_stats[['nonisolated_noninfiltrated','isolated_noninfiltrated']].sum(1)
subset['total_infiltrated_foci'] = updated_count_stats[['nonisolated_infiltrated','isolated_infiltrated']].sum(1)

crit = (subset['total_noninfiltrated_foci'] > 0 )
subset.loc[crit, 'total_foci_infiltration_label'] = 'some_foci_noninfiltrated'

# boolean masks are combined with `&` (logical AND) rather than arithmetic `*`
crit = (subset['total_infiltrated_foci'] > 0) & (subset['total_noninfiltrated_foci'] == 0 )
subset.loc[crit, 'total_foci_infiltration_label'] = 'all_foci_infiltrated'

subset['total_foci_infiltration_label'].value_counts()

using native count format data


all_foci_infiltrated        370
some_foci_noninfiltrated    182
Name: total_foci_infiltration_label, dtype: int64

In [23]:
# Number of rows in `subset` whose total_infiltrated_foci equals 0.
(subset['total_infiltrated_foci'] == 0).sum()

68

#### [Ignore??] Pare down some of the features in RAG set 

In [24]:
# binarize_count_info = False
# feature_subset = metrics.copy()
# feature_subset = feature_subset.join(iso_comparison.loc[iso_comparison.til_iso_cutoff == 10,'tumor_til_fraction'])
# feature_subset = feature_subset.join(stromal_til_description_basic)

# if binarize_count_info:
#     print('using binarized count info and limiting to columns with at least 10% labels == present')
#     binarized_count_col_subset = get_indices((updated_count_stats_bool == 'present').mean() >= 0.1)
#     feature_subset = feature_subset.join(updated_count_stats_bool[binarized_count_col_subset])
# else:
#     print('using native count format data')
#     feature_subset = feature_subset.join(updated_count_stats)

# feature_subset = feature_subset.join(subset['combined_rag_differential_label'])
# feature_subset = feature_subset.join(subset['total_foci_infiltration_label'])
# feature_subset = feature_subset.join(subset[['diff_infiltration_along_rag_count','similar_infiltration_along_rag_count']])

# feature_subset['any_major_edge'] = (feature_subset['major_diff_edge_count'] > 0).map({False:'not_present', True:'present'})
# feature_subset['any_minor_edge'] = (feature_subset['minor_diff_edge_count'] > 0).map({False:'not_present', True:'present'})
# feature_subset['any_rag_edge'] = ((feature_subset['major_diff_edge_count'] + feature_subset['minor_diff_edge_count']) > 0).map({False:'not_present', True:'present'})
# feature_subset['rag_edge_total'] = feature_subset['minor_diff_edge_count'] + feature_subset['major_diff_edge_count']

- Note: `combined_rag_differential_label` causes a large loss of CM025 cases if we keep the earlier step that removed cases labelled `'other'`.

In [25]:
# Run check_label_set_df on each slide's edges against its rows of the tile table;
# a slide that fails is reported and skipped (best effort).
edge_agg = []
for uid in weighted_edge_info_store.index.levels[0]:
    try:
        seg_df = tilewise_nontil_info.loc[[uid]]
        edge_subset = weighted_edge_info_store.loc[[uid]]
        edge_subset = check_label_set_df(seg_df, edge_subset)
        edge_agg.append(edge_subset)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate
        print(f'{uid} failed to aggregate')

edge_agg = pd.concat(edge_agg)

# fix object dtype assignment: cast every column that can be cast to float and
# leave the rest (e.g. strings, sets) untouched
for col in edge_agg.columns:
    try:
        edge_agg[col] = edge_agg[col].astype(float)
    except (ValueError, TypeError):
        # column is not float-convertible; keep its original dtype
        pass

#### skip over `feature_subset` and use `combined_metrics`

In [26]:
# Independent (deep) copy, so later edits to feature_subset leave combined_metrics untouched.
feature_subset = combined_metrics.copy(deep=True)

#### Get Segment-Wise Mean Info

In [27]:
# Per-segment mean of the numeric tile features over non-stroma tiles.
# numeric_only=True keeps the historical behaviour (non-numeric columns such as
# `meta` are dropped) and avoids a TypeError on pandas >= 2.0.
segmentwise_mean = tilewise_nontil_info.loc[tilewise_nontil_info.meta !='stroma'].groupby(['unique_id','merged_labels']).mean(numeric_only=True)

#### Call Proximal vs Distal Edges & Sum

In [28]:
# One row per edge; 'diff' and 'tumor_area_frac_sum' spread over expansion distances.
edge_agg_pivot = edge_agg.reset_index().pivot_table(
    index=['unique_id', 'edge0', 'edge1', 'passing_edge'],
    columns='expansion_dist',
    values=['diff', 'tumor_area_frac_sum'],
)
edge_agg_pivot = edge_agg_pivot.reset_index(level=3)
edge_agg_pivot['passing_edge'] = edge_agg_pivot['passing_edge'].astype(bool)

# Classify every edge from its per-distance 'diff' values, then mark edges that
# are not passing (or got no class) as 'not_eligible'.
edge_agg_pivot['edge_class'] = edge_agg_pivot['diff'].apply(classify_distal_vs_proximal_edge, axis=1)
not_passing = ~edge_agg_pivot['passing_edge']
edge_agg_pivot.loc[not_passing, 'edge_class'] = 'not_eligible'
edge_agg_pivot['edge_class'] = edge_agg_pivot['edge_class'].fillna('not_eligible')

# Per-slide sums, with one indicator column per edge class.
edge_class_sum = pd.get_dummies(edge_agg_pivot, columns=['edge_class']).groupby('unique_id').sum()

#### Add node0/1 annotations to each edge 

In [29]:
# Tumor tile count per segment, plus the segment mean grade score.
all_seg_counts = tilewise_nontil_info.loc[tilewise_nontil_info['meta'] == 'tumor'].value_counts(subset=['unique_id','merged_labels'])
all_seg_counts.name = 'seg_size'
all_seg_counts = pd.DataFrame(all_seg_counts).join(segmentwise_mean['smoothed_prob_g4_not_g2']) # add segment mean grade score
all_seg_counts = all_seg_counts.reset_index()
# Segment size as a fraction of the slide total. transform() returns a result aligned
# to the original rows on every pandas version; groupby.apply() prepends the group key
# to the index on pandas >= 2.0, which breaks the column assignment.
all_seg_counts['seg_area_frac'] = all_seg_counts.groupby(['unique_id'])['seg_size'].transform(lambda x: x/x.sum())
all_seg_counts['merged_labels'] = all_seg_counts['merged_labels'].astype(int)

# Re-index the edge table by (unique_id, edge1) under the name 'merged_labels'
# so node-level tables can be joined onto the edge1 endpoint.
edge_agg_pivot = edge_agg_pivot.reset_index(level=1)
edge_agg_pivot.index.set_names('merged_labels',level=1, inplace=True)
edge_agg_pivot = edge_agg_pivot[['edge0','edge_class']]
# flatten the two-level column names into plain strings
edge_agg_pivot.columns = [' '.join(col).strip() for col in edge_agg_pivot.columns.values]

# add node 1 info 
edge_agg_pivot = edge_agg_pivot.join(all_seg_counts.set_index(['unique_id','merged_labels'])).rename(columns={'seg_area_frac':'node1_area_frac', 'smoothed_prob_g4_not_g2':'node1_grade_score'})
edge_agg_pivot = edge_agg_pivot.join(til_context_agg).rename(columns={x:'node1_'+x for x in til_context_agg.columns})

# Swap the index to (unique_id, edge0), again named 'merged_labels', for the node 0 join.
edge_agg_pivot.index.set_names('edge1',level=1, inplace=True)
edge_agg_pivot = reset_set_idx(edge_agg_pivot, ['unique_id','edge0'])
edge_agg_pivot.index.set_names('merged_labels',level=1, inplace=True)

# add node 0 info 
edge_agg_pivot = merge_nonoverlapping(edge_agg_pivot, all_seg_counts.set_index(['unique_id','merged_labels'])).rename(columns={'seg_area_frac':'node0_area_frac' ,'smoothed_prob_g4_not_g2':'node0_grade_score'})
edge_agg_pivot = edge_agg_pivot.join(til_context_agg).rename(columns={x:'node0_'+x for x in til_context_agg.columns})

# Restore the (unique_id, edge0, edge1) index.
edge_agg_pivot.index.set_names('edge0',level=1, inplace=True)
edge_agg_pivot = reset_set_idx(edge_agg_pivot, ['unique_id','edge0','edge1'])

#### Add an F1 style measure of the node area fractions 
- If close to 1, it's a balanced edge; closer to 0 implies one of the nodes is much smaller

In [30]:
# Combined and smaller area fraction of the two nodes joined by each edge.
edge_agg_pivot['edge_pair_area'] = edge_agg_pivot['node0_area_frac'] + edge_agg_pivot['node1_area_frac']
# Vectorised row-wise minimum (replaces a per-row Python min). Rows with a missing
# area fraction are dropped further down, so NaN handling does not affect the output.
edge_agg_pivot['min_node_area_in_edge'] = edge_agg_pivot[['node0_area_frac', 'node1_area_frac']].min(axis=1)

# One-hot encode edge_class while keeping the original label column.
ec_recover = edge_agg_pivot['edge_class'].copy()
edge_agg_pivot = pd.get_dummies(edge_agg_pivot, columns=['edge_class'])
edge_agg_pivot['edge_class'] = ec_recover

# 4ab/(a+b): equals 1 when both fractions are 0.5 and shrinks as one node gets smaller.
edge_agg_pivot['area_fmeasure'] = 4*(edge_agg_pivot['node0_area_frac']*edge_agg_pivot['node1_area_frac'])/(edge_agg_pivot['node0_area_frac']+edge_agg_pivot['node1_area_frac'])

# Edge-class indicators weighted by the minimum area, the pair area, and the F-style measure.
edge_agg_pivot['edge_class_distal_min_weighted'] = edge_agg_pivot['min_node_area_in_edge'] * edge_agg_pivot['edge_class_distal']
edge_agg_pivot['edge_class_proximal_min_weighted'] = edge_agg_pivot['min_node_area_in_edge'] * edge_agg_pivot['edge_class_proximal']

edge_agg_pivot['edge_class_distal_total_weighted'] = edge_agg_pivot['edge_pair_area'] * edge_agg_pivot['edge_class_distal']
edge_agg_pivot['edge_class_proximal_total_weighted'] = edge_agg_pivot['edge_pair_area'] * edge_agg_pivot['edge_class_proximal']

edge_agg_pivot['edge_class_distal_f_weighted'] = edge_agg_pivot['area_fmeasure'] * edge_agg_pivot['edge_class_distal']
edge_agg_pivot['edge_class_proximal_f_weighted'] = edge_agg_pivot['area_fmeasure'] * edge_agg_pivot['edge_class_proximal']

# Keep only edges where both nodes have an area fraction, then sum per slide.
edge_agg_pivot = edge_agg_pivot.dropna(subset=['node1_area_frac','node0_area_frac'])
# NOTE(review): iloc[:, 5:] selects columns by position and depends on the column order
# produced by the preceding joins -- confirm the intended columns are included.
edge_class_sum = edge_agg_pivot.iloc[:,5:].groupby('unique_id').sum()

## Push to file 

In [31]:
# Write every result table to CSV, in the listed order.
csv_outputs = [
    ('./rerun_tilewise_grade_til_annotations.csv', tilewise_til_info),
    ('./rerun_tilewise_grade_nontil_annotations.csv', tilewise_nontil_info),
    ('./rerun_node_descriptions.csv', merged_descriptions),
    ('./rerun_additional_feature_subset.csv', feature_subset),
    ('./rerun_base_rag_edge_info_annotation.csv', weighted_edge_info_store),
    ('./rerun_base_rag_edge_info_raw__edge_agg.csv', edge_agg),
    ('./rerun_base_rag_edge_info_annotation_processed.csv', edge_agg_pivot),
    ('./rerun_base_rag_edge_info_annotation_processed_sum.csv', edge_class_sum),
    ('./rerun_segmentwise_til_context_info.csv', til_context_agg),
]
for out_path, frame in csv_outputs:
    frame.to_csv(out_path)

---

In [32]:
# (rows, columns) of the tile table that includes TIL info.
tilewise_til_info.shape

(2300577, 33)

In [33]:
# (rows, columns) of the tile table without TIL info.
tilewise_nontil_info.shape

(4720757, 16)