## Imports & Helpers

In [28]:
def flex_save(path, dpi=400, extensions=['.png','.pdf']):
    for ext in extensions:
        plt.savefig(path+ext, dpi=dpi, bbox_inches='tight')

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../')
from checkmate_imports import *

# global variables 
HUE_ORDER = ['stroma','pred_g2','intermediate_grade','pred_g4']
MIN_SEGMENT_SIZE = 50
GRADE_DIFF_THRESH = 0.35
TUMOR_DIFF_THRESH = 0.35
MIN_TUMOR_SEG_MEAN = 0.70
NODE_DIFF_CUTOFF = invert_rag_weight(GRADE_DIFF_THRESH) 
MIN_TIL_COUNT = 10

# GRADE_TC_CUTOFF = 0.4  # minimum segment mean grade score to consider in TIL processing 
# GRADE_TC_CUTOFF = 0.  # minimum segment mean grade score to consider in TIL processing 
GRADE_TC_CUTOFF = 0.8  # focusing on high grade foci only 

print('MODIFYING EARLIER TIL CUTOFFS')
TIL_ISO_CUTOFF = 14  # based on none vs any AUROC bootstrap on high grade foci + no hard cases
TIL_HIGH_CUTOFF = 48 # based on not-high vs high AUROC bootstrap on high grade foci + no hard cases
FRAC_CUTOFF = 0.25
TIL_AREA_CUTOFF = 10

EXCLUSION_RATIO_CUTOFF = 1.5  # margin vs center ratio threshold for "exclusion" call
TILES_PER_MM2 = 0.256**-2

# assume 7x7 minimum case for a square area focus
# going 2 tiles inner would result in a 5x5 inner cube and thus area cutoff of 25
# MIN_CENTER_AREA = 25
MIN_CENTER_AREA = 10  # relaxing from 25 to try to recover possible interesting foci

MODIFYING EARLIER TIL CUTOFFS


In [2]:
from statannotations.Annotator import Annotator
from statannot import add_stat_annotation
from itertools import combinations, product

In [3]:
feature_subset = pd.read_csv('../rerun_final_patient_features.csv', index_col=0)

anno = pd.read_csv('../manual_cm025_merged_braunsupp_annotations.csv') 
anno['unique_id'] = 'cm025_' + anno.subjid.astype(str)
paper_desert = anno.loc[anno.ImmunoPhenotype == 'Desert','unique_id'].values
paper_infl = anno.loc[anno.ImmunoPhenotype == 'Infiltrated','unique_id'].values
paper_excl = anno.loc[anno.ImmunoPhenotype == 'Excluded','unique_id'].values
anno = anno.set_index('unique_id')

node_descriptions = pd.read_csv('../rerun_node_descriptions_with_score_components.csv',index_col=[0])
node_descriptions['merged_labels'] = node_descriptions['merged_labels'].astype(int)

prox_dist_edges = pd.read_csv('../rerun_proximal_and_distal_edge_annotations_with_score_components.csv',index_col=0)

scores = pd.read_csv('../rerun_slidewise_grade_til_scores.csv', index_col=0)

load_tile_info = False

if load_tile_info:
    tilewise_anno = pd.read_csv('../rerun_tilewise_grade_til_annotations.csv', index_col=0) # TIL specific CM025 subset

    edge_info = pd.read_csv('../rerun_base_rag_edge_info_annotation.csv', index_col=[0,1,2,3])
    # restore set objects (stored as string when saved to csv earlier)
    edge_info['edge_set'] = edge_info.reset_index()[['edge0','edge1']].apply(lambda x: set(x),1).values

    # read in non-TIL tilewise info 
    seg_agg = pd.read_csv('../rerun_tilewise_grade_nontil_annotations.csv', index_col=0)
    
    edge_agg_pivot = pd.read_csv('../rerun_base_rag_edge_info_annotation_processed.csv', index_col=[0,1,2])
    edge_class_sum = pd.read_csv('../rerun_base_rag_edge_info_annotation_processed_sum.csv', index_col=0)
    edge_binary = edge_class_sum > 0

In [4]:
NONKM_FONT_SCALE = 1.8
KM_FONT_SCALE = 1.2

### Re-filter TCGA-KIRC to match Ricketts 2018 reclassification


In [5]:
mkdir './main_figs'

mkdir: cannot create directory ‘./main_figs’: File exists


In [6]:
rickets = pd.read_csv('./rickets2018_ccrcc_only.csv')

In [7]:
rickets_ids = rickets['bcr_patient_barcode'].unique()

In [8]:
rickets_ids.shape

(488,)

In [9]:
overlapping_ids = feature_subset.index.intersection(rickets_ids)

In [10]:
filtered_kirc = feature_subset.loc[overlapping_ids]

In [11]:
a = feature_subset.loc[feature_subset['cohort'] != 'kirc']
b = filtered_kirc

In [12]:
feature_subset = pd.concat([a,b])

In [13]:
feature_subset.shape

(1392, 147)

In [14]:
feature_subset.value_counts('cohort')

cohort
cm025      623
kirc       465
profile    304
dtype: int64

In [15]:
rickets['unique_id'] = rickets['bcr_patient_barcode']

In [16]:
rickets.set_index('unique_id')['neoplasm_histologic_grade']

unique_id
TCGA-A3-3331    G2
TCGA-A3-3378    G3
TCGA-A3-3380    G2
TCGA-A3-3383    G2
TCGA-AK-3425    G2
                ..
TCGA-B0-4814    G3
TCGA-BP-4354    G4
TCGA-BP-4770    G4
TCGA-B0-4700    G4
TCGA-CJ-4900    G4
Name: neoplasm_histologic_grade, Length: 488, dtype: object

In [17]:
rickets['tcga_ricketts_grade'] = rickets['neoplasm_histologic_grade']

In [18]:
feature_subset = feature_subset.join(rickets.set_index('unique_id')['tcga_ricketts_grade'])

In [19]:
feature_subset.loc[feature_subset['cohort'] == 'kirc'][['grade','tcga_ricketts_grade']].value_counts()

grade  tcga_ricketts_grade
G3     G3                     163
G2     G2                     140
G4     G4                      58
GX     G2                      55
       G3                      26
       G4                       9
G1     G1                       6
GX     G1                       5
       [Not Available]          2
       GX                       1
dtype: int64

In [20]:
feature_subset.loc[feature_subset['cohort'] == 'kirc', 'grade'] = feature_subset.loc[feature_subset['cohort'] == 'kirc', 'tcga_ricketts_grade']

In [21]:
feature_subset.loc[feature_subset['cohort'] == 'kirc', 'grade'].value_counts()

G2                 195
G3                 189
G4                  67
G1                  11
[Not Available]      2
GX                   1
Name: grade, dtype: int64

In [22]:
feature_subset.loc[(feature_subset['cohort'] == 'kirc') & (feature_subset['grade'] == '[Not Available]'), 'grade'] = 'GX'

In [23]:
feature_subset.loc[feature_subset['cohort'] == 'kirc', 'grade'].value_counts()

G2    195
G3    189
G4     67
G1     11
GX      3
Name: grade, dtype: int64

In [24]:
feature_subset.value_counts(['cohort','grade'], sort=False)

cohort   grade
cm025    G1         1
         G2        68
         G3        76
         G4       104
         GX       374
kirc     G1        11
         G2       195
         G3       189
         G4        67
         GX         3
profile  G1         5
         G2       110
         G3       110
         G4        79
dtype: int64

In [25]:
feature_subset.value_counts(['cohort'], sort=False)

cohort 
cm025      623
kirc       465
profile    304
dtype: int64

In [26]:
feature_subset.to_csv('./rerun_final_patient_features_RICKETTS_FILTER.csv')