# Selecting imaging and pathology reports for labelling


Andres Tamm

2023-08-12


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prepare-reports-for-TNM-and-recurrence" data-toc-modified-id="Prepare-reports-for-TNM-and-recurrence-1">Prepare reports for TNM and recurrence</a></span></li><li><span><a href="#Select-reports-for-TNM-and-recurrence" data-toc-modified-id="Select-reports-for-TNM-and-recurrence-2">Select reports for TNM and recurrence</a></span><ul class="toc-item"><li><span><a href="#Select-reports-for-TNM" data-toc-modified-id="Select-reports-for-TNM-2.1">Select reports for TNM</a></span></li><li><span><a href="#Recurrence" data-toc-modified-id="Recurrence-2.2">Recurrence</a></span></li></ul></li><li><span><a href="#Additional.-How-many-reports-to-select?" data-toc-modified-id="Additional.-How-many-reports-to-select?-3">Additional. How many reports to select?</a></span></li></ul></div>

In [None]:
import os
import pandas as pd
import numpy as np
import importlib as imp
import regex as re
import matplotlib.pyplot as plt
from scipy.stats import norm, truncnorm
from itertools import product
from time import sleep
from IPython.display import clear_output
from datetime import datetime
import textmining.utils as ut
from textmining.recurrence import get_recurrence
from textmining.reports import get_crc_reports
from textmining.tnm.clean import add_tumour_tnm
from textmining.tnm.tnm import get_tnm_phrase, get_tnm_values
from textmining.crm_emvi import get_crm, get_emvi
from pathlib import Path

In [None]:
# Check current directory
os.getcwd()

In [None]:
# Paths
root = Path("z:\\Andres\\project_textmining\\textmining\\")
code_path = root
out_path  = root / 'labelled_data'
data_path = root / 'data'

print(out_path.exists())
print(data_path.exists())

In [None]:
os.listdir(data_path)

In [None]:
# Dbl check newer histopath report files
#f0 = '20230329.vw_histopathology.csv'
#f1 = '20230329.vw_pathology_reports.csv'
f0 = '20230728.imaging_newredaction_selectedcodes.csv'
f1 = '20230728.pathology_reports_newredaction.csv'
d0 = pd.read_csv(data_path / f0)
d1 = pd.read_csv(data_path / f1)

## 1. Prepare reports for TNM and recurrence

### 1.1. Gather reports

In [None]:
# ---- Identify individuals with CRC in OUH FIT data: as it contains both patients with and without CRC ----
# NOTE: this cell rebinds f0/f1/d0/d1, which the previous cell used for the report files.

# Outpatient diagnoses; `src` records which table each row came from
f0 = data_path / 'vw_outpatient_attendances_diagnoses.csv'
d0 = pd.read_csv(f0, usecols=['brc', 'subject', 'diagnosis_code_icd10', 'diagnosis_date'])
d0['src'] = 'outpat'

# Inpatient diagnoses, same four columns
f1 = data_path / 'vw_inpat_episodes_diagnoses.csv'
d1 = pd.read_csv(f1, usecols=['brc', 'subject', 'diagnosis_code_icd10', 'diagnosis_date'])
d1['src'] = 'inpat'

# Stack both sources into one long table
d = pd.concat(objs=[d0, d1], axis=0)
print(d.shape)

# Parse diagnosis dates and print before/after values plus the overall range as a visual check
print(d.diagnosis_date.iloc[0:5])
d.diagnosis_date = pd.to_datetime(d.diagnosis_date)
print(d.diagnosis_date.iloc[0:5])
print(d.diagnosis_date.min(), d.diagnosis_date.max())
print(d.diagnosis_date.sort_values().drop_duplicates())

# Keep rows whose ICD-10 code starts with C18, C19 or C20 (case-insensitive).
# Missing codes are replaced by '' first, so they never match.
# NOTE(review): C18-C20 are presumably colon, rectosigmoid junction and rectum - confirm against ICD-10.
d = d.loc[d.diagnosis_code_icd10.fillna('').str.lower().str.contains('^c(?:18|19|20)', regex=True), :]
print(d.diagnosis_code_icd10.unique())
print(d.groupby('src')['subject'].nunique())

# Unique subject identifiers with at least one matching diagnosis; used later to filter reports
crc = d.subject.unique()
print(len(crc))


In [None]:
df = pd.DataFrame()

In [None]:
# ---- Get newer OUH pathology reports for individuals with CRC ----
f = data_path / '20230728.pathology_reports_newredaction.csv'

t = pd.read_csv(f)
print(t.shape, t.columns)
t = t.rename(columns={'subject': 'subject_id', 'received_date': 'report_date', 'safe_report': 'report_text_anon'})
t = t.drop(labels=['authorised_date', 'snomed_t'], axis=1)
print(t.shape, t.columns)

print(t.report_date.iloc[0:5])
t.report_date = pd.to_datetime(t.report_date, format='%Y-%m-%d %H:%M:%S')
print(t.report_date.iloc[0:5])
print(t.report_date.min(), t.report_date.max())

t = t.loc[t.report_date >= '2022-04-01']
#t = t.loc[t.report_date >= '2022-03-31']
print(t.shape)

t['brc'] = 'OXFORD'
t['report_type'] = 'pathology_future'

t = t.loc[t.subject_id.isin(crc)]
print(t.shape)

df = pd.concat(objs=[df, t], axis=0)


In [None]:
# ---- Add newer OUH imaging reports for individuals with CRC ----
f = '20230728.imaging_newredaction_selectedcodes.csv'

print('\n----Reading data from: {}'.format(f))
t = pd.read_csv(data_path / f)    
print('\nColumns: {}'.format(t.columns.to_list()))
print('\nShape of data: {}'.format(t.shape))
print('\nImaging code is available for {}% of reports'.format((~t.imaging_code.isna()).mean()*100))

# Imaging codes
#c = t.imaging_code.value_counts()
#print('\nTop value counts of imaging codes: \n{}'.format(c[0:10]))
#mask = t.imaging_code.fillna('').str.lower().str.contains('|'.join(codes).lower(), regex=True)
#t = t.loc[mask]
#print('\nShape of data after including relevant img reports: {}'.format(t.shape))  

# Reformat
t = t[['subject', 'imaging_date', 'imaging_report_date', 'imaging_code', 'safe_report']]
t = t.rename(columns={'imaging_report_date':'report_date', 'subject': 'subject_id', 
                      'safe_report': 'report_text_anon'})
t['brc'] = 'OXFORD'
t['report_type'] = 'imaging_future'

# Date range
print(t.report_date.iloc[0:5])
t.report_date = pd.to_datetime(t.report_date, format='%Y-%m-%d %H:%M:%S')
print(t.report_date.iloc[0:5])
print(t.report_date.min(), t.report_date.max())
t = t.loc[t.report_date >= '2022-04-01']
#t = t.loc[t.report_date >= '2022-03-01']
print(t.shape)

# Retain CRC
t = t.loc[t.subject_id.isin(crc)]
print(t.shape)

df = pd.concat(objs=[df, t], axis=0)


In [None]:
# Check count
df.groupby(['brc', 'report_type']).size()

In [None]:
# Drop reports with duplicate text
print(df.shape[0], df.report_text_anon.nunique(), df.drop_duplicates().shape[0])

df = df.drop_duplicates(subset=['report_text_anon'])
print(df.shape[0])

In [None]:
# Check count again
df.groupby(['brc', 'report_type']).size()

In [None]:
# Check date range again: earliest and latest report date per (brc, report_type).
# The aggregations are given by name: passing np.min / np.max to groupby.agg
# is deprecated in recent pandas versions.
df['report_date'] = pd.to_datetime(df['report_date'])
s = df.groupby(['brc', 'report_type'])['report_date'].agg(['min', 'max'])
print(s)

In [None]:
# Save 
df.to_csv(out_path / 'reports-ouhfuture.csv', index=False)

### 1.2. Run NLP

In [None]:
# Read reports 
df = pd.read_csv(out_path / 'reports-ouhfuture.csv')
print(df.shape[0], df.columns)

# Date to datetime
print(df.report_date.iloc[0:5])
df.report_date = pd.to_datetime(df.report_date)

# Use only small number of reports? For testing
testmode = False
if testmode:
    df = df.sample(100, random_state=42)

# Check count
df.groupby(['brc', 'report_type']).size()

In [None]:
# Find reports that describe current colorectal cancer, but do not remove non-crc reports (ran about 43 minutes for 74k reports)
# Only the second return value (the table of matches) is used; the first is discarded.
__, matches_crc = get_crc_reports(df, 'report_text_anon', add_subj_to_matches=True, subjcol='subject_id')

# Flag reports that have at least one match not marked for exclusion.
# NOTE(review): assumes matches_crc.row holds the 0-based positional row number of the report in df,
# which is why `row` is created with np.arange - confirm against get_crc_reports.
df['row'] = np.arange(df.shape[0])
df['crc_nlp'] = 0
matches_incl = matches_crc.loc[matches_crc.exclusion_indicator==0]
df.loc[df.row.isin(matches_incl.row), 'crc_nlp'] = 1
print(df.groupby(['brc', 'report_type'])['crc_nlp'].sum())

# Identify reports where all matches for CRC were marked as false
# This helps check whether some cases of CRC may be completely missed when using the code
# As otherwise, a report could be marked as describing CRC if it has at least one valid match
df['row'] = np.arange(df.shape[0])
df['false_crc_nlp'] = 0
matches_excl = matches_crc.loc[matches_crc.exclusion_indicator==1]
# Rows that have excluded matches and no included match
row_false = np.setdiff1d(matches_excl.row, matches_incl.row)
df.loc[df.row.isin(row_false), 'false_crc_nlp'] = 1
print(df.groupby(['brc', 'report_type'])['false_crc_nlp'].sum())

# Save to disk
# NOTE: os.chdir changes the working directory for the rest of the session;
# the relative file names below are resolved against out_path.
os.chdir(out_path)
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')

# All matches (included and excluded), for later inspection
fname = 'matches-crc_reports-ouhfuture.csv'
print('\nSaving matches to file {}...'.format(fname))
matches_crc.to_csv(fname, index=False)

# Reports with the new crc_nlp / false_crc_nlp indicator columns
fname = 'reports-ouhfuture_crc-true_tnm-false_recur-false.csv'
print('\nSaving reports to file {}...'.format(fname))
df.to_csv(fname, index=False)

In [None]:
# Read reports (with CRC status)
read_from_disk=True
if read_from_disk:
    os.chdir(out_path)
    files = os.listdir()
    fname = [f for f in files if f.startswith('reports-ouhfuture_crc-true_tnm-false_recur-false')][0]
    print(fname)
    df = pd.read_csv(fname)
    print(df.crc_nlp.mean())
    print(df.shape)
    display(df.head())

In [None]:
# Extract TNM phrases (ran about 163 minutes for 74k reports)
# TNM phrases marked as historical are not removed - could be removed later, and value extraction rerun
matches_tnm, check_phrases_tnm, check_cleaning_tnm, check_rm_tnm = get_tnm_phrase(df=df, col='report_text_anon', 
                                                                                  remove_unusual=True, 
                                                                                  remove_historical=False, 
                                                                                  remove_falsepos=True)

# Add nearby tumour keywords (can help decide which tumour the TNM phrase refers to, if needed)
matches_tnm = add_tumour_tnm(df, matches_tnm, col_report='report_text_anon', targetcol='target_before_clean')

# Get TNM values from phrases
df, check_values_tnm = get_tnm_values(df, matches=matches_tnm, col='report_text_anon', pathology_prefix=False)

# Mark all reports that have T, N or M values
mask = ~(df['T'].isna() & df['N'].isna() & df['M'].isna())
df['has_tnm'] = 0
df.loc[mask, 'has_tnm'] = 1
print('Number of reports with and without T, N or M value according to code:\n\n{}'.format(df.has_tnm.value_counts()))

# Get excluded TNM matches & add indicator
# This helps check reports that had some matches marked as invalid
# Contrary to CRC detection, it is useful to check these, as the final result includes max and min of all matches marked as valid
df['false_tnm'] = 0
df['row'] = np.arange(df.shape[0])
#row_false = np.setdiff1d(check_rm_tnm.row, matches_tnm.row)
row_false = check_rm_tnm.row
df.loc[df.row.isin(row_false), 'false_tnm'] = 1
print(df.groupby(['brc', 'report_type'])['false_tnm'].mean())

# Lil summary
print('--------')
cols = ['T', 'N', 'M']
for c in cols:
    print(c)
    display(df[c].value_counts())
    
n = df.groupby(['brc', 'has_tnm']).size()
ntot = df.groupby('brc').size()
print(n)
print(n/ntot)
print('--------')

# Save to disk for reference
os.chdir(out_path)
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
dfs   = [df, 
         matches_tnm, check_phrases_tnm, check_cleaning_tnm, check_rm_tnm]
names = ['reports-ouhfuture_crc-true_tnm-true_recur-false', 
         'tnm-matches_reports-ouhfuture', 'tnm-check-phrases_reports-ouhfuture',
         'tnm-check-cleaning_reports-ouhfuture', 'tnm-check-rm_reports-ouhfuture']
for n, d in zip(names,dfs):
    fname = n + '.csv'
    print('Saving to file {}...'.format(fname))
    d.to_csv(fname, index=False)

In [None]:
# Read reports (with CRC status and TNM staging)
read_from_disk=True
if read_from_disk:
    os.chdir(out_path)
    files = os.listdir()
    fname = [f for f in files if f.startswith('reports-ouhfuture_crc-true_tnm-true_recur-false')][0]
    print(fname)
    df = pd.read_csv(fname)
    display(df.head())

In [None]:
# Get recurrence and metastasis (about 27 min per 74k reports)
df, matches_rec = get_recurrence(df, 'report_text_anon', verbose=False)

# Mark all reports that have recurrence or metastasis
print(df.recurrence.unique(), df.metastasis.unique())
df['has_recurrence'] = 0
df.loc[~df['recurrence'].isna(), 'has_recurrence'] = 1
df['has_metastasis'] = 0
df.loc[~df['metastasis'].isna(), 'has_metastasis'] = 1
print('Number of reports with recurrence:\n\n{}'.format(df.has_recurrence.value_counts()))
print('Number of reports with metastasis:\n\n{}'.format(df.has_metastasis.value_counts()))

# Add indicator for excluded matches
ex_rec = matches_rec.loc[(matches_rec.exclusion_indicator==1) & (matches_rec.concept=='recurrence')]
print(ex_rec.shape[0])
df['false_recur'] = 0
df['row'] = np.arange(df.shape[0])
df.loc[df.row.isin(ex_rec.row), 'false_recur'] = 1
print(df.groupby(['brc', 'report_type'])['false_recur'].mean())

ex_met = matches_rec.loc[(matches_rec.exclusion_indicator==1) & (matches_rec.concept=='metastasis')]
print(ex_met.shape[0])
df['false_met'] = 0
df['row'] = np.arange(df.shape[0])
df.loc[df.row.isin(ex_met.row), 'false_met'] = 1
print(df.groupby(['brc', 'report_type'])['false_met'].mean())

# Save to disk for reference
os.chdir(out_path)
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
dfs   = [df, matches_rec]
names = ['reports-ouhfuture_crc-true_tnm-true_recur-true', 'recur-matches_reports-ouhfuture']
for n, d in zip(names,dfs):
    fname = n + '.csv'
    print('Saving to file {}...'.format(fname))
    d.to_csv(fname, index=False)

In [None]:
# Read reports (with CRC status and TNM staging and recurrence)
read_from_disk=True
if read_from_disk:
    os.chdir(out_path)
    files = os.listdir()
    fname = [f for f in files if f.startswith('reports-ouhfuture_crc-true_tnm-true_recur-true')][0]
    print(fname)
    df = pd.read_csv(fname)
    print(df.shape)
    display(df.head())

In [None]:
df.columns

In [None]:
# Get crm
df, matches_crm, nonmatches_crm = get_crm(df, 'report_text_anon')

In [None]:
# Get emvi
df, matches_emvi, nonmatches_emvi = get_emvi(df, 'report_text_anon')

In [None]:
# Double check
print(df.columns)
print(df.shape)

In [None]:
# Save to disk for reference
os.chdir(out_path)
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
dfs   = [df, matches_crm, nonmatches_crm, matches_emvi, nonmatches_emvi]
names = ['reports-ouhfuture_crc-true_tnm-true_recur-true_crmemvi-true', 
         'crm-matches_reports-ouhfuture', 'crm-nonmatches_reports-ouhfuture',
         'emvi-matches_reports-ouhfuture', 'emvi-nonmatches_reports-ouhfuture']
for n, d in zip(names,dfs):
    fname = n + '.csv'
    print('Saving to file {}...'.format(fname))
    d.to_csv(fname, index=False)

### 1.3. Double check the results

In [None]:
files = os.listdir(out_path)
[f for f in files if f.startswith('reports-ouhfuture_crc-true_tnm-true_recur-true_crmemvi-true')]

In [None]:
# Read reports
#  NB -- need to have lineterminator='\n' (works with c engine), as otherwise '\r' interpreted as lineterminator too
#usecols = ['brc', 'subject_id', 'row', 'imaging_date', 'report_date', 'imaging_code',
#           'report_text_anon', 'report_type', 'crc_nlp', 'has_tnm', 'has_recurrence', 'has_metastasis',
#           'false_tnm', 'false_recur', 'T', 'T_sub', 'T_min', 'T_sub_min', 'N', 'N_sub']
os.chdir(out_path)
files = os.listdir()
#fname = [f for f in files if f.startswith('reports-all_crc-true_tnm-true_recur-true')][0]
fname = 'reports-ouhfuture_crc-true_tnm-true_recur-true_crmemvi-true.csv'
print('Reading from file {}'.format(fname))
#df = pd.read_csv(fname, usecols=None, engine='c', sep=',', lineterminator='\n')
df = pd.read_csv(fname)

print('\nColumns: {}'.format(df.columns))
print('Shape: {}'.format(df.shape))
print('Proportion of reports with crc ({:.2f}), recurrence ({:.2f}), tnm ({:.2f}), metastasis ({:.2f})'.format(\
       df.crc_nlp.mean(), df.has_recurrence.mean(), df.has_tnm.mean(), df.has_metastasis.mean()))
print('Unique values for BRC (dummy checking read csv):{}'.format(df.brc.unique()))
#display(df.head())

In [None]:
# Counts
def count(df, vcol, gcols=None):
    """Summarise a 0/1 indicator column within groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains `vcol` and every column named in `gcols`.
    vcol : str
        Name of the indicator column to summarise.
    gcols : list of str, optional
        Columns to group by. Defaults to ['report_type'].

    Returns
    -------
    pandas.DataFrame
        One row per group with columns 'n' (group size), 'count' (sum of
        `vcol`) and 'percent' (mean of `vcol` times 100, one decimal place).
    """
    # None sentinel instead of a mutable list default shared across calls
    if gcols is None:
        gcols = ['report_type']

    grouped = df.groupby(gcols)[vcol]
    st = grouped.size().rename('n')
    s0 = grouped.sum().rename('count')
    # Scale first and round last, so that the reported percentage does not
    # carry floating-point noise introduced by multiplying a rounded value
    s1 = (grouped.mean() * 100).round(1).rename('percent')
    return pd.concat(objs=[st, s0, s1], axis=1)



In [None]:
cols = ['crc_nlp', 'false_crc_nlp']
for vcol in cols:
    print('\n---{}'.format(vcol))
    print(count(df, vcol, ['report_type']))

In [None]:
cols = ['has_tnm', 'false_tnm']
for vcol in cols:
    print('\n---{}'.format(vcol))
    print(count(df, vcol, ['report_type']))

In [None]:
cols = ['has_recurrence', 'false_recur']
for vcol in cols:
    print('\n---{}'.format(vcol))
    print(count(df, vcol, ['report_type']))

In [None]:
cols = ['has_metastasis', 'false_met']
for vcol in cols:
    print('\n---{}'.format(vcol))
    print(count(df, vcol, ['report_type']))

In [None]:
# Check matches for pathology future - why does it seem to have more false tnm?
# Seems that certain reporting format is used more
fname = 'tnm-check-rm_reports-ouhfuture.csv'
print('Reading from file {}'.format(fname))
#df = pd.read_csv(fname, usecols=None, engine='c', sep=',', lineterminator='\n')
matches = pd.read_csv(fname)
print(matches.shape, matches.columns)
print(df.shape[0])

df['row'] = np.arange(df.shape[0])

mask = (df.report_type == 'pathology_future') & (df.false_tnm == 1)
rows = df.loc[mask, 'row']
m = matches.loc[matches.row.isin(rows)]
print(m.shape, m.row.nunique())
m[['left', 'target', 'right', 'exclusion_reason']].drop_duplicates(subset=['target'])

In [None]:
# Check matches for imaging - why does it seem to have so many false crc?
# Seems that certain reporting format is used more
fname = 'matches-crc_reports-ouhfuture.csv'
print('Reading from file {}'.format(fname))
# Read via out_path so that the cell does not depend on the current working directory
matches = pd.read_csv(out_path / fname)
print(matches.shape, matches.columns)
print(df.shape[0])

# Imaging reports carry the label 'imaging_future' in this dataset;
# filtering on 'imaging' selected no rows at all
mask = (df.report_type == 'imaging_future') & (df.false_crc_nlp == 1)
rows = df.loc[mask, 'row']
m = matches.loc[matches.row.isin(rows)]
print(m.shape, m.row.nunique())
# One example per distinct matched phrase, with its context and exclusion reason
m[['left', 'target', 'right', 'exclusion_reason']].drop_duplicates(subset=['target'])

In [None]:
# Check how many reports have both CRC, TNM and recurrence 
#  Doesn't seem there's large overlap + recur needs to be extracted from reports that are not directly CRC reports
df[['crc_nlp', 'has_tnm', 'has_recurrence']].value_counts().reset_index()

In [None]:
# Check how many reports have CRC and TNM
df[['crc_nlp', 'has_tnm']].value_counts().reset_index()

In [None]:
# Dbl check report counts 
s = df[['brc', 'report_type', 'crc_nlp', 'has_tnm']].value_counts().rename('n').reset_index()
s = s.sort_values(['brc', 'report_type', 'crc_nlp', 'has_tnm'])

pd.set_option('display.max_colwidth', 500, 'display.min_rows', 50, 'display.max_rows', 50)
s

In [None]:
# Dbl check report counts 
s = df[['brc', 'report_type', 'crc_nlp', 'has_tnm', 'false_tnm']].value_counts().rename('n').reset_index()
s = s.sort_values(['brc', 'report_type', 'crc_nlp', 'has_tnm', 'false_tnm'])

pd.set_option('display.max_colwidth', 500, 'display.min_rows', 50, 'display.max_rows', 50)
s

In [None]:
# Dbl check reports where min and max T-stage differ
"""
cols = ['T_pre_indecision', 'T_indecision',
       'T_sub_indecision', 'N_indecision', 'N_sub_indecision', 'M_indecision',
       'M_sub_indecision']
df[cols].mean(axis=0)
"""

In [None]:
# Print up to 10 reports where the maximum ('T') and minimum ('T_min') T-stage differ.
# The subset is built here explicitly: `dfsub` was otherwise only defined further down
# the notebook, so a top-to-bottom run failed with a NameError at this point.
mask = df['T'].notna() & df['T_min'].notna() & (df['T'] != df['T_min'])
dfsub = df.loc[mask]

# Cap the loop at the number of available rows, so fewer than 10 hits does not raise
for i in range(min(10, dfsub.shape[0])):
    print('\n====')
    row = dfsub.iloc[i]
    print(row['T'], row['T_min'])
    print(row.report_text_anon)

In [None]:
# Dbl check report counts 
s = df[['brc', 'report_type', 'has_recurrence', 'has_metastasis']].value_counts().rename('n').reset_index()
s = s.sort_values(['brc', 'report_type'])

pd.set_option('display.max_colwidth', 500, 'display.min_rows', 50, 'display.max_rows', 50)
s

## 2. Select reports

### 2.1. Select reports for TNM and CRC (2023-05-14)

---- Newer report selection strategy ----

Set 1
* TNM : [OXFORD] x [img, path] x [has_tnm, ~has_tnm] -> 4 categories -> 400 reports
* CRC : [OXFORD] x [img, path] x [has_crc, ~has_crc] -> 4 categories -> 400 reports

Set 2
* TNM : [OXFORD_FUTURE] x [img, path] x [has_tnm, ~has_tnm] -> 4 categories -> 400 reports
* CRC : [OXFORD_FUTURE] x [img, path] x [has_crc, ~has_crc] -> 4 categories -> 400 reports


In [None]:
# Read reports
#  NB -- in previous version, needed to have lineterminator='\n' (works with c engine), as otherwise '\r' interpreted as lineterminator too
#usecols = ['brc', 'subject_id', 'row', 'imaging_date', 'report_date', 'imaging_code',
#           'report_text_anon', 'report_type', 'crc_nlp', 'has_tnm', 'has_recurrence', 'has_metastasis',
#           'false_tnm', 'false_recur', 'T', 'T_sub', 'T_min', 'T_sub_min', 'N', 'N_sub']
os.chdir(out_path)
files = os.listdir()
#fname = [f for f in files if f.startswith('reports-ouhfuture_crc-true_tnm-true_recur-true')][0]
fname = 'reports-ouhfuture_crc-true_tnm-true_recur-true_crmemvi-true.csv'
print('Reading from file {}'.format(fname))
#df = pd.read_csv(fname, usecols=None, engine='c', sep=',', lineterminator='\n')
df = pd.read_csv(fname)

print('\nColumns: {}'.format(df.columns))
print('Shape: {}'.format(df.shape))
print('Proportion of reports with crc ({:.2f}), recurrence ({:.2f}), tnm ({:.2f}), metastasis ({:.2f})'.format(\
       df.crc_nlp.mean(), df.has_recurrence.mean(), df.has_tnm.mean(), df.has_metastasis.mean()))
print('Unique values for BRC (dummy checking read csv):{}'.format(df.brc.unique()))
#display(df.head())

In [None]:
print(df.columns)

In [None]:
# Where the minimum value equals the maximum value, blank out the minimum,
# so that '<name>_min' is only filled when it adds information
cols = ['T_pre', 'T', 'N', 'M', 'V', 'R', 'L', 'Pn', 'SM', 'H', 'G', 'CRM', 'EMVI']
cols_min = ['{}_min'.format(name) for name in cols]
for c in cols:
    cmin = c + '_min'
    print('--')
    print(c, cmin)

    # Rows where max and min agree (missing values never compare equal)
    mask = df[c].eq(df[cmin])
    print(mask.sum())
    df.loc[mask, cmin] = np.nan
    #print(df.loc[mask, [c, cmin]])


In [None]:
def select(dfsub, n_select, rng):
    """Randomly pick up to `n_select` rows of `dfsub` without replacement.

    Parameters
    ----------
    dfsub : pandas.DataFrame
        Rows to sample from.
    n_select : int
        Number of rows requested.
    rng : numpy.random.Generator
        Random generator; only used when `dfsub` has at least `n_select` rows.

    Returns
    -------
    tuple of (pandas.DataFrame, float)
        The selected rows, and the percentage of `dfsub` that was selected.
        The percentage never exceeds 100 and is NaN when `dfsub` is empty.
    """
    n = dfsub.shape[0]

    if n < n_select:
        # Not enough rows to sample from: keep all of them, in original order
        i = np.arange(n)
    else:
        i = rng.choice(n, n_select, replace=False)

    # Coverage is based on the number of rows actually kept, so it cannot
    # exceed 100%; an empty input has no defined coverage
    c = len(i) / n * 100 if n > 0 else np.nan

    # Retain sampled indices
    return dfsub.iloc[i, :], c

In [None]:
print(df[['brc', 'report_type']].drop_duplicates())

In [None]:
seed = 42
rng = np.random.default_rng(seed=seed)
n_select = 100


def _sample_strata(df, flag_col, rng, n_select, seed,
                   report_types=('pathology_future', 'imaging_future'), brcs=('OXFORD',)):
    """Sample up to `n_select` reports from each flag x report type x brc stratum.

    Strata are visited in the order flag value (0, then 1), report type, brc,
    and `rng` is consumed in that order, so a fixed seed gives a reproducible
    selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Reports with columns `flag_col`, 'report_type', 'brc', 'has_tnm',
        'crc_nlp' and 'false_tnm'.
    flag_col : str
        Name of the 0/1 column that defines the strata (e.g. 'has_tnm').
    rng : numpy.random.Generator
        Random generator passed on to `select`.
    n_select : int
        Number of reports requested per stratum.
    seed : int
        Seed used to create `rng`; only recorded in the summary.
    report_types, brcs : sequence of str
        Report types and sites to stratify on.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Selected reports of all strata, and one summary row per stratum.
    """
    summary_cols = ['brc', 'has_tnm', 'report_type', 'crc_nlp', 'false_tnm',
                    'n', 'n_select', 'seed', 'coverage (%)']
    selected, summaries = [], []
    for flag in [0, 1]:
        for report_type in report_types:
            for brc in brcs:

                # Subset the data
                mask = (df[flag_col] == flag) & (df.report_type == report_type) & (df.brc == brc)
                dfsub = df.loc[mask].copy()
                n = dfsub.shape[0]
                print(n, n_select)
                dfsub, c = select(dfsub, n_select, rng)
                selected.append(dfsub)

                # Summarise the selected reports of this stratum
                row = [brc, dfsub.has_tnm.mean(), report_type, dfsub.crc_nlp.mean(),
                       dfsub.false_tnm.mean(), n, n_select, seed, c]
                summaries.append(pd.DataFrame([row], columns=summary_cols))

    # Concatenate once at the end rather than growing a frame inside the loop
    return pd.concat(objs=selected, axis=0), pd.concat(objs=summaries, axis=0)


# ---- REPORTS WITH AND WITHOUT TNM ----
dfa, suma = _sample_strata(df, 'has_tnm', rng, n_select, seed)

# ---- REPORTS WITH AND WITHOUT CRC ----
# Uses the same generator, so these draws continue from where the TNM selection stopped
dfb, sumb = _sample_strata(df, 'crc_nlp', rng, n_select, seed)





In [None]:
suma

In [None]:
sumb

In [None]:
# Dbl check
s = dfa.groupby(['brc', 'has_tnm', 'report_type', 'crc_nlp']).size().rename('n_select').reset_index()
s = s.sort_values(['brc', 'crc_nlp', 'has_tnm'])
s

In [None]:
# Dbl check
s = dfb.groupby(['brc', 'has_tnm', 'report_type', 'crc_nlp']).size().rename('n_select').reset_index()
s = s.sort_values(['brc', 'report_type', 'crc_nlp', 'has_tnm'])
s

In [None]:
# Dbl check false_tnm proportion
dfa.groupby(['brc', 'report_type', 'crc_nlp', 'has_tnm'])['false_tnm'].value_counts().rename('n').reset_index()

In [None]:
# Dbl check there are no duplicates
print(dfa.shape)
print(dfa.drop_duplicates(subset=['report_text_anon']).shape)

print(dfb.shape)
print(dfb.drop_duplicates(subset=['report_text_anon']).shape)


In [None]:
# Sort reports randomly
dfa_sort = dfa.sample(n=dfa.shape[0], random_state=42, replace=False)
print(dfa_sort.drop_duplicates().shape[0] == dfa_sort.shape[0])

dfb_sort = dfb.sample(n=dfb.shape[0], random_state=42, replace=False)
print(dfb_sort.drop_duplicates().shape[0] == dfb_sort.shape[0])



In [None]:
# Save (duplicates - orig file, and file to be modified by checking labels)
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
fnames = ['set2_tnm.csv', 'set2_tnm_labelled.csv']
for fname in fnames:
    print('Saving to {}'.format(fname))
    dfa_sort.to_csv(out_path / fname, index=False)

    # Dbl check that file can be read 
    test = pd.read_csv(out_path / fname) #, engine='c', lineterminator='\n')
    print(test.brc.unique())


# Save
#tstamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
fnames = ['set2_crc.csv', 'set2_crc_labelled.csv']
for fname in fnames:
    print('Saving to {}'.format(fname))
    dfb_sort.to_csv(out_path / fname, index=False)

    # Dbl check that file can be read 
    test = pd.read_csv(out_path / fname) #, engine='c', lineterminator='\n')
    print(test.brc.unique())


In [None]:
# Sanity check that old and new sets do not overlap:
# for each pair of files, print the share of set 1 report texts found in set 2
# and the share of set 2 report texts found in set 1 (texts compared in lower case)
pairs = [('set1_tnm.csv', 'set2_tnm.csv'),
         ('set1_crc.csv', 'set2_crc.csv')]

for old_name, new_name in pairs:
    df1 = pd.read_csv(out_path / old_name)
    df2 = pd.read_csv(out_path / new_name)

    text1 = df1.report_text_anon.str.lower()
    text2 = df2.report_text_anon.str.lower()

    test1 = text1.isin(text2).mean()
    test2 = text2.isin(text1).mean()
    print(test1, test2)