In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from functools import reduce
from typing import Dict
from io import StringIO
from multiprocessing import Pool

In [2]:
# Load the TCGA report table from CSV and preview its first five rows.
exams_df = pd.read_csv(filepath_or_buffer='data/TCGA_Reports.csv')
exams_df.head(n=5)

Unnamed: 0,patient_filename,text
0,TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0,Date of Recelpt: Clinical Diagnosis & History:...
1,TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8,"Material: 1) Material: stomach, Method of coll..."
2,TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4,page 1 / 1. copy No. 3. Examination: Histopath...
3,TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A,Patient ID: Gross Description: A mass is locat...
4,TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...


In [3]:
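# One-time setup: uncomment and run once to fetch the NLTK resources this notebook uses
# ("punkt" for nltk.word_tokenize, "stopwords" for stopwords.words). Newer NLTK releases
# may ask for "punkt_tab" instead of "punkt".
# nltk.download("punkt")
# nltk.download("stopwords")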

In [20]:
def tokenize_report(report_text):
    """Split the text of one report into word tokens with NLTK.

    Kept as a top-level function (not a lambda) because Pool.map has to
    pickle the callable it sends to the worker processes.

    Args:
        report_text: Text of a single report, as a string.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(report_text)


# Only the text is shipped to the workers (no index/value tuples). Rows with a
# missing text are dropped first because word_tokenize accepts only strings.
report_texts = exams_df['text'].dropna().tolist()

# Pool() without an argument uses the machine's CPU count instead of assuming
# that exactly 12 cores are available.
with Pool() as pool:
    tokenized_reports = pool.map(tokenize_report, report_texts)

# Corpus-wide counts of the raw tokens; case and punctuation are kept as-is here.
words_counter = nltk.FreqDist()
for report_tokens in tokenized_reports:
    words_counter.update(report_tokens)

In [19]:
# Show the raw token frequency distribution (the FreqDist repr lists only its
# most frequent entries).
words_counter

FreqDist({'.': 858011, ':': 260530, ',': 236663, 'of': 130371, 'the': 115314, 'and': 112014, 'is': 99737, ')': 96575, '(': 88604, 'cm': 87263, ...})

In [5]:
# Stop-word lookup used by word_filter: NLTK's English stop-word list stored as a
# set; word_filter checks each token's lowercase form against it.
redundant_tokens_set = set(stopwords.words('english'))
def word_filter(acc: Dict[str, int], curr: str, words_counter: nltk.FreqDist):
    """Reducer step: add one token's count to a case-insensitive tally.

    Args:
        acc: Running mapping of lowercase word -> summed count; updated in place.
        curr: Token exactly as it is keyed in ``words_counter``.
        words_counter: Mapping that supplies the count of ``curr``.

    Returns:
        ``acc`` itself. It is returned unchanged when the lowercase form of
        ``curr`` is in the module-level ``redundant_tokens_set`` or when
        ``curr`` is not purely alphabetic.
    """
    normalized = curr.lower()
    rejected = normalized in redundant_tokens_set or not curr.isalpha()
    if not rejected:
        # Case variants of a word ("Tumor", "tumor") accumulate under one key.
        acc[normalized] = acc.get(normalized, 0) + words_counter[curr]
    return acc

In [21]:
# Collapse the raw token counts into lowercase word counts, skipping stop words
# and tokens that are not purely alphabetic (see word_filter).
# The stop-word set is deliberately not rebuilt here: word_filter reads the
# module-level `redundant_tokens_set` that is defined in the same cell as
# word_filter itself, so a second copy would only be duplicated state.
filtered_words_count = {}
for token in words_counter:
    filtered_words_count = word_filter(filtered_words_count, token, words_counter)
filtered_words_count

{'cm': 91406,
 'x': 89728,
 'lymph': 82775,
 'tumor': 74368,
 'tissue': 46570,
 'nodes': 47743,
 'node': 44055,
 'margin': 36661,
 'specimen': 42406,
 'submitted': 30070,
 'right': 45963,
 'left': 42139,
 'labeled': 23734,
 'carcinoma': 28846,
 'identified': 21150,
 'section': 21051,
 'measuring': 16162,
 'sections': 19779,
 'mass': 16438,
 'soft': 14170,
 'consists': 11718,
 'one': 18199,
 'surface': 11975,
 'received': 22884,
 'frozen': 14803,
 'diagnosis': 23664,
 'b': 10431,
 'fresh': 10569,
 'invasion': 17511,
 'measures': 9911,
 'resection': 12150,
 'anterior': 11661,
 'posterior': 11338,
 'present': 14209,
 'dimension': 9801,
 'e': 8930,
 'formalin': 9070,
 'lobe': 10341,
 'inked': 8760,
 'negative': 18437,
 'patient': 9129,
 'entirely': 9295,
 'tan': 7794,
 'h': 7718,
 'bisected': 7445,
 'grade': 14069,
 'part': 12010,
 'greatest': 8881,
 'portion': 7395,
 'tube': 7841,
 'representative': 11499,
 'cassette': 9516,
 'two': 9541,
 'grossly': 7110,
 'ovary': 7952,
 'margins': 1267

In [22]:
# Spot-check the aggregated (case-insensitive) count of a single word.
filtered_words_count['breast']

7637

In [23]:
# Load the sample-level clinical table (tab-separated text).
# The first 4 lines of the file are dropped before parsing; presumably they are
# metadata rows that precede the real column-name row (TODO confirm against the file).
# The handle gets a descriptive name instead of the generic `f`, which is easy
# to collide with other notebook-level names.
with open('data/data_bcr_clinical_data_sample.txt', 'r') as sample_file:
    # Every occurrence of the '[Not Available]' placeholder is blanked out so the
    # resulting empty fields are read as missing values.
    sample_tsv = ''.join(sample_file.readlines()[4:]).replace('[Not Available]', '')

bcr_sample_df = pd.read_csv(StringIO(sample_tsv), sep='\t')
bcr_sample_df

Unnamed: 0,PATIENT_ID,SAMPLE_ID,OTHER_SAMPLE_ID,SPECIMEN_CURRENT_WEIGHT,DAYS_TO_COLLECTION,DAYS_TO_SPECIMEN_COLLECTION,SPECIMEN_FREEZING_METHOD,INITIAL_WEIGHT,SPECIMEN_SECOND_LONGEST_DIMENSION,IS_FFPE,...,PATHOLOGY_REPORT_UUID,SAMPLE_TYPE,SAMPLE_TYPE_ID,SHORTEST_DIMENSION,TIME_BETWEEN_CLAMPING_AND_FREEZING,TIME_BETWEEN_EXCISION_AND_FREEZING,VIAL_NUMBER,ONCOTREE_CODE,CANCER_TYPE,CANCER_TYPE_DETAILED
0,TCGA-AR-A1AR,TCGA-AR-A1AR-01,5fa9998b-deff-493e-8a8e-dc2422192a48,,1416.0,,,250.0,,NO,...,747FB91B-F523-4FA0-91DD-6014EF55643D,Primary Tumor,1,,,,A,IDC,Breast Cancer,Breast Invasive Ductal Carcinoma
1,TCGA-AR-A1AR,TCGA-AR-A1AR-10,c1e5beaa-6103-409d-bdd4-a86c0f210014,,1416.0,,,,,NO,...,,Blood Derived Normal,10,,,,A,,,
2,TCGA-BH-A1EO,TCGA-BH-A1EO-01,9d501ca9-f50d-45fc-bcc3-fee15f9771cd,,4330.0,,,100.0,,NO,...,A2B755DD-00C4-4349-8E11-C1196EBC91D3,Primary Tumor,1,,,,A,MDLC,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma
3,TCGA-BH-A1EO,TCGA-BH-A1EO-11,fff3e5a3-4b84-4afa-bdb5-1c6b2a6626bc,,4330.0,,,250.0,,NO,...,,Solid Tissue Normal,11,,,,A,,,
4,TCGA-BH-A1ES,TCGA-BH-A1ES-01,aa1037f4-1414-4e05-b11a-c558a23b7e62,,4242.0,,,220.0,,NO,...,9B2253AB-2C41-4E7A-9178-B690FFD9FF78,Primary Tumor,1,,,,A,IDC,Breast Cancer,Breast Invasive Ductal Carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2280,TCGA-E2-A1BC,TCGA-E2-A1BC-01,83ec5738-3a27-4100-83f2-8b30ae3b2fb7,,275.0,,,100.0,,NO,...,817D4725-01FE-4148-A8FC-3431F28D5CD8,Primary Tumor,1,,,,A,ILC,Breast Cancer,Breast Invasive Lobular Carcinoma
2281,TCGA-E2-A1BC,TCGA-E2-A1BC-10,6f95b20b-cdc6-4240-9f6e-10e199ff7f97,,275.0,,,,,NO,...,,Blood Derived Normal,10,,,,A,,,
2282,TCGA-E2-A1BC,TCGA-E2-A1BC-11,d221f2c8-5852-4392-b166-92b709000e78,,275.0,,,260.0,,NO,...,,Solid Tissue Normal,11,,,,A,,,
2283,TCGA-E2-A1BD,TCGA-E2-A1BD-01,1ddc4c86-029f-49b5-893c-a218240d3fa4,,260.0,,,100.0,,NO,...,DF746E32-AD2F-49CB-9AA6-4F8494C16B20,Primary Tumor,1,,,,A,IDC,Breast Cancer,Breast Invasive Ductal Carcinoma


In [24]:
# Distinct values of the CANCER_TYPE_DETAILED column (unique() reports NaN as
# a value of its own).
bcr_sample_df['CANCER_TYPE_DETAILED'].unique()

array(['Breast Invasive Ductal Carcinoma', nan,
       'Breast Mixed Ductal and Lobular Carcinoma',
       'Breast Invasive Mixed Mucinous Carcinoma',
       'Breast Invasive Lobular Carcinoma', 'Paget Disease of the Nipple',
       'Adenoid Cystic Breast Cancer', 'Invasive Breast Carcinoma',
       'Basal Cell Carcinoma', 'Metaplastic Breast Cancer',
       'Solid Papillary Carcinoma of the Breast',
       'Malignant Phyllodes Tumor of the Breast',
       'Breast Invasive Carcinoma, NOS'], dtype=object)

In [25]:
# Load the patient-level clinical table (tab-separated text).
# The first 4 lines of the file are dropped before parsing; presumably they are
# metadata rows that precede the real column-name row (TODO confirm against the file).
# The handle gets a descriptive name instead of the generic `f`, which is easy
# to collide with other notebook-level names.
with open('data/data_bcr_clinical_data_patient 1.txt', 'r') as patient_file:
    # Every occurrence of the two placeholder texts is blanked out so the
    # resulting empty fields are read as missing values.
    patient_tsv = (
        ''.join(patient_file.readlines()[4:])
        .replace('[Not Available]', '')
        .replace('[Not Applicable]', '')
    )

bcr_patient_df = pd.read_csv(StringIO(patient_tsv), sep='\t')
bcr_patient_df

Unnamed: 0,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,DAYS_TO_BIRTH,GENDER,MENOPAUSE_STATUS,RACE,ETHNICITY,...,METASTATIC_TUMOR_INDICATOR,PROJECT_CODE,PRIMARY_SITE_OTHER,STAGE_OTHER,TISSUE_SOURCE_SITE,TUMOR_TISSUE_SITE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
0,55262FCB-1B01-4480-B322-36570430C917,TCGA-3C-AALI,2014-7-28,NO,YES,-18538.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,...,,,,,3C,Breast,LIVING,131.57,DiseaseFree,131.57
1,427D0648-3F77-4FFC-B52C-89855426D647,TCGA-3C-AALJ,2014-7-28,NO,YES,-22848.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,...,,,,,3C,Breast,LIVING,48.42,DiseaseFree,48.42
2,C31900A4-5DCD-4022-97AC-638E86E889E4,TCGA-3C-AALK,2014-7-28,NO,YES,-19074.0,FEMALE,,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,...,,,,,3C,Breast,LIVING,47.57,DiseaseFree,47.57
3,6623FC5E-00BE-4476-967A-CBD55F676EA6,TCGA-4H-AAAK,2014-11-13,YES,NO,-18371.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,...,,,,,4H,Breast,LIVING,11.43,DiseaseFree,11.43
4,86C6F993-327F-4525-9983-29C55625593A,TCGA-5L-AAT0,2014-8-15,NO,YES,-15393.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,HISPANIC OR LATINO,...,,,,,5L,Breast,LIVING,48.52,DiseaseFree,48.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,5CD79093-1571-4F71-8136-0D84CCABDCAC,TCGA-WT-AB44,2014-7-16,NO,YES,,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,...,,,,,WT,Breast,LIVING,29.01,DiseaseFree,29.01
1092,F89588E9-CA73-4465-A7FB-7246EDB45E3A,TCGA-XX-A899,2014-2-21,NO,YES,-17022.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,...,,,,,XX,Breast,LIVING,15.34,DiseaseFree,15.34
1093,CA20249F-B7EA-4FD9-9ECB-34F74755AE35,TCGA-XX-A89A,2014-2-21,NO,YES,-25000.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,...,,,,,XX,Breast,LIVING,16.03,DiseaseFree,16.03
1094,23F438BD-1DBB-4D46-972F-1E8E74DDBD37,TCGA-Z7-A8R5,2014-7-9,NO,YES,-22280.0,FEMALE,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,...,,,,,Z7,Breast,LIVING,107.98,Recurred/Progressed,107.98


In [26]:
# Distinct values of the OS_STATUS column in the patient table.
bcr_patient_df['OS_STATUS'].unique()

array(['LIVING', 'DECEASED'], dtype=object)