In [243]:
import pandas as pd
# Widen pandas' console rendering so wide frames are shown in full.
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [244]:
# Load the ground-truth snippets and collapse them to one row per stage:
# every `text` entry sharing a `stage_level` is joined with single spaces.
# The raw frame keeps its own name so `data` only ever means the aggregated
# frame. The former mid-cell `data.head()` was dropped: only the last
# expression of a cell is displayed, so it never rendered anything.
ground_truth_raw = pd.read_csv('../ground_truths/ground_truth.csv')
stage_level = (
    ground_truth_raw[['text', 'stage_level']]
    .groupby('stage_level')
    .agg({'text': ' '.join})
)
data = stage_level.reset_index(level=0)
data


Unnamed: 0,stage_level,text
0,0.0,Because stage 0 NSCLC is limited to the lining...
1,1.0,"If you have stage I NSCLC, surgery may be the ..."
2,2.0,People who have stage II NSCLC and are healthy...
3,3.0,Treatment for stage IIIA NSCLC may include som...
4,4.0,Stage IV NSCLC is widespread when it is diagno...


In [245]:
# Crawled pages stored in stage_0.csv; the displayed frame carries
# url, title, snippet and text columns.
crawled = pd.read_csv(filepath_or_buffer='stage_0.csv')
crawled

Unnamed: 0,url,title,snippet,text
0,https://www.cancer.org/cancer/lung-cancer/trea...,Non-small Cell Lung Cancer Treatment by Stage,"Oct 1, 2019 ... For some stage 0 cancers, trea...","What cancer patients, their families, and care..."
1,https://www.verywellhealth.com/stage-0-non-sma...,Understanding Stage 0 Lung Cancer,"Nov 15, 2019 ... Stage 0 Non-Small Cell Lung C...","Stage 0 non-small cell lung cancer , also know..."
2,https://www.onhealth.com/content/1/lung_cancer,"Lung Cancer Symptoms, Stages, Treatment","Jul 22, 2016 ... This form of cancer tends to ...",Lung cancer has emerged as the leading killer ...
3,https://www.cancer.net/cancer-types/lung-cance...,Lung Cancer - Non-Small Cell: Stages | Cancer.Net,This is because lung cancer is different in ea...,"Skip to Content,,Search,Menu,ON THIS PAGE: You..."
4,https://www.cancer.ca/en/cancer-information/ca...,Treatments for stage 0 non–small cell lung can...,The following are treatment options for stage ...,CCS is actively monitoring and responding to t...
5,https://clinicaltrials.gov/ct2/show/NCT00526461,Photodynamic Therapy Using HPPH in Treating Pa...,... Using HPPH in Treating Patients With Stage...,Study record managers: refer to the Data Eleme...
6,https://www.webmd.com/lung-cancer/qa/how-is-su...,How is surgery used to treat Stage 0 Lung Cancer?,"This might sound extreme, but you can live a n...",",,ANSWER,If your health is good overall, surge..."
7,https://www.cancercenter.com/cancer-types/lung...,Understand how Lung Cancer is Staged and Grade...,"Lung cancer stage is based on tumor size, loca...","Call us 24/7,Experiencing symptoms that concer..."
8,https://www.cancer.gov/publications/dictionari...,Definition of stage 0 non-small cell lung carc...,These abnormal cells may become cancer and spr...,"The NCI Dictionary of Cancer Terms features 8,..."


In [128]:
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import spacy
import re
# Shared spaCy pipeline, loaded without the parser and NER components.
# The 'en' shortcut link only exists in spaCy v2; v3 removed shortcut links
# and raises OSError for it, so fall back to the full package name there.
try:
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [137]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize pre-tokenized sentences, keeping only selected parts of speech.

    Each item of ``texts`` is joined with single spaces and passed through the
    module-level spaCy pipeline ``nlp``. A token's lemma is kept when its
    coarse part-of-speech tag (``token.pos_``) is in ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable whose items are iterables of word strings.
        allowed_postags: POS tags whose lemmas are retained.

    Returns:
        A list with one list of lemma strings per item of ``texts``.
    """
    return [
        [tok.lemma_ for tok in nlp(" ".join(words)) if tok.pos_ in allowed_postags]
        for words in texts
    ]


In [227]:
def text_cleaner(text, remove_short_words=False, remove_xml_tag=False, return_tokens=False, lemmatizing=False):
    """Normalize a piece of text and drop stop words.

    The text is lower-cased, optionally stripped of markup, scrubbed with a
    series of regex substitutions, tokenized with the module-level spaCy
    pipeline ``nlp``, and filtered for stop words and whitespace tokens.

    Args:
        text: the string to clean.
        remove_short_words: when True, drop words shorter than 2 characters.
        remove_xml_tag: when True, parse the text with BeautifulSoup ("lxml")
            and keep only its text content.
        return_tokens: when True, return a list of word strings; otherwise
            return the words joined with single spaces.
        lemmatizing: when True, use each token's lower-cased lemma (skipping
            '-PRON-', the placeholder lemma spaCy v2 assigns to pronouns);
            otherwise use the token's surface text.

    Returns:
        ``list[str]`` if ``return_tokens`` else ``str``.
    """
    cleaned = text.lower()

    if remove_xml_tag:
        cleaned = BeautifulSoup(cleaned, "lxml").text

    # Drop every whitespace-delimited chunk with an inner dot. This targets
    # links and domains but also hits things like "3.5". `\S` (rather than the
    # former `[^ ]`) keeps the match from running across a line break and
    # swallowing the first word of the next line.
    cleaned = re.sub(r'\S+\.\S+', '', cleaned)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)          # Remove any text inside parentheses ()
    cleaned = re.sub('"', '', cleaned)                   # Remove quotation marks
    cleaned = re.sub(r"['’]s\b", "", cleaned)            # Remove 's possessive (straight or typographic apostrophe)
    cleaned = re.sub(r'\n+', ' ', cleaned)               # Replace runs of newlines with one space
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", cleaned)      # Keep only ASCII letters and digits

    doc = nlp(cleaned, disable=['parser', 'ner'])
    # The substitutions above leave runs of spaces, which spaCy emits as
    # whitespace tokens; drop them together with the stop words.
    kept = [tok for tok in doc if not tok.is_stop and not tok.is_space]

    # Always continue with plain strings: spaCy Token objects cannot be passed
    # to str.join and should not leak to callers.
    if lemmatizing:
        words = [tok.lemma_.lower().strip() for tok in kept if tok.lemma_ != '-PRON-']
    else:
        words = [tok.text for tok in kept]

    if remove_short_words:
        words = [word for word in words if len(word) >= 2]

    return words if return_tokens else ' '.join(words)

In [228]:
from nltk.tokenize import sent_tokenize

In [229]:
def sentenize_clean_up(list_of_str, return_tokens=True, debug=False, lemmatizing=False):
    """Split texts into sentences and clean each sentence with ``text_cleaner``.

    Args:
        list_of_str: iterable of strings (a list or a pandas Series both work).
        return_tokens: forwarded to ``text_cleaner``; True yields one list of
            words per sentence, False yields one cleaned string per sentence.
        debug: when True, print the first 100 characters/items of the first
            input, the first sentence and the first cleaned result, whenever
            each of those exists.
        lemmatizing: forwarded to ``text_cleaner``.

    Returns:
        A flat list with one cleaned entry per sentence across all inputs.
    """
    # Materialize first: a pandas Series is indexed by label, so
    # `list_of_str[0]` would raise KeyError unless label 0 happens to exist.
    documents = list(list_of_str)
    if debug and documents: print(f'Original: {documents[0][:100]}')

    sentences = [sentence for document in documents for sentence in sent_tokenize(document)]
    if debug and sentences: print(f'Sentenize: {sentences[0][:100]}')

    cleaned_texts = [
        text_cleaner(sentence, remove_short_words=True, remove_xml_tag=True,
                     return_tokens=return_tokens, lemmatizing=lemmatizing)
        for sentence in sentences
    ]
    if debug and cleaned_texts: print(f'Cleaned: {cleaned_texts[0][:100]}')
    return cleaned_texts

In [249]:
# Run the cleaning pipeline on the first crawled page, echoing each stage.
first_page_text = crawled['text'][0]
cleaned_text = sentenize_clean_up(
    [first_page_text],
    return_tokens=True,
    debug=True,
    lemmatizing=True,
)

Original: What cancer patients, their families, and caregivers need to know about the coronavirus . How COVID-
Sentenize: What cancer patients, their families, and caregivers need to know about the coronavirus .
Cleaned: ['cancer', 'patient', 'family', 'caregiver', 'need', 'know', 'coronavirus']


In [242]:
# Vocabulary of the stage 0 ground truth: clean every sentence, then flatten
# the per-sentence token lists into a single list.
stage_0_texts = data[data['stage_level']==0]['text']
accepted_tokens = []
for sentence_tokens in sentenize_clean_up(stage_0_texts, lemmatizing=True, debug=False, return_tokens=True):
    accepted_tokens.extend(sentence_tokens)

print(accepted_tokens[:10])

['stage', 'nsclc', 'limit', 'lining', 'layer', 'airways', 'invade', 'deep', 'lung', 'tissue']


In [253]:
# Distinct tokens of the first cleaned sentence.
{*cleaned_text[0]}

{'cancer', 'caregiver', 'coronavirus', 'family', 'know', 'need', 'patient'}

### Remove irrelevant sentences

In [305]:
def check_irrelevance(long_str: str, allowed_tokens: list, min_overlap: int = 1, debug: bool = True):
    """Return the sentences of ``long_str`` that share vocabulary with ``allowed_tokens``.

    ``long_str`` is split into sentences and cleaned via
    ``sentenize_clean_up`` (tokens returned, lemmatizing on). A sentence is
    kept when at least ``min_overlap`` of its distinct tokens also occur in
    ``allowed_tokens``.

    Args:
        long_str: raw text to filter.
        allowed_tokens: reference vocabulary; duplicates are ignored.
        min_overlap: minimum number of distinct shared tokens needed to keep a
            sentence. The default of 1 reproduces the original "any overlap"
            rule, which is permissive: one common word such as 'cancer' or
            'stage' is enough to keep a sentence.
        debug: when True (the default, matching the original behaviour), print
            the reference set and each sentence's token set.

    Returns:
        A list of token lists, one per retained sentence, in original order.

    Raises:
        ValueError: if ``min_overlap`` is smaller than 1.
    """
    if min_overlap < 1:
        raise ValueError(f'min_overlap must be >= 1, got {min_overlap}')

    accepted_tokens_set = set(allowed_tokens)
    if debug: print(accepted_tokens_set)

    tokens_list = sentenize_clean_up([long_str], debug=False, return_tokens=True, lemmatizing=True)

    article = []
    for tokens in tokens_list:
        token_set = set(tokens)
        if debug: print(token_set)

        if len(token_set & accepted_tokens_set) >= min_overlap:
            article.append(tokens)
    return article

In [306]:
# Short excerpt for the relevance check: the first 312 characters of the
# text of the crawled row labelled 3.
fourth_page_text = crawled['text'][3]
test_str = str(fourth_page_text[:312])
test_str

'Skip to Content,,Search,Menu,ON THIS PAGE: You will learn about how doctors describe a cancer’s growth or spread. This is called the stage. Use the menu to see other pages.,Staging is a way of describing where the cancer is located, if or where it has spread, and whether it is affecting other parts of the body.'

In [307]:
# Filter the excerpt against the stage 0 vocabulary.
check_irrelevance(long_str=test_str, allowed_tokens=accepted_tokens)

{'airways', 'sleeve', 'remove', 'healthy', 'treat', 'deep', 'wedge', 'invade', 'area', 'case', 'lung', 'alternative', 'therapy', 'lobe', 'segmentectomy', 'photodynamic', 'radiation', 'surgery', 'laser', 'hard', 'completely', 'truly', 'cure', 'curable', 'need', 'lining', 'layer', 'nsclc', 'limit', 'stage', 'entire', 'treatment', 'location', 'usually', 'resection', 'brachytherapy', 'chemotherapy', 'cancer', 'tissue'}
{'page', 'skip', 'doctor', 'describe', 'spread', 'growth', 'cancer', 'search', 'content', 'learn', 'menu'}
{'call', 'stage'}
{'locate', 'part', 'body', 'describe', 'way', 'spread', 'use', 'affect', 'cancer', 'menu'}


[['skip',
  'content',
  'search',
  'menu',
  'page',
  'learn',
  'doctor',
  'describe',
  'cancer',
  'growth',
  'spread'],
 ['call', 'stage'],
 ['use',
  'menu',
  'way',
  'describe',
  'cancer',
  'locate',
  'spread',
  'affect',
  'part',
  'body']]