In [19]:
import os
import numpy as np
import pandas as pd
import re
import nltk
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
df = pd.DataFrame(columns=['PMID', 'Title', 'Class', 'Class_id','Abstract'])
list_cancer = ['lung','breast','colorectal','prostate','stomach','liver','oesophagus','cervixuteri','thyroid','bladder']

In [21]:
for i in range(len(list_cancer)):
    #Read txt
    data_txt = 'pubmed-'+list_cancer[i]+'-set.txt' 
    temp_abs = ''
    temp_id = ''
    
    #Read csv
    data_csv = 'csv-'+list_cancer[i]+'-set.csv'
    dff = pd.read_csv(data_csv)
    dff = dff[['PMID','Title']]
    label_id = i+1
    label_name = list_cancer[label_id-1]
    dff['Class'] = label_name
    dff['Class_id'] = label_id
    dff['Abstract']='-'
    dff['Title_SingleLine']='-'
    dff['Abstract_SingleLine']='-'
    
    with open(data_txt, encoding="utf8") as each:
        line = each.read()
        content = line.split('\n')
        for j in range(len(content)):
            if(content[j][:4] == 'PMID'):
                temp_id = int(content[j][6:])
            if(content[j][:2] == 'TI'):
                temp_ti = content[j][6:]
                temp_ti = temp_ti.replace('.','')
                dff.loc[dff['PMID'] == temp_id,  'Title_SingleLine'] = temp_ti
                temp_ti=''
            if(content[j][:2] == 'AB'):
                temp_abs = content[j][6:]
                dff.loc[dff['PMID'] == temp_id,  'Abstract_SingleLine'] = temp_abs
                flag = j+1
                while(content[flag][0]==' '):
                    temp = content[flag].strip()
                    temp_abs = temp_abs + ' ' +  temp
                    flag+=1
                dff.loc[dff['PMID'] == temp_id,  'Abstract'] = temp_abs                
                temp_id='' 
                temp_abs=''
                
        df = pd.concat([df,dff])

In [22]:
df = df.reset_index(drop=True)
df

Unnamed: 0,PMID,Title,Class,Class_id,Abstract,Title_SingleLine,Abstract_SingleLine
0,29790681,Multiple primary lung cancer: A literature review,lung,1,"Nowadays, lung cancer is a leading cause of de...",Multiple primary lung cancer: A literature review,"Nowadays, lung cancer is a leading cause of de..."
1,27261907,Epidemiology of Lung Cancer,lung,1,Lung cancer has been transformed from a rare d...,Epidemiology of Lung Cancer,Lung cancer has been transformed from a rare d...
2,29635240,Heterogeneity in Lung Cancer,lung,1,Lung cancer diagnosis is a challenge since it ...,Heterogeneity in Lung Cancer,Lung cancer diagnosis is a challenge since it ...
3,30955514,Lung Cancer,lung,1,Lung cancer is the world's leading cause of ca...,Lung Cancer,Lung cancer is the world's leading cause of ca...
4,26667338,Lung Cancer in Never Smokers,lung,1,Lung cancer is predominantly associated with c...,Lung Cancer in Never Smokers,Lung cancer is predominantly associated with c...
...,...,...,...,...,...,...,...
99995,22779969,Primary and secondary malignant involvement of...,bladder,10,The pathological analysis of cystectomy specim...,Primary and secondary malignant involvement of...,The pathological analysis of cystectomy specim...
99996,31992742,Expression of L-type amino acid transporter 1 ...,bladder,10,L-type amino acid transporter 1 (LAT1) plays a...,Expression of L-type amino acid transporter 1 ...,L-type amino acid transporter 1 (LAT1) plays a...
99997,29191126,Intravesical radiofrequency induced hypertherm...,bladder,10,INTRODUCTION: Non-muscle invasive bladder canc...,Intravesical radiofrequency induced hypertherm...,INTRODUCTION: Non-muscle invasive bladder canc...
99998,9757194,[Recent controversy in treatment for advanced ...,bladder,10,We summarized here the current status and cont...,[Recent controversy in treatment for advanced ...,We summarized here the current status and cont...


In [23]:
df.isna().sum()

PMID                   0
Title                  1
Class                  0
Class_id               0
Abstract               0
Title_SingleLine       0
Abstract_SingleLine    0
dtype: int64

### 1. (single Line)(No Stopwords): Find the most common 20 words within the "title"

In [24]:
Question_1 = df['Title_SingleLine']
Question_1

0        Multiple primary lung cancer: A literature review
1                              Epidemiology of Lung Cancer
2                             Heterogeneity in Lung Cancer
3                                              Lung Cancer
4                             Lung Cancer in Never Smokers
                               ...                        
99995    Primary and secondary malignant involvement of...
99996    Expression of L-type amino acid transporter 1 ...
99997    Intravesical radiofrequency induced hypertherm...
99998    [Recent controversy in treatment for advanced ...
99999    The D2 dopamine receptor gene and nicotine dep...
Name: Title_SingleLine, Length: 100000, dtype: object

In [25]:
nlp = spacy.load('en')

In [26]:
all_stopwords = nlp.Defaults.stop_words

In [27]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = str(doc).lower()
    doc = re.sub(r'[^a-z\s]', '', doc, re.I|re.A)
    doc = doc.strip()
    # tokenize document
    text = doc.split()
    # filter stopwords out of document
    filtered_tokens = [word for word in text if not word in all_stopwords]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [28]:
Question_1_after = Question_1.apply(normalize_document)
Question_1_after

0           multiple primary lung cancer literature review
1                                 epidemiology lung cancer
2                                heterogeneity lung cancer
3                                              lung cancer
4                                      lung cancer smokers
                               ...                        
99995    primary secondary malignant involvement gynaec...
99996    expression ltype amino acid transporter molecu...
99997    intravesical radiofrequency induced hypertherm...
99998    recent controversy treatment advanced bladder ...
99999    d dopamine receptor gene nicotine dependence b...
Name: Title_SingleLine, Length: 100000, dtype: object

In [29]:
tokens_1 = [item.split() for item in Question_1_after]
words_1 = [word for sentence in tokens_1 for word in sentence]
result_1 = Counter(words_1)

In [30]:
Result1 = pd.DataFrame(result_1.most_common(20))
Result1.columns = ['Word','Count']
Result1.sort_values('Count', ascending=False)

Unnamed: 0,Word,Count
0,cancer,58833
1,breast,9276
2,treatment,8737
3,lung,8631
4,prostate,8396
5,colorectal,8240
6,thyroid,7882
7,patients,7795
8,carcinoma,7734
9,bladder,7387


### 2. (single Line)(No Stopwords): Find the most 20 common words within the "Abstract" and show the differences top 20 common words between "Title" and "Abstract"

In [31]:
Question_2 = df['Abstract_SingleLine']
Question_2

0        Nowadays, lung cancer is a leading cause of de...
1        Lung cancer has been transformed from a rare d...
2        Lung cancer diagnosis is a challenge since it ...
3        Lung cancer is the world's leading cause of ca...
4        Lung cancer is predominantly associated with c...
                               ...                        
99995    The pathological analysis of cystectomy specim...
99996    L-type amino acid transporter 1 (LAT1) plays a...
99997    INTRODUCTION: Non-muscle invasive bladder canc...
99998    We summarized here the current status and cont...
99999    Multiple twin, family, and genetic studies hav...
Name: Abstract_SingleLine, Length: 100000, dtype: object

In [32]:
nlp = spacy.load('en', disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

In [33]:
all_stopwords = nlp.Defaults.stop_words
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = str(doc).lower()
    doc = re.sub(r'[^a-z\s]', '', doc, re.I|re.A)
    doc = doc.strip()
    # tokenize document
    text = doc.split()
    # filter stopwords out of document
    filtered_tokens = [word for word in text if not word in all_stopwords]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [34]:
Question_2_after = Question_2.apply(normalize_document)
Question_2_after

0        nowadays lung cancer leading cause death men w...
1        lung cancer transformed rare disease global pr...
2               lung cancer diagnosis challenge frequently
3        lung cancer worlds leading cause cancer death ...
4        lung cancer predominantly associated cigarette...
                               ...                        
99995    pathological analysis cystectomy specimens fem...
99996    ltype amino acid transporter lat plays role tr...
99997    introduction nonmuscle invasive bladder cancer...
99998    summarized current status controversy surround...
99999    multiple twin family genetic studies rendered ...
Name: Abstract_SingleLine, Length: 100000, dtype: object

In [35]:
tokens_2 = [item.split() for item in Question_2_after]
words_2 = [word for sentence in tokens_2 for word in sentence]
result_2 = Counter(words_2)

In [37]:
Result2 = pd.DataFrame(result_2.most_common(20))
Result2.columns = ['Word','Count']
Result2.sort_values('Count', ascending=False)

Unnamed: 0,Word,Count
0,cancer,27570
1,background,9477
2,patients,7684
3,study,4820
4,objective,4614
5,purpose,4582
6,thyroid,4450
7,prostate,4224
8,carcinoma,4181
9,breast,4055


In [38]:
Result2 = pd.merge(Result1,Result2, left_index=True, right_index=True)

In [42]:
Result2.columns = ['Title_Word','Count_TitleWord','Abstract_Word','Count_AbstractWord']
Result2

Unnamed: 0,Title_Word,Count_TitleWord,Abstract_Word,Count_AbstractWord
0,cancer,58833,cancer,27570
1,breast,9276,background,9477
2,treatment,8737,patients,7684
3,lung,8631,study,4820
4,prostate,8396,objective,4614
5,colorectal,8240,purpose,4582
6,thyroid,7882,thyroid,4450
7,patients,7795,prostate,4224
8,carcinoma,7734,carcinoma,4181
9,bladder,7387,breast,4055


### 3. (multi-lines)(No Stopwords): Find the most 20 common words the "title" and "Abstract" and show the differences top 20 common words between "Title" and "Abstract"

In [86]:
Question_3 = df['Title'] + ' ' + df['Abstract']

In [87]:
Question_3[:2]

0    Multiple primary lung cancer: A literature rev...
1    Epidemiology of Lung Cancer Lung cancer has be...
dtype: object

In [88]:
nlp = spacy.load('en', disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

In [89]:
all_stopwords = nlp.Defaults.stop_words
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = str(doc).lower()
    doc = re.sub(r'[^a-z\s]', '', doc, re.I|re.A)
    doc = doc.strip()
    # tokenize document
    text = doc.split()
    # filter stopwords out of document
    filtered_tokens = [word for word in text if not word in all_stopwords]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [90]:
Question_3_after = Question_3.apply(normalize_document)
Question_3_after[:2]

0    multiple primary lung cancer literature review...
1    epidemiology lung cancer lung cancer transform...
dtype: object

In [91]:
tokens_3 = [item.split() for item in Question_3_after]
words_3 = [word for sentence in tokens_3 for word in sentence]
result_3 = Counter(words_3)

In [92]:
Result3 = pd.DataFrame(result_3.most_common(20))
Result3.columns = ['Word','Count']
Result3.sort_values('Count', ascending=False)

Unnamed: 0,Word,Count
0,cancer,295584
1,patients,168252
2,treatment,59013
3,survival,47546
4,prostate,46365
5,thyroid,44130
6,results,43775
7,p,42386
8,tumor,41995
9,bladder,41446


In [102]:
Result3 = pd.merge(Result2,Result3, left_index=True, right_index=True)
Result3.columns = ['Title_Word','Count_TitleWord','Abstract_Word','Count_AbstractWord','Title&Abstract_Word','Count_Title&Abstract']
Result3

Unnamed: 0,Title_Word,Count_TitleWord,Abstract_Word,Count_AbstractWord,Title&Abstract_Word,Count_Title&Abstract
0,cancer,58833,cancer,27570,cancer,295584
1,breast,9276,background,9477,patients,168252
2,treatment,8737,patients,7684,treatment,59013
3,lung,8631,study,4820,survival,47546
4,prostate,8396,objective,4614,prostate,46365
5,colorectal,8240,purpose,4582,thyroid,44130
6,thyroid,7882,thyroid,4450,results,43775
7,patients,7795,prostate,4224,p,42386
8,carcinoma,7734,carcinoma,4181,tumor,41995
9,bladder,7387,breast,4055,bladder,41446


### 4. (multi-lines)(No Stopwords)(Lemmatization): Compare the most 20 common words in the within the "title" and "Abstract"

In [93]:
Question_4 = df['Title'] + ' ' +  df['Abstract']
Question_4[:2]

0    Multiple primary lung cancer: A literature rev...
1    Epidemiology of Lung Cancer Lung cancer has be...
dtype: object

In [94]:
nlp = spacy.load('en', disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

In [95]:
def normalize2_document(doc):
    # lower case and remove special characters\whitespaces
    doc = str(doc).lower()
    doc = re.sub(r'[^a-z\s]', '', doc, re.I|re.A)
    doc = doc.strip()
    # tokenize document
    text = nlp(doc)
    # filter stopwords out of document
    filtered_tokens = [token.lemma_ for token in text if not token.is_stop]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [99]:
Question_4_after = Question_4.apply(normalize2_document)
Question_4_after[:2]

0    multiple primary lung cancer literature review...
1    epidemiology lung cancer lung cancer transform...
dtype: object

In [100]:
tokens_4 = [item.split() for item in Question_4_after]
words_4 = [word for sentence in tokens_4 for word in sentence]
result_4 = Counter(words_4)

In [101]:
Result4 = pd.DataFrame(result_4.most_common(20))
Result4.columns = ['Word','Count']
Result4.sort_values('Count', ascending=False)

Unnamed: 0,Word,Count
0,cancer,311697
1,patient,186075
2,study,64004
3,treatment,63040
4,tumor,61015
5,cell,57895
6,result,50138
7,survival,47970
8,prostate,46467
9,thyroid,44193
