In [1]:
import requests #Used for api calls
import json #Used for loading json file
import xmltodict  #Used to convert xml to json
import gensim  #LDA library
from gensim.utils import simple_preprocess #LDA library
from gensim.parsing.preprocessing import STOPWORDS #LDA library
import numpy as np  
import re  #Used for processing text (removing punctuations/empty spaces)
import copy #Used to copy a variable without referencing to the same address (deepcopy)
from nltk.stem import WordNetLemmatizer, SnowballStemmer #nltk use for pre-processing the abstract
np.random.seed(2018)

In [2]:
'''
Description: Lemmatize words
Parameter: a string
Return: lemmatized string
'''
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    result = stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    return result

# Import spacy library and dictionary for scientic terms

In [4]:
import spacy
import scispacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
punctuations = string.punctuation
parser = English()
stopwords = list(STOP_WORDS)

'''
Description: Tokenize a document 
Parameter: A list of string
Return: tokenized list of string
Note: It does not remove digit by default, any word length less than 4 will be removed
'''
def spacy_tokenizer(document):
    mytokens = parser(document)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations and len(word) > 3]
    return mytokens

#Load the scispacy dictionary download from  https://allenai.github.io/scispacy/
#We are using the large (lg) dictionary, it is optional to use small (sm) or medium (md) size dictionary which can be found in the website
nlp = spacy.load("en_core_sci_lg")

## Reading API and extract abstract from the link

In [5]:
'''
Description: Search PubMed article ids using api calls 
Parameter: term to search, maximum number of article return by the request
Return: a json object containing all articles
'''
def search_pubmed(search_term,max_article):
    #Document on api calls https://www.ncbi.nlm.nih.gov/books/NBK25499/
    #db = Databse (currently using PubMed), term = Searching term, retmax = Number of maximum abstract return from the request, 
    #sort = sort based on ascending of article id, retmode = return type of request
    search_term = search_term.replace(" ","+") #format the search term properly to fit in the url
    response = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=PubMed&term="+search_term+"&retmax="+str(max_article)+
                            "&sort&retmode=json")
    data = response.text  #Convert to text 
    data = json.loads(data) #make the json format (string type) into a JSON object
    return data

'''
Description: Generate a string which contains all articles ids
Parameter: idlist from the api call
Return: a string containing all articles ids
'''
def generate_uilist(uilist):
    id = ""
    for i in range (len(uilist)):
        if i < len(uilist)-1:
            id = id + uilist[i] + ","
        else:
            id = id + uilist[i]
    return id

'''
Description: Retrieve article from PubMed base on the ids
Parameter: a list of ids
Return: a json object containing all the information of each article
'''
def retrieve_article(id):
    pload = {'db':'PubMed','id':str(id),'retmode':'xml'}
    api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    response1 = requests.post(api,data = pload)
    data1 = response1.text  #Convert to text
    my_dict1=xmltodict.parse(data1)  #parse data into dictionary (in order to convert it to json string)
    json_data1=json.dumps(my_dict1,indent=4)  #convert into json format (but in string type), indent = 4 is for pretty print
    data1 = json.loads(json_data1) #make the json format (string type) into a JSON object
    return data1

'''
Description: Retrieve the abstarct data from the json object returned from PubMed and append it to a list.
Parameter: a json object
Return: a list of abstracts
'''
def extract_abstracts(data1,uilist):
    abstract_list = []  # All abstract will be stored in this list for futher processing
    for j in range (len(uilist)):
        try:
            #Access the abstact data of type article
            if "Abstract" in data1["PubmedArticleSet"]["PubmedArticle"][j]["MedlineCitation"]["Article"]:
                abstract_text = data1["PubmedArticleSet"]["PubmedArticle"][j]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
                #Each article has their own format of abstract therefore mutiple if statement is needed to catch every format
                #So far we have found 3 different format and we are not sure if PubMed will have new article that has different format, therefore
                #skip the abstract if the abstract format cannot be recognised to avoid crashing
                if type(abstract_text) == list:
                    combine_abstract = ""
                    for i in range(len(abstract_text)):
                        if type(abstract_text[i]) == dict:
                            combine_abstract += abstract_text[i]["#text"]
                        else:
                            combine_abstract += abstract_text[i]
                    abstract_list.append(combine_abstract)
                elif type(abstract_text) == dict:
                    combine_abstract = ""
                    combine_abstract += abstract_text["#text"]
                    abstract_list.append(combine_abstract)
                elif abstract_text is not None:
                    abstract_list.append(abstract_text)
                else:
                    #Skip if it does not belong to any format listed above
                    continue
        except:
            #Access the abstact data of type book
            for k in range (len(data1["PubmedArticleSet"]["PubmedBookArticle"])):
                if "Abstract" in data1["PubmedArticleSet"]["PubmedBookArticle"][k]["BookDocument"]:
                    abstract_text = data1["PubmedArticleSet"]["PubmedBookArticle"][k]["BookDocument"]["Abstract"]["AbstractText"]
                    #Each book has their own format of abstract therefore mutiple if statement is needed to catch every format
                    #So far we have found 3 different format and we are not sure if PubMed will have new book that has different format, therefore
                    #skip the abstract if the abstract format cannot be recognised to avoid crashing
                    if type(abstract_text) == list:
                        combine_abstract = ""
                        for i in range(len(abstract_text)):
                            combine_abstract += abstract_text[i]["#text"]
                        abstract_list.append(combine_abstract)
                    elif type(abstract_text) == dict:
                        combine_abstract = ""
                        combine_abstract += abstract_text["#text"]
                        abstract_list.append(combine_abstract)
                    elif abstract_text is not None:
                        abstract_list.append(abstract_text)
                    else:
                        #Skip if it does not belong to any format listed above
                        continue
            break
    return abstract_list


#search the term in pubmed
data = search_pubmed("colorectal risk",3000)

#Append all abstract id into a list
uilist = data["esearchresult"]["idlist"]

#get all the article ids
id = generate_uilist(uilist)

#retrieve all the article using the ids
data1 = retrieve_article(id)

#extract the abstracts out of the json object in data1
abstract_list = extract_abstracts(data1,uilist)

print(len(abstract_list))


2508


# Preprocess the abstract and make it into list of words

In [6]:
'''
Description: Pre-process the abstract list using the scispacy dictionary (nlp) and spacy_tokenizer function
Parameter: list of abstracts
Return: -
'''
def preprocess_abstracts(abstract_list):
    processed_texts = []
    for document in abstract_list:
        if(document):
            processed = nlp(document)  
            entities = processed.ents
            mytokens = spacy_tokenizer(str(entities))
            processed_texts.append(mytokens)
    return processed_texts

processed_texts = preprocess_abstracts(abstract_list)
#Initialization of bigram and trigram for further pre-processing of the abstracts
bigram = gensim.models.Phrases(processed_texts,min_count=5,threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[processed_texts],threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

'''
Description: To generate new word if there is two or more words are related
Parameter: a list of string
Return: a new list of string that have more words with different meaning
Example: ["family","history"] => ["family","history","family_history"]
'''
def make_trigram(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

#pass in the previously processed abstract data into trigram for further processing
data_words_bigrams = make_trigram(processed_texts)
processed_texts = data_words_bigrams

print('\n\nTokenized and lemmatized document: ')
print(processed_texts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Create a dictionary (bag of words)

In [7]:
#create a dictionary (bag of words)
dictionary = gensim.corpora.Dictionary(processed_texts)
dictionary.save("mydictionary")

# Running LDA algorithm

In [8]:
'''
Description: Building the LDA model and converting the output from LDA model to a list of keywords from each topic
Parameter: the filename of the dictionary that saved locally, number of topics desired to generate

Return: a list of keywords from each topic, Lda model and bow_corpus
'''
def ldaModelling(dictionaryName, topicNum):
    dictionary = gensim.corpora.Dictionary.load(dictionaryName) #loading the dictionary
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_texts] #creating the bag of words using the dictionary and pre-processed data
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=topicNum, id2word=dictionary, passes=200, workers=4, 
                                           random_state=1, chunksize=100, alpha=1, eta=0.02) #Running LDA
    mylist = []  #saving all the words in each topics into a list for our labelling part

    #Loop through generated topics and append words inside each topic into a list
    for idx, topic in lda_model.print_topics(-1):
        a = topic
        result = ''.join([i for i in a if not i.isdigit()])  #Removing all digit 
        res = re.sub(r'[^\w\s]', '', result)  #removing all punctuations
        res = res.split("  ") #spilt by two empty spaces
        for i in range(len(res)):
            res[i] = lemmatize_stemming(res[i]) #lemmatize all the words inside the list after splitting
        mylist.append(res) #append the list into a list
        print('Topic: {} \nWords: {}'.format(idx, topic)) 
    return mylist,lda_model,bow_corpus

mylist,lda_model,bow_corpus = ldaModelling("mydictionary", 40)
print(mylist)

Topic: 0 
Words: 0.059*"tumor" + 0.054*"cells" + 0.044*"cell" + 0.027*"activity" + 0.026*"growth" + 0.022*"therapeutic" + 0.018*"immune" + 0.015*"human" + 0.015*"study" + 0.013*"induced"
Topic: 1 
Words: 0.072*"polyps" + 0.061*"lesions" + 0.061*"adenoma" + 0.055*"colorectal" + 0.047*"adenomas" + 0.044*"surveillance" + 0.030*"size" + 0.027*"polyp" + 0.020*"dysplasia" + 0.018*"risk"
Topic: 2 
Words: 0.200*"years" + 0.070*"risk" + 0.057*"individuals" + 0.032*"aged" + 0.029*"increased" + 0.028*"death" + 0.020*"older" + 0.018*"risks" + 0.018*"cumulative" + 0.016*"younger"
Topic: 3 
Words: 0.058*"health" + 0.043*"care" + 0.019*"adherence" + 0.018*"symptoms" + 0.017*"system" + 0.015*"survey" + 0.015*"primary" + 0.015*"covid-19" + 0.011*"recommendations" + 0.011*"healthcare"
Topic: 4 
Words: 0.580*"cancer" + 0.372*"colorectal" + 0.006*"subgroups" + 0.002*"americans" + 0.001*"unhealthy" + 0.001*"tobacco_smoking" + 0.001*"ebf1" + 0.001*"participate" + 0.001*"expressing" + 0.001*"nlrp3"
Topic: 5 

# Building dictionary for auto labelling

In [9]:
#New words/label can be updated in here
topic_dictionary = {
    "Lifestyle" : ["nafld","physical","exercise","alcohol","inactive","smoking","diet","lifestyle","sedentary","drink","tobacco","cigarette","meat","frying","grill","broiling"],
    "Gene Mutation" : ["obesity","diabetes","mutation","polyps","adenoma","overweight","obese","dyplasia","adenocarcinoma","carcinoma","premalignant","lesions","polymorphism"],
    "Genetic" : ["gene","family_history","lynch_syndrome","familial","adenomatous","polyposis","FAP","inheritance","HNPCC","hereditary","Turcot Syndrome","PJS","raial","ethnic","AFAP","MAP","family","associated"],
    "Aged" : ["age","old","older","elderly","elder","adult"],
    "Personal History" : ["metastatic","bowel","inflammatory_bowel","inflammation","inflammatory","bowel","personal history","IBD","ulcerative colitis","crohn's","colitis","recurrence","metastasis","metastases","diverticulitis"] 
}

topic_dicitonary_copy = copy.deepcopy(topic_dictionary) #Create a copy of the dictionary before lemmatizing it 


'''
Description: Lemmatize the dictionary so that the words will match the input which is also lemmatized
Parameter: the dictionary builded manually 
Return: a lemmatized dictionary
'''
def lemmatize_topic_dictionary(topic_dictionary):
    for item in topic_dictionary:
        for i in range(len(topic_dictionary[item])):
            topic_dictionary[item][i] = lemmatize_stemming(topic_dictionary[item][i])
    return topic_dictionary

topic_dictionary = lemmatize_topic_dictionary(topic_dictionary)

'''
Description: Retrieve the keywords from each generated topic and classifying each topic into each label based on the dictionary builded manually
Parameter: the dictionary builded manually , a copy of dictionary builded manually before lemmatize
Return: a dictionary which contains labelled topics (result topic from LDA topic modelling).
'''
def get_keywords(topic_dictionary,topic_dicitonary_copy,mylist):
    #List to store all the label belongs to each topic after running the LDA
    topics = []

    #A dictionary to store all keywords for each label
    words_in_topics = {
        "Lifestyle" : [],
        "Gene Mutation" : [],
        "Genetic" : [],
        "Aged" : [],
        "Personal History" : []
    }

    for item in mylist: 
        match_counter = 0 #A counter to store the word that 
        topic_in_dictionary = []  #A list to store all the matched
        for topic in topic_dictionary:

            #Initialise counter to keep track how many times a keyword matched in the topic
            temp_match_counter = 0

            #Loop through each keywords in the topic
            for i in range(len(topic_dictionary[topic])):
                if topic_dictionary[topic][i] in item:
                    #how many times it matched 
                    temp_match_counter += 1

                    #This part is to save all the keywords that belong to each label so that we can present it in our website
                    #Replace the underscore to dash because the words generated by LDA in each topic will have underscore if the keyword consist of more than two words
                    #For example "lynch_syndrome" to "lynch-syndrome"
                    #For better visualise of word in the website
                    word = topic_dicitonary_copy[topic][i].replace('_','-')

                    #Check that if words inside each topic exist in which label in our dictionary and save it to words_in_topics
                    #To avoid duplicate keyword in each topic, so if the word does not exist inside the list then only append
                    if word not in words_in_topics[topic]:
                        words_in_topics[topic].append(word)

            #The keywords might fall into different label, therefore we compare the numbers of matching in each label and 
            #decide where it should belong to by taking the maximum number of count
            if temp_match_counter != 0 and temp_match_counter >= match_counter:
                match_counter = temp_match_counter
                topic_in_dictionary.append(topic)

        #If the length of the topic_in_dictionary list is 0, that mean it does not match anything so it is irrelevant
        #else append into the topics list
        if len(topic_in_dictionary) == 0:
            topics.append("irrelevant")
        else:
            topics.append(topic_in_dictionary)
    return words_in_topics,topics

words_in_topics,topics = get_keywords(topic_dictionary,topic_dicitonary_copy,mylist)
print(topics)
print(words_in_topics)


['irrelevant', ['Gene Mutation'], ['Aged'], 'irrelevant', 'irrelevant', 'irrelevant', ['Personal History'], 'irrelevant', ['Lifestyle', 'Personal History'], ['Gene Mutation', 'Genetic'], 'irrelevant', ['Lifestyle'], ['Lifestyle'], ['Gene Mutation', 'Genetic'], 'irrelevant', ['Genetic'], 'irrelevant', ['Personal History'], 'irrelevant', ['Personal History'], 'irrelevant', 'irrelevant', 'irrelevant', 'irrelevant', ['Gene Mutation', 'Genetic'], 'irrelevant', ['Genetic'], ['Gene Mutation'], ['Personal History'], 'irrelevant', 'irrelevant', ['Genetic'], 'irrelevant', 'irrelevant', 'irrelevant', 'irrelevant', 'irrelevant', ['Genetic'], ['Gene Mutation'], 'irrelevant']
{'Lifestyle': ['diet', 'smoking', 'lifestyle', 'nafld'], 'Gene Mutation': ['polyps', 'adenoma', 'lesions', 'diabetes', 'polymorphism', 'mutation', 'adenocarcinoma', 'obesity', 'obese'], 'Genetic': ['associated', 'gene', 'lynch-syndrome'], 'Aged': ['age', 'older'], 'Personal History': ['inflammatory-bowel', 'diverticulitis', 'in

# Generate data for trending topics graph

In [10]:
'''
Description: Generate the data for trending topics graph 
Parameter: a list of labelled topics
Return: a json object that stores the number of matched topics in the manual dictionary in the LDA model
'''
def data_for_graph(topics):
    counter = {
        "Lifestyle" : 0,
        "Gene Mutation" : 0,
        "Genetic" : 0,
        "Aged" : 0,
        "Personal History" : 0
    }

    #Count how many topics generated by LDA model is related to each label
    for item in topics:
        if item != "irrelevant":
            for topic in item:
                counter[topic] += 1
    
    return counter
counter = data_for_graph(topics)
print(counter)

{'Lifestyle': 3, 'Gene Mutation': 6, 'Genetic': 7, 'Aged': 1, 'Personal History': 5}


# Bulding the definition of each label

In [11]:
#Definition for each label is hardcoded and can be updated here
definition = {
    "Lifestyle" : "The things that a person usually do in his/her daily life effects the chances of developing colorectal cancer. The common things that determine whether a person has a healthy lifestyle is usually their diet, active or inactive lifestyle.",
    "Gene Mutation" : "Gene Mutation is differ from Genetic as it happen during a person's lifetime rather than having been inherited. The way of living (lifestyle) of a person is significant because it can impact the way genes express themselves as well.",
    "Genetic" : "Inheritance of bad genes from parent is also a risk factor of colorectal cancer. People who have the gene inherited from parent might or might not occur colorectal cancer. Colonoscopy is recommended to screen colorectal cancer by doctor if your father/mother was diagnosed with colorectal cancer.",
    "Aged" : "Colorectal cancer may occur at any age. However, the risk of developing colorectal cancar for people after the age of 45 is dramatically higher. The median age of a patient diagonsed with colorectal cancer is 68 and almost 95 percent of all colorectal cancer patients are 45 or older.",
    "Personal History" : "Medical background of a person determine the probabilty of diagnosed with colorectal cancer. People who previously diagnosed with disease related to colorectal cancer has a higher risk of developing colorectal cancer. Besides that, people who cured from colorectal cancer also have a higher risk of developing it again in the future."
}

# Saving files locally

In [12]:
'''
Description: Save words_in_topics, counter, definition variables into json file locally.
Parameter: -
Return: -
'''
def save_files():
    #Save the output into a json file in Amazon S3 which will be used in our website
    with open("topics_with_keywords.json","w") as fp:
        json.dump(words_in_topics,fp)

    #Save the counter and definition to Amazon S3 which will be used in our website
    with open("topic_statistic.json","w") as fp:
        json.dump(counter,fp)

    with open("definition.json","w") as fp:
        json.dump(definition,fp)

save_files()

## Checking the accuracy using perplexity and coherence

In [13]:
#A way to measure the accuracy of our LDA model, this is just for visualization of LDA model and not used as a feature in our software
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

#compute Perplexity
print("\nPerplexity: ", lda_model.log_perplexity(bow_corpus))

#compute Coherence Scpre 
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, corpus=bow_corpus, dictionary = dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model,bow_corpus,dictionary)
vis


Perplexity:  -11.028812458806087

Coherence Score:  0.46962881662007305


# Uploading neccessary files to Amazon S3

In [14]:
import boto3
from botocore.exceptions import NoCredentialsError

'''
Description: Uploading file to Amazon S3
Parameter: local filename, bucket name in S3, desired filename to save in S3
Return: True(Success) / False(Failed)
'''
def upload_to_aws(local_file, bucket, s3_file):
    s3 = boto3.client('s3', aws_access_key_id='',
                      aws_secret_access_key='')
    try:
        s3.upload_file(local_file, bucket, s3_file)
        print("Upload Successful")
        return True
    except FileNotFoundError:
        print("The file was not found")
        return False
    except NoCredentialsError:
        print("Credentials not available")
        return False

#The files name are specified here in a list. Can add more files here if required
update_file = ["definition.json","topic_statistic.json","topics_with_keywords.json"]
for i in range (len(update_file)):
    uploaded = upload_to_aws(update_file[i],'websitefyp',update_file[i])


  and should_run_async(code)
