In [1]:
import re # To use regex operations
# from time import sleep
from bs4 import BeautifulSoup #To parse response text using the html parser
import requests #To make request from the web

In [2]:
import nltk
nltk.download('stopwords', quiet=True) #stopwords
nltk.download('wordnet', quiet=True) #
from newspaper import Article #To read the content of a webpage

In [3]:
import gensim #Gensim is an open-source library for unsupervised topic modeling and natural language processing
from gensim.models.ldamulticore import LdaMulticore # for multipreprocessing
from gensim import corpora, models #
from gensim.corpora import Dictionary
import pyLDAvis.gensim #for visualization

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer


# from itertools import chain

In [4]:
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings('ignore')

  and should_run_async(code)


In [165]:
# We are using Yahoo Search Engine for WebScrapping
template = 'https://in.search.yahoo.com/search;_ylt=AwrwIQa9FFJgmlQA7hq7HAx.;_ylu=Y29sbwNzZzMEcG9zAzEEdnRpZAMEc2VjA3BpdnM-?p={}'

In [183]:
query = input("Enter your main query : ")
url = template.format(query)

Enter ypur main query : chanda india


In [167]:
# To make the get request, we first set the request headers
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

In [168]:
response = requests.get(url, headers=headers)

In [169]:
soup = BeautifulSoup(response.text, 'html.parser')

In [170]:
# cards = soup.find_all('div', 'NewsArticle')
cards = soup.find_all('div', 'algo')

In [171]:
# All the articles result currently in the html form
len(cards)

9

In [189]:
#Function to get the headline of the article and its url
def get_article(card):
    '''Extract article information from the raw html'''
#     headline = card.find('h4', 's-title').text
    headline = card.find('h3', 'title').text
    raw_link = card.find('a').get('href')
    unquoted_link = requests.utils.unquote(raw_link)
    pattern = re.compile(r'RU=(.+)\/RK')
    clean_link = re.search(pattern, unquoted_link).group(1)
    article = [headline, clean_link]
    return article

In [190]:
# Storing the headline and url for every article on the first page in the articles list
articles = []
links = set()
for card in cards:
    article = get_article(card)
    link = article[-1]
    if not link in links:
        links.add(link)
        articles.append(article)

In [175]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [176]:
class top_article:
    def __init__(self, article_name, article_link):
        '''To set article name and article link for every article'''
        self.article_name = article_name
        self.article_link = article_link

    def get_uncleaned_webcontent(self):
        '''To get the cleaned web article from all the web articles'''
        
        article = Article(self.article_link)
        
        try :
            article.download()
            article.parse()
            nltk.download('punkt', quiet = True)
            article.nlp()
            
            if len(article.text) > 100:
                '''Taking only the articles with total text length greater than 100'''
                self.article_content = article.text
            else:
                self.article_content = "It is invalid. Go for next"
            
        except:
            '''Exception if the article does not have permission to get scrapped'''
            self.article_content = "It is invalid. Go for next"
        
        
    def clean(self):
        '''
        Splitting the data
        Removing the punctuations
        Lemmatization : making sure that words like spectacle/spectacles are considered same
        '''
        stop_free = ' '.join([word for word in self.article_content.lower().split() if word not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
        self.article_cleaned =  normalized.split()

In [163]:
def make_dictionary(bag_of_words):
    '''Make a single dictionary of the words from top 5 articles'''
    dictionary = corpora.Dictionary(bag_of_words)
    return dictionary

def make_doc_term_matrix(dictionary, bag_of_words):
    '''
    A matrix which stores the occurence and frequency of each word from the bag of words
    As we have 5 articles, this matrix will have 5 rows
    '''
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in bag_of_words]
    return doc_term_matrix

def build_lda_model(dictionary, doc_term_matrix, lda, num_topics):
    '''Training the model by passing the hyperparameters'''
    lda_model = lda(doc_term_matrix, num_topics = num_topics, id2word=dictionary, passes=50, minimum_probability=0)
    return lda_model

def print_topic_clusters(lda_model, num_topics):
    '''
    Print the clusters of selected number of topics
    Each cluster shows the relevant keywords associated with that cluster using which we can determine the topic of that cluster
    '''
    print('\033[1m')
    print("\nKeywords/Topics/Tags for the articles\n")
    print('\033[0m')
    keywords_and_probs = lda_model.print_topics(num_topics=num_topics)
    keywords_only = []
    for i in keywords_and_probs:
        '''Original output contains the probability along with the keywords. Here we are trying to extract only the keywords'''
        keywords_only.append(re.findall(r'"(.*?)"', i[1]))
    for i, cluster_keywords in enumerate(keywords_only):
        '''We print the cluster number and the keywords associated with that cluster'''
        print("Cluster",i+1,cluster_keywords)

def build_lda_display(lda_model, doc_term_matrix, dictionary):
    '''Here we build the lda_display object which helps us to visualize our results'''
    lda_display = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary, sort_topics = False, mds='mmds')
    return lda_display

In [192]:
top_articles = []
for article in articles:
    '''Getting all articles obtained from the search engine. This articles are obtained on the 1st page'''
    top_articles.append(top_article(article[0], article[1]))

# Initializing the model object
lda = gensim.models.ldamodel.LdaModel


# User input for the number of topics would we like to achieve
num_topics = int(input("Enter the number of topics we want to get(recommended less than 5) :  "))
print("\nPlease wait...")


count = 0
valid_top_articles = []
for num, article in enumerate(top_articles):
    if count >= 5:
        # Inorder to get the top 5 articles
        break
    article.get_uncleaned_webcontent() #Getting the data for every top 5 article
    if article.article_content == "It is invalid. Go for next":
        continue
    count = count + 1
    article.clean() #Cleaning the the data of every top 5 articles
    valid_top_articles.append(article)

print("\nTop 5 articles are....")
bag_of_words = []
for num, article in enumerate(valid_top_articles):
    '''Printing the url for top 5 articles and creating a bag of words from all of them'''
    print(article.article_link)
    bag_of_words.append(article.article_cleaned)

dictionary = make_dictionary(bag_of_words) #Making a dcitionary from the bag of words
doc_term_matrix = make_doc_term_matrix(dictionary, bag_of_words) #creating a matrix

print("\nBuilding the clusters of topics....")
lda_model = build_lda_model(dictionary, doc_term_matrix, lda, num_topics) #training the model

print_topic_clusters(lda_model, num_topics) #printing the cluster results
lda_display = build_lda_display(lda_model, doc_term_matrix, dictionary) #building the object capable to show visualization of the results

Enter the number of topics we want to get(recommended less than 5) :  3

Please wait...

Top 5 articles are....
https://en.wikipedia.org/wiki/Komodo_dragon
https://www.britannica.com/animal/Komodo-dragon
https://www.nationalgeographic.com/animals/reptiles/facts/komodo-dragon
https://animalcorner.org/animals/komodo-dragon/
https://theculturetrip.com/asia/indonesia/articles/11-facts-komodo-dragon-indonesias-national-animal/

Building the clusters of topics....




[1m

Keywords/Topics/Tags for the articles

[0m
Cluster 1 ['komodo', 'dragon', '–', 'foot', 'lizard', 'large', 'also', '3', 'eat', 'able']
Cluster 2 ['komodo', 'dragon', 'komodos', 'female', 'island', 'also', 'year', 'male', 'foot', 'venom']
Cluster 3 ['komodo', 'dragon', 'may', 'island', 'egg', 'prey', 'male', 'human', 'two', 'zoo']


In [193]:
pyLDAvis.display(lda_display) #Visualization of the results(works on local jupyter notebook).   It may not be visible over github.

##### 1. Which ML/DL model architecture you used and why?
- I am using webscrapping by utilising the yahoo's search engine to get the most relevant articles for the given query.
- I have used LDA i.e. <b>Latent Dirichlet Model</b> to train my model primarily because it is currently one of the most popular topic modelling technique.
- There are other techniques as well such Term Frequency and Inverse Document Frequency, NonNegative Matrix Factorization techniques which I haven't used for now.

##### 2. How would you ensure the scalability of your solution?
- j

##### 3. Is there a need for any dataset? If yes then how much data is sufficient to train the model in order to get the required results?
- We are scrapping the top query relevant articles from the web. So, the result of this webscrapping would be multiple web articles out of which we have to ensure few parameters as follows:
    - articles text size
    - number of unique words in each article
    - total vocabulary size from all the articles
    - Permission to scrap the website
- Once we make sure that we satisfy the above parameters then we are good to select our top five required articles.

##### 4. Is there a need to create manual datasets, if yes then what parameters and sample size did you consider to create a dataset? 
- No.
- If we use potentially strong search engine then we will always get enough number of articles.
- Even if the articles on the first page are not sufficient, still we can move over to the next page to webscrap further articles.

##### 5. Is your model and dataset generalized enough for different domains of the use cases, How?
- Yes. As discussed in the point two, I have considered the parameters sensitive for the possible test cases.
- Also the model I have chosed is LDA which is currently the most popular in topic modelling.

##### 6. How would you train, test and deploy your model to production? 
- One of the production effiecint technique which I know is by using GCP's DataFlow Pipeline and AI Platform's tf.estimator API both of which allows us the use of multiple GPUs.
- Using DataFlow pipeline we can have even terabytes of data in our articles which will be converted into tfrecords and get stored on the cloud storage.
- Once the tfrecords are obtained, we can use AI Platform which allows us to perform 3 operations on the dataset anytime. i.e. i) training ii) evaluation iii) prediction
- On AI Platform, we can deploy our model and then our code will run in the prediction mode as per the tf estimator API.

##### 7. How would you perform hyperparameter tuning on your model to improve accuracy?
- I have provided the solution with the visualization, I have observed that the topics don't get overlap mostly when I set 3 topics.
- Possible reason for this could be a less dataset that is getting scrapped becasue we are dealling with only top 5 articles.
- Had been the case that we were using more articles, we would have got the freedom to get the results from even more topics.

##### 8. Anything else you want to let us know about your approach.
- j

##### <hr>
#### Model/Code on GitHub repo or Colab Notebook with the necessary documentation describing the model functioning.
- Code is push on my github repo on this url : https://github.com/vishalw-iitk/ML_new/blob/master/CareerNinja/vishal_w.ipynb

##### <hr>
#### Only Approach to generate different types of questions (short answer type, MCQs, true/false, fill in the blanks, long answer type, etc.) for that same article.
- We can generate questions using <b>allennlp</b> for all the articles