# **Importing Required Packages!**

In [1]:
# The Python itertools module is a collection of tools for working with iterators. An iterator yields its values one at a time and can be consumed by a for loop; iterables such as lists are the most common source of iterators.
import itertools
# The Natural Language Toolkit (NLTK) is a Python package for natural language processing.
import nltk
# Requests is a Python module that you can use to send all kinds of HTTP requests. It is an easy-to-use library with a lot of features ranging from passing parameters in URLs to sending custom headers and SSL Verification.
import requests
# Gensim is designed to handle large text collections using data streaming and incremental online algorithms.
import gensim
# bs4 — BeautifulSoup 4. Beautiful Soup is a Python library for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup
from nltk.stem.porter import *
# Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Assign News Website URLs**

In [0]:
# News sources to scrape. Each entry is
#   [page URL, HTML tag wrapping a headline, CSS class of that tag]
urls = [
    ['https://news.google.com', 'a', 'DY5T1d'],
    ['https://www.rediff.com/news', 'h2', 'hdtitle'],
    ['https://www.scoopwhoop.com/', 'a', 'article-title'],
    ['https://www.yahoo.com/news/', 'h3', 'Mb(5px)'],
    ['https://www.pinkvilla.com/', 'div', 'ypromoted'],
    ['https://www.buzzfeednews.com/', 'h2', 'newsblock-story-card__title'],
]

# **First Step:** Extract the Headlines from Viral News Websites

In [3]:
# Seconds to wait for a site to respond. Without a timeout, requests.get()
# can block forever on a stalled connection and hang the whole notebook.
REQUEST_TIMEOUT = 10

no_of_titles = 0
titles = []      # headline strings, in scrape order
title_url = []   # [site URL, headline] pairs, index-aligned with `titles`
print("\n\n\n   Extracting headlines of ",len(urls) ," news websites")
for site, tag, css_class in urls:
    # One unreachable or failing site must not abort the whole scrape:
    # report it and carry on with the remaining sources.
    try:
        response = requests.get(site, timeout=REQUEST_TIMEOUT)
        # Treat HTTP 4xx/5xx as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        print("\n\t\tWebsite : " , site, '\n\t\tSkipped, request failed : ', error)
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    no_headlines_per_website = 0
    for heading in soup.find_all(tag, class_=css_class):
        headline = heading.get_text().strip()
        # Elements with no visible text are not headlines; skip them.
        if not headline:
            continue
        titles.append(headline)
        title_url.append([site, headline])
        no_headlines_per_website += 1
    no_of_titles += no_headlines_per_website
    print("\n\t\tWebsite : " , site, '\n\t\tNo. of headlines collected : ', no_headlines_per_website)

print("\n   Total No. of headlines : ",no_of_titles)




   Extracting headlines of  6  news websites

		Website :  https://news.google.com 
		No. of headlines collected :  196

		Website :  https://www.rediff.com/news 
		No. of headlines collected :  25

		Website :  https://www.scoopwhoop.com/ 
		No. of headlines collected :  17

		Website :  https://www.yahoo.com/news/ 
		No. of headlines collected :  30

		Website :  https://www.pinkvilla.com/ 
		No. of headlines collected :  17

		Website :  https://www.buzzfeednews.com/ 
		No. of headlines collected :  35

   Total No. of headlines :  320


# **Second Step:** Preprocess the text and convert it to tokens

In [0]:
# Build the stemmer and the lemmatizer once and reuse them for every token,
# instead of constructing a new WordNetLemmatizer on each call.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Normalise a single token.

    The token is first lemmatised as a verb (``pos='v'``) with WordNet and
    the resulting lemma is then reduced to its Porter stem.

    Parameters
    ----------
    text : str
        One token.

    Returns
    -------
    str
        The stemmed lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))
def preprocess(text):
    """Turn a headline into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that
    are gensim stop words or have three characters or fewer are discarded,
    and every remaining token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

# One token list per headline, index-aligned with `titles`.
tokanized_titles = list(map(preprocess, titles))

# **Third Step:** Build the BoW and LDA models

In [5]:
# LDA training is stochastic; a fixed seed makes the topics reproducible
# for a given set of headlines.
RANDOM_SEED = 42

# Map every distinct token to an integer id, then encode each headline as a
# bag of words: a list of (token id, count) pairs.
dictionary = gensim.corpora.Dictionary(tokanized_titles)
corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in tokanized_titles]

num_topics = 15
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=4,
    # Explicit per-topic and per-term priors, all set to 0.01.
    alpha=[0.01] * num_topics,
    eta=[0.01] * len(dictionary),
    random_state=RANDOM_SEED,
)



  diff = np.log(self.expElogbeta)


# **Visualise the Results**

In [6]:
print('\n\n   Results :')

# The four highest-weighted words of every topic.
topic_words = [
    [word for word, _prob in words]
    for _topic_id, words in lda_model.show_topics(
        num_topics=num_topics, num_words=4, log=False, formatted=False
    )
]

# Build each headline's token set once rather than once per (topic, headline) pair.
title_token_sets = [set(tokens) for tokens in tokanized_titles]

# A headline counts as trending when it contains all top words of some topic.
index_list = []
for topic in topic_words:
    topic_set = set(topic)
    for i, tokens in enumerate(title_token_sets):
        if topic_set.issubset(tokens):
            index_list.append(i)

# Sort the distinct indices so headlines come out in scrape order.
trending = [title_url[i] for i in sorted(set(index_list))]

# Drop repeated [website, headline] pairs while keeping first-seen order.
# itertools.groupby would only collapse *adjacent* duplicates, so a headline
# that occurs twice with other matches in between would be printed twice.
trends = []
seen = set()
for article in trending:
    key = tuple(article)
    if key not in seen:
        seen.add(key)
        trends.append(article)

for article in trends:
    print('\n\t Headline : ',article[1],'\n\t Website : ',article[0])




   Results :

	 Headline :  Brazil's star justice minister Sérgio Moro resigns in blow to Jair Bolsonaro 
	 Website :  https://news.google.com

	 Headline :  Who Plays Magda on 'Penny Dreadful: City of Angels'? Natalie Dormer Portrays Showtime's Latest Supernatural Antagonist 
	 Website :  https://news.google.com

	 Headline :  In ‘Penny Dreadful: City of Angels,’ California Dreaming, Darkly 
	 Website :  https://news.google.com

	 Headline :  Penny Dreadful: City of Angels Review: Often Very Silly, Rarely Any Fun 
	 Website :  https://news.google.com

	 Headline :  'Penny Dreadful: City of Angels' Review 
	 Website :  https://news.google.com

	 Headline :  Natalie Dormer on Why She Wanted to Be Filmed from the ‘Worst’ Angle in ‘Penny Dreadful: City of Angels’ 
	 Website :  https://news.google.com

	 Headline :  Giant asteroid flying by Earth next week looks like it's wearing a face mask 
	 Website :  https://news.google.com

	 Headline :  Massive asteroid flying past Earth next week