# News-Recommendation and Web Scraping

# Import Libraries

In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

# Extracting details of web-page

In [2]:
url = "https://www.indiatoday.in/india?page=1"

In [3]:
# Specify number of pages upto which I want to scrape articles.
noPages=5

In [4]:
Urllinks =[]
for i in range(1,noPages+1):
    print("Processing Page: ", i)
    url = "https://www.indiatoday.in/india?page="+str(i)+"/"
    Urllinks.append(url)

Processing Page:  1
Processing Page:  2
Processing Page:  3
Processing Page:  4
Processing Page:  5


In [5]:
Headlines = []
for i in Urllinks:
    data = get(i)
    soup = BeautifulSoup(data.content,'html.parser')
    for i in soup.find_all('div',class_="catagory-listing"):
        Headlines.append(i.text)
        
Headlines

[" Hindu groups call for bandh over A Raja's remarks against Sanatan Dharma, 18 detainedSeveral shops were shut in Mettupalayam, Ooty and Sathyamangalam in Tamil\nNadu after Hindu groups gave call for a bandh over DMK MP A Raja's remarks\nagainst Sanatan Dharma.",
 ' 2 arrested for links to ISIS in Karnataka’s ShivamoggaTwo men accused of being connected with the terrorist outfit ISIS were\narrested by Shivamogga Police in Karnataka on Tuesday. Police are also\nhunting for another person who is the main accused and is currently\non-the-run.',
 ' Sought time from Centre, but did not get: Delhi minister on plans to curb pollutionDelhi Environment Minister Gopal Rai said the government had sought time from\nthe Centre to discuss pollution in the NCR region but did not get it.',
 " 'Want Rahul Gandhi as party chief': Kerala Congress leaders hint at no support for Shashi TharoorKerala Congress leaders indicated that the state unit is only in favour of\nRahul Gandhi becoming the party chief.

In [6]:
len(Headlines)

60

# Cleaning The Data

In [7]:
p_art =[]
for i in Headlines:
    q = i.upper()
    import re
    q = re.sub("[^A-Z0-9 ]","",q)
    from nltk.stem import PorterStemmer
    tk_q = q.split(" ")
    sent = ""
    for j in tk_q:
        ps = PorterStemmer()
        sent = sent + " " + ps.stem(j).upper()
    p_art.append(sent)

In [8]:
p_art

['  HINDU GROUP CALL FOR BANDH OVER A RAJA REMARK AGAINST SANATAN DHARMA 18 DETAINEDSEVER SHOP WERE SHUT IN METTUPALAYAM OOTI AND SATHYAMANGALAM IN TAMILNADU AFTER HINDU GROUP GAVE CALL FOR A BANDH OVER DMK MP A RAJA REMARKSAGAINST SANATAN DHARMA',
 '  2 ARREST FOR LINK TO ISI IN KARNATAKA SHIVAMOGGATWO MEN ACCUS OF BE CONNECT WITH THE TERRORIST OUTFIT ISI WEREARREST BY SHIVAMOGGA POLIC IN KARNATAKA ON TUESDAY POLIC ARE ALSOHUNT FOR ANOTH PERSON WHO IS THE MAIN ACCUS AND IS CURRENTLYONTHERUN',
 '  SOUGHT TIME FROM CENTR BUT DID NOT GET DELHI MINIST ON PLAN TO CURB POLLUTIONDELHI ENVIRON MINIST GOPAL RAI SAID THE GOVERN HAD SOUGHT TIME FROMTH CENTR TO DISCUSS POLLUT IN THE NCR REGION BUT DID NOT GET IT',
 '  WANT RAHUL GANDHI AS PARTI CHIEF KERALA CONGRESS LEADER HINT AT NO SUPPORT FOR SHASHI THAROORKERALA CONGRESS LEADER INDIC THAT THE STATE UNIT IS ONLI IN FAVOUR OFRAHUL GANDHI BECOM THE PARTI CHIEF',
 '  RUPE RISE MARGIN TO CLOSE AT 7974 AGAINST US DOLLARTH RUPE END AT 7974 UP 7 PAIS

# Cluster for News-Recommendation 

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
A = tf.fit_transform(p_art).toarray()

In [10]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=5)
cl_res = km.fit(A)

In [11]:
cl_res.labels_

array([0, 2, 0, 1, 0, 0, 0, 0, 2, 1, 3, 0, 2, 4, 1, 1, 3, 3, 1, 2, 2, 1,
       2, 2, 3, 0, 1, 1, 1, 4, 4, 3, 2, 2, 4, 2, 3, 2, 0, 1, 2, 2, 1, 2,
       3, 0, 4, 0, 0, 2, 4, 2, 0, 0, 2, 1, 2, 2, 1, 3])

In [12]:
Q = pd.DataFrame(p_art,columns=["Article"])
Q['Cluster']=cl_res.labels_

In [13]:
Q.head(15)

Unnamed: 0,Article,Cluster
0,HINDU GROUP CALL FOR BANDH OVER A RAJA REMAR...,0
1,2 ARREST FOR LINK TO ISI IN KARNATAKA SHIVAM...,2
2,SOUGHT TIME FROM CENTR BUT DID NOT GET DELHI...,0
3,WANT RAHUL GANDHI AS PARTI CHIEF KERALA CONG...,1
4,RUPE RISE MARGIN TO CLOSE AT 7974 AGAINST US...,0
5,ACTOR ARMAAN KOHLI ARREST OVER DRUG CHARG GE...,0
6,BAMBIHA GANG OWN GANGSTER KILL IN RAJASTHAN ...,0
7,WILL RESTOR OLD PENSION SCHEME IN GUJARAT AR...,0
8,DELHI WOMEN PANEL SUMMON TWITTER INDIA POLIC...,2
9,CAN GANDHI NEUTRAL STAND ENSUR FREE AND FAIR...,1


# Categories for Clusters

In [14]:
E = {1 : "Technology",
2 : "Politics",
3 : "Entertainment",
4 : "Geopolotics",
0 : "Sport"}

In [15]:
R = []
for i in Q.Cluster:
    R.append(E[i])

Q['category'] = R

In [16]:
Q

Unnamed: 0,Article,Cluster,category
0,HINDU GROUP CALL FOR BANDH OVER A RAJA REMAR...,0,Sport
1,2 ARREST FOR LINK TO ISI IN KARNATAKA SHIVAM...,2,Politics
2,SOUGHT TIME FROM CENTR BUT DID NOT GET DELHI...,0,Sport
3,WANT RAHUL GANDHI AS PARTI CHIEF KERALA CONG...,1,Technology
4,RUPE RISE MARGIN TO CLOSE AT 7974 AGAINST US...,0,Sport
5,ACTOR ARMAAN KOHLI ARREST OVER DRUG CHARG GE...,0,Sport
6,BAMBIHA GANG OWN GANGSTER KILL IN RAJASTHAN ...,0,Sport
7,WILL RESTOR OLD PENSION SCHEME IN GUJARAT AR...,0,Sport
8,DELHI WOMEN PANEL SUMMON TWITTER INDIA POLIC...,2,Politics
9,CAN GANDHI NEUTRAL STAND ENSUR FREE AND FAIR...,1,Technology
