In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article  
import csv 
import pandas as pd
import numpy as np

# Crawling News from Times of India Website

In [2]:
url = "https://timesofindia.indiatimes.com/world"
r = requests.get(url)

In [3]:
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.findAll('a', attrs = {'class':'w_img'})

In [4]:
news=[]
for row in table: 
    if not row['href'].startswith('http'):
        news.append('https://timesofindia.indiatimes.com'+row['href'])

In [5]:
df=[]
for i in news:
    article = Article(i, language="en")
    article.download() 
    article.parse() 
    article.nlp() 
    data={}
    data['Title']=article.title
    data['Text']=article.text
    data['Summary']=article.summary
    data['Keywords']=article.keywords
    df.append(data)

In [6]:
dataset=pd.DataFrame(df)
dataset.head()

Unnamed: 0,Title,Text,Summary,Keywords
0,Donald Trump urges states to consider opening ...,File pic of US President Donald Trump\n\nDownl...,File pic of US President Donald TrumpDownload ...,"[urges, donald, consider, school, reopening, o..."
1,Covid-19: Mayor entertains residents after imp...,"Apr 27, 2020, 08:27AM IST\n\nSource: AP\n\nThe...","Apr 27, 2020, 08:27AM ISTSource: APThe videos,...","[true, try, videos, kawakami, tshirt, entertai..."
2,UK PM Boris Johnson returns to work after reco...,"Apr 27, 2020, 08:40AM IST\n\nSource: Times Now...","Apr 27, 2020, 08:40AM ISTSource: Times NowBrit...","[coronavirus, times, uk, virus, worst, recover..."
3,"1 officer dead, 1 wounded in Louisiana shooting","Apr 27, 2020, 08:28AM IST\n\nSource: AP\n\nPol...","Apr 27, 2020, 08:28AM ISTSource: APPolice say ...","[louisiana, standoff, say, shooting, law, want..."
4,Covid-19: Lockdown reveals fresh and clean air...,"Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...","Apr 22, 2020, 04:03PM ISTSource: APIndia's ext...","[lockdown, average, stay, india, reveals, unpr..."


# Model for predicting virality of news

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [8]:
def clean_cols(data):
    """Clean the column names by stripping and lowercase."""
    clean_col_map = {x: x.lower().strip() for x in list(data)}
    return data.rename(index=str, columns=clean_col_map)

def TrainTestSplit(X, Y, R=0, test_size=0.2):
    """Easy Train Test Split call."""
    return train_test_split(X, Y, test_size=test_size, random_state=R)


In [9]:
full_data = clean_cols(pd.read_csv("OnlineNewsPopularity.csv"))
train_set, test_set = train_test_split(full_data, test_size=0.20, random_state=42)

x_train = train_set.drop(['url','shares', 'timedelta', 'lda_00','lda_01','lda_02','lda_03','lda_04','num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_train = train_set['shares']

x_test = test_set.drop(['url','shares', 'timedelta', 'num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_test = test_set['shares']

In [10]:
clf = RandomForestRegressor(random_state=42)
clf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [11]:
rf_res = pd.DataFrame(clf.predict(x_train),list(y_train))

In [12]:
rf_res.reset_index(level=0, inplace=True)
rf_res_df = rf_res.rename(index=str, columns={"index": "Actual shares", 0: "Predicted shares"})
rf_res_df.head()

Unnamed: 0,Actual shares,Predicted shares
0,16100,11247.11
1,508,882.22
2,1300,1864.97
3,3100,3627.22
4,6900,5415.02


# Converting Crawled News according to Training Set in UCI Dataset

In [13]:
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

In [14]:
def rate_unique(words):
    words=tokenize(words)
    no_order = list(set(words))
    rate_unique=len(no_order)/len(words)
    return rate_unique

In [15]:
def rate_nonstop(words):
    words=tokenize(words)
    filtered_sentence = [w for w in words if not w in stopwords]
    rate_nonstop=len(filtered_sentence)/len(words)
    no_order = list(set(filtered_sentence))
    rate_unique_nonstop=len(no_order)/len(words)
    return rate_nonstop,rate_unique_nonstop


In [16]:
def avg_token(words):
    words=tokenize(words)
    length=[]
    for i in words:
        length.append(len(i))
    return np.average(length)

In [17]:
from textblob import TextBlob

In [18]:
import datefinder
import datetime  
from datetime import date 
def day(article_text):
    article=article_text
    if len(list(datefinder.find_dates(article)))>0:
        date=str(list(datefinder.find_dates(article))[0])
        date=date.split()
        date=date[0]
        year, month, day = date.split('-')     
        day_name = datetime.date(int(year), int(month), int(day)) 
        return day_name.strftime("%A")
    return "Monday"

In [19]:
def tokenize(text):
    text=text
    return word_tokenize(text)

In [20]:
pos_words=[]
neg_words=[]
def polar(words):
    all_tokens=tokenize(words)
    for i in all_tokens:
        analysis=TextBlob(i)
        polarity=analysis.sentiment.polarity
        if polarity>0:
            pos_words.append(i)
        if polarity<0:
            neg_words.append(i)
    return pos_words,neg_words

In [21]:
def rates(words):
    words=polar(words)
    pos=words[0]
    neg=words[1]
    all_words=words
    global_rate_positive_words=(len(pos)/len(all_words))/100
    global_rate_negative_words=(len(neg)/len(all_words))/100
    pol_pos=[]
    pol_neg=[]
    for i in pos:
        analysis=TextBlob(i)
        pol_pos.append(analysis.sentiment.polarity)
        avg_positive_polarity=analysis.sentiment.polarity
    for j in neg:
        analysis2=TextBlob(j)
        pol_neg.append(analysis2.sentiment.polarity)
        avg_negative_polarity=analysis2.sentiment.polarity
    min_positive_polarity=min(pol_pos)
    max_positive_polarity=max(pol_pos)
    min_negative_polarity=min(pol_neg)
    max_negative_polarity=max(pol_neg)
    avg_positive_polarity=np.average(pol_pos)
    avg_negative_polarity=np.average(pol_neg)
    return global_rate_positive_words,global_rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity

In [22]:
df2=[]
for i in news:
    pred_info={}
    article = Article(i, language="en") # en for English 
    article.download() 
    article.parse()
    analysis=TextBlob(article.text)
    polarity=analysis.sentiment.polarity
    title_analysis=TextBlob(article.title)
    pred_info['text']=article.text
    pred_info['n_tokens_title']=len(tokenize(article.title))
    pred_info['n_tokens_content']=len(tokenize(article.text))
    pred_info['n_unique_tokens']=rate_unique(article.text)
    pred_info['n_non_stop_words']=rate_nonstop(article.text)[0]
    pred_info['n_non_stop_unique_tokens']=rate_nonstop(article.text)[1]
    pred_info['num_hrefs']=article.html.count("https://timesofindia.indiatimes.com")
    pred_info['num_imgs']=len(article.images)
    pred_info['num_videos']=len(article.movies)
    pred_info['average_token_length']=avg_token(article.text)
    pred_info['num_keywords']=len(article.keywords)
    
    if "life-style" in article.url:
        pred_info['data_channel_is_lifestyle']=1
    else:
        pred_info['data_channel_is_lifestyle']=0
    if "etimes" in article.url:
        pred_info['data_channel_is_entertainment']=1
    else:
        pred_info['data_channel_is_entertainment']=0
    if "business" in article.url:
        pred_info['data_channel_is_bus']=1
    else:
        pred_info['data_channel_is_bus']=0
    if "social media" or "facebook" or "whatsapp" in article.text.lower():
        data_channel_is_socmed=1
        data_channel_is_tech=0
        data_channel_is_world=0
    else:
        data_channel_is_socmed=0
    if ("technology" or "tech" in article.text.lower()) or ("technology" or "tech" in article.url):
        data_channel_is_tech=1
        data_channel_is_socmed=0
        data_channel_is_world=0
    else:
        data_channel_is_tech=0
    if "world" in article.url:
        data_channel_is_world=1
        data_channel_is_tech=0
        data_channel_is_socmed=0
    else:
        data_channel_is_world=0
        
    pred_info['data_channel_is_socmed']=data_channel_is_socmed
    pred_info['data_channel_is_tech']=data_channel_is_tech
    pred_info['data_channel_is_world']=data_channel_is_world
    
    if day(i)=="Monday":
        pred_info['weekday_is_monday']=1
    else:
        pred_info['weekday_is_monday']=0
    if day(i)=="Tuesday":
        pred_info['weekday_is_tuesday']=1
    else:
        pred_info['weekday_is_tuesday']=0
    if day(i)=="Wednesday":
        pred_info['weekday_is_wednesday']=1
    else:
        pred_info['weekday_is_wednesday']=0
    if day(i)=="Thursday":
        pred_info['weekday_is_thursday']=1
    else:
        pred_info['weekday_is_thursday']=0
    if day(i)=="Friday":
        pred_info['weekday_is_friday']=1
    else:
        pred_info['weekday_is_friday']=0
    if day(i)=="Saturday":
        pred_info['weekday_is_saturday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_saturday']=0
    if day(i)=="Sunday":
        pred_info['weekday_is_sunday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_sunday']=0
        pred_info['is_weekend']=0
        
    pred_info['global_subjectivity']=analysis.sentiment.subjectivity
    pred_info['global_sentiment_polarity']=analysis.sentiment.polarity
    pred_info['global_rate_positive_words']=rates(article.text)[0]
    pred_info['global_rate_negative_words']=rates(article.text)[1]
    pred_info['avg_positive_polarity']=rates(article.text)[2]
    pred_info['min_positive_polarity']=rates(article.text)[3]
    pred_info['max_positive_polarity']=rates(article.text)[4]
    pred_info['avg_negative_polarity']=rates(article.text)[5]
    pred_info['min_negative_polarity']=rates(article.text)[6]
    pred_info['max_negative_polarity']=rates(article.text)[7]    
    pred_info['title_subjectivity']=title_analysis.sentiment.subjectivity
    pred_info['title_sentiment_polarity']=title_analysis.sentiment.polarity
    df2.append(pred_info)

In [23]:
pred_df=pd.DataFrame(df2)
pred_test=pred_df.drop(['text'],axis=1)
pred_df.head()

Unnamed: 0,text,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_imgs,num_videos,average_token_length,...,global_rate_positive_words,global_rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity
0,File pic of US President Donald Trump\n\nDownl...,10,686,0.483965,0.664723,0.402332,233,13,0,4.491254,...,0.145,0.1,0.326375,0.033333,0.75,-0.273333,-1.0,-0.1,0.0,0.0
1,"Apr 27, 2020, 08:27AM IST\n\nSource: AP\n\nThe...",8,190,0.594737,0.589474,0.431579,185,9,0,3.726316,...,1.19,0.4,0.331676,0.033333,0.75,-0.273333,-1.0,-0.1,0.0,0.0
2,"Apr 27, 2020, 08:40AM IST\n\nSource: Times Now...",11,97,0.773196,0.71134,0.597938,185,9,0,4.505155,...,1.405,0.43,0.336463,0.033333,0.75,-0.304252,-1.0,-0.1,0.0,0.0
3,"Apr 27, 2020, 08:28AM IST\n\nSource: AP\n\nPol...",9,153,0.614379,0.69281,0.48366,185,9,0,4.326797,...,1.45,0.53,0.329657,0.033333,0.75,-0.306061,-1.0,-0.1,0.4,-0.2
4,"Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...",10,124,0.701613,0.766129,0.604839,185,9,0,4.443548,...,1.55,0.59,0.323873,0.033333,0.75,-0.294359,-1.0,-0.1,0.6,0.333333


# Final Results depicting the Likelihood of Virality of News


In [24]:
test2=pd.DataFrame(clf.predict(pred_test),pred_df['text'])
test2.reset_index(level=0, inplace=True)
test2 = test2.rename(index=str, columns={"index": "News", 0: "Virality"})
test2

Unnamed: 0,text,Virality
0,File pic of US President Donald Trump\n\nDownl...,17719.0
1,"Apr 27, 2020, 08:27AM IST\n\nSource: AP\n\nThe...",24735.88
2,"Apr 27, 2020, 08:40AM IST\n\nSource: Times Now...",26102.47
3,"Apr 27, 2020, 08:28AM IST\n\nSource: AP\n\nPol...",17375.77
4,"Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...",23125.49
5,"Apr 22, 2020, 03:58PM IST\n\nSource: AP\n\nIra...",22270.43
6,"Apr 22, 2020, 04:04PM IST\n\nSource: AP\n\nAn ...",18589.31
7,"Apr 22, 2020, 03:57PM IST\n\nSource: AP\n\nFou...",18904.23
8,"Apr 21, 2020, 09:25AM IST\n\nSource: Times Now...",26297.45
9,"Apr 21, 2020, 03:38PM IST\n\nSource: AP\n\nA s...",22494.99
