In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
pd.options.mode.chained_assignment = None 
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
from urllib.parse import urlparse
import requests
import re

In [2]:
from transformers import pipeline

In [3]:
# Load the news table. The file is tab-separated and is read without a header
# row, so the eight column names are supplied explicitly.
news = pd.read_csv(
    'data/dev/news.tsv',
    sep='\t',
    header=None,
    names=['id', 'category', 'subcategory', 'title', 'abstract', 'url',
           'title_entities', 'abstract_entities'],
)

In [4]:
# news['category'].unique()
# categories = ['world', 'regional', 'technology', 'business', 'environment', 'economics', 'finance', 'sports',
#             'science', 'entertainment', 'health', 'future', 'politics', 'history', 'art', 'culture']

In [5]:
# tech_news = news[news['subcategory'].isin([
#     'finance-technology', 'autoscartech', 'technologyinvesting', 'newstechnology', 'newsscienceandtechnology',
#     'shop-computers-electronics', 'autoshybrids', 'autosreview', 'autosluxury', 'autosclassics', 'autossuvs'
#     ])]

In [6]:
# news['category'].unique()

In [7]:
# Keep only the rows whose category is 'news' and add an empty 'text' column
# (the commented-out scraping loop below fills it with page text).
# .assign() returns a new DataFrame, so the column is added to an independent
# frame instead of to a slice of `news`; writing into the slice is chained
# assignment, which pandas warns about and which is not guaranteed to stick.
subset_news = news[news['category'] == 'news'].assign(text="")

In [23]:
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is treated as hidden when its parent tag is one of the containers
    listed below, or when the node itself is a ``Comment``.

    Args:
        element: a text node exposing ``.parent.name``.

    Returns:
        bool: False for hidden nodes, True otherwise.
    """
    hidden_parent = element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']
    # `or` short-circuits, so the Comment check only runs for nodes whose
    # parent is not in the hidden list.
    return not (hidden_parent or isinstance(element, Comment))

def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse with the 'html.parser' backend.

    Returns:
        str: every text node accepted by ``tag_visible``, stripped of
        surrounding whitespace and joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def process_url(url):
    """Download a page and return its visible text.

    Args:
        url: address passed unchanged to ``urllib.request.urlopen``.

    Returns:
        str: the result of ``text_from_html`` applied to the response body.

    Raises:
        Whatever ``urlopen`` / ``read`` raise on network or HTTP failure.
    """
    # The context manager closes the HTTP response even if read() fails,
    # so sockets are not leaked when this is called for many URLs.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    return text_from_html(html)

In [9]:
# for index, url in enumerate(tqdm(subset_news['url'])):
#     if subset_news['text'].iloc[index] == '':
#         if urlparse(url).scheme:
#             subset_news['text'].iloc[index] = process_url(url)

Run sentiment analysis on all articles

Take one article, look at its sentiment, and recommend a similar article (content-wise, by category or NLP) with the opposite sentiment.

In [10]:
# subset_news.to_csv("withText.csv")

In [11]:
# Replace the in-memory table with a previously saved CSV.
# NOTE(review): presumably this file was produced by the commented-out
# to_csv('withPredictions.csv') call further down — confirm it exists
# before a Restart & Run All.
subset_news = pd.read_csv('withPredictions.csv')

In [12]:
# subset_news

In [13]:
# Zero-shot classifier used below to label each article.
# The checkpoint is named explicitly: without `model=` the pipeline falls back
# to a library default, which can differ between installations. This is the
# checkpoint that the recorded output of this cell reports loading.
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [14]:
# Labels offered to the zero-shot classifier; each article gets whichever
# of the two scores highest.
candidate_labels = ['positive', 'negative']

In [15]:
# classifier(subset_news['text'].iloc[0], candidate_labels)

In [16]:
# Classify every article and store the winning label in the 'sentiment' column.
predicted_sentiments = []
for text in tqdm(subset_news['text']):
    res = classifier(text, candidate_labels)
    # "scores" and "labels" are index-aligned; pick the label with the top score.
    best_index = int(np.argmax(res["scores"]))
    predicted_sentiments.append(res["labels"][best_index])

# Assign the whole column in one step. Writing through
# subset_news['sentiment'].iloc[i] is chained assignment (it may update a
# temporary copy) and also required the column to exist beforehand.
subset_news['sentiment'] = predicted_sentiments

In [17]:
# subset_news.to_csv('withPredictions.csv')    

Part of EDA:
For a given article, get its category and sentiment.

Get all articles from that category with opposite sentiment.

In [18]:
def recommendArticlesOfOppositeSentiment(articles, index):
    """Select articles sharing a subcategory but not a sentiment with one row.

    Args:
        articles: DataFrame with 'subcategory' and 'sentiment' columns.
        index: positional (``iloc``) index of the reference article.

    Returns:
        DataFrame: the rows of ``articles`` whose 'subcategory' equals the
        reference row's and whose 'sentiment' differs from it.
    """
    reference_subcategory = articles['subcategory'].iloc[index]
    reference_sentiment = articles['sentiment'].iloc[index]

    sentiment_differs = articles['sentiment'] != reference_sentiment
    subcategory_matches = articles['subcategory'] == reference_subcategory
    return articles[sentiment_differs & subcategory_matches]

In [19]:
# recommendArticlesOfOppositeSentiment(subset_news, 20)

In [20]:
def preprocess(raw_text):
    """Reduce a text to its unique, lower-cased, non-stopword words.

    Args:
        raw_text: the text to tokenize. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) yield an empty list.

    Returns:
        list[str]: distinct words in order of first appearance, with every
        non-letter character treated as a separator and English stopwords
        removed.
    """
    # Cells read back from CSV can be NaN (a float); re.sub would raise on it.
    if not isinstance(raw_text, str):
        return []

    # keep only letters; everything else becomes a separator
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))

    # dict.fromkeys de-duplicates while preserving first-occurrence order;
    # list(set(...)) returned the words in an order that varies between runs.
    return list(dict.fromkeys(w for w in words if w not in stopword_set))

In [21]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vangelistrikoupis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
gloveFile = "glove.6B.50d.txt"
def loadGloveModel(gloveFile):
    """Load word vectors from a whitespace-separated embedding text file.

    Each non-blank line is expected to hold a token followed by its float
    components.

    Args:
        gloveFile: path of the UTF-8 encoded embedding file.

    Returns:
        dict[str, np.ndarray]: token -> 1-D float array of its components.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    model = {}
    with open(gloveFile, encoding="utf8") as f:
        # Iterate the file lazily instead of readlines(), so the raw text of
        # the whole file is never held in memory next to the parsed vectors.
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # A blank line (e.g. trailing newline) has no token to index.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    return model

In [23]:
# Module-level word -> vector dict; cosine_distance_wordembedding reads it
# as a global.
model = loadGloveModel(gloveFile)

Loading Glove Model
Done. 400000  words loaded!


In [24]:
def cosine_distance_wordembedding(s1, s2):
    """Return the cosine similarity of two texts as a percentage.

    Each text is tokenized with ``preprocess``; the vectors of the words
    found in the global ``model`` are averaged, and the two averages are
    compared. Despite the function's name the result is a similarity:
    ``round((1 - cosine_distance) * 100, 2)``.

    Args:
        s1: first text.
        s2: second text.

    Returns:
        float: similarity percentage, or 0.0 when either text contains no
        word present in ``model`` (there is nothing to compare).
    """
    # Import the function from its submodule: a bare `import scipy` is not
    # guaranteed to make `scipy.spatial` available as an attribute.
    from scipy.spatial.distance import cosine

    vectors_1 = [model[word] for word in preprocess(s1) if word in model]
    vectors_2 = [model[word] for word in preprocess(s2) if word in model]
    # np.mean of an empty list is NaN, so bail out before averaging.
    if not vectors_1 or not vectors_2:
        return 0.0

    distance = cosine(np.mean(vectors_1, axis=0), np.mean(vectors_2, axis=0))
    return round((1 - distance) * 100, 2)

In [25]:
# cosine_distance_wordembedding(subset_news['text'].iloc[0],subset_news['text'].iloc[10])

In [3]:
# Replace the in-memory table with a previously saved CSV.
# NOTE(review): no cell in this notebook writes withRecArticle.csv —
# confirm where it comes from before a Restart & Run All.
subset_news = pd.read_csv("withRecArticle.csv")

In [27]:
# Start every row with recommendation 0; the matching loop below overwrites
# it when a sufficiently similar article is found.
# NOTE(review): 0 is also a valid row position, so rows with no match are
# indistinguishable from rows matched to article 0 — consider a sentinel.
subset_news["recommended_article"] = 0

In [4]:
# For each article, record the position of the first *other* article whose
# similarity score exceeds 97.
# NOTE(review): this is O(n^2) calls, each re-tokenizing both texts —
# consider precomputing one mean vector per article.
article_texts = subset_news['text'].tolist()
rec_column = subset_news.columns.get_loc('recommended_article')
for index, item in enumerate(tqdm(article_texts)):
    # Enumerate the full list and skip the article itself, so new_index is a
    # position in subset_news. Enumerating a copy with the current row
    # removed shifted every later position down by one.
    for new_index, new_item in enumerate(article_texts):
        if new_index == index:
            continue
        similarity = cosine_distance_wordembedding(item, new_item)
        if similarity > 97:
            # Single positional write; subset_news[col].iloc[i] = ... is
            # chained assignment and may update a temporary copy.
            subset_news.iloc[index, rec_column] = new_index
            break

  0%|          | 0/13043 [00:00<?, ?it/s]

NameError: name 'cosine_distance_wordembedding_method' is not defined

In [6]:
# Add a boolean 'keep' flag, initially False for every row.
subset_news["keep"] = False

In [7]:
# Flag each row by whether its recommended article has the same sentiment
# label. recommended_article is used as a row position.
# NOTE(review): the notes above describe recommending an article with the
# *opposite* sentiment, yet rows are kept when the sentiments are EQUAL —
# confirm whether this comparison should be `!=`.
sentiments = subset_news['sentiment'].tolist()
# Build the whole column and assign it once; subset_news["keep"].iloc[i] = ...
# is chained assignment and may update a temporary copy.
subset_news['keep'] = [
    sentiments[index] == sentiments[rec_article]
    for index, rec_article in enumerate(subset_news['recommended_article'])
]

In [13]:
# Retain only the rows whose 'keep' flag equals True.
final_df = subset_news.loc[subset_news["keep"].eq(True)]

In [14]:
final_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,text,sentiment,recommended_article,keep
2,2,2,9,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,https://assets.msn.com/labs/mind/AAJ43pw.html,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI...",Chile: Three die in supermarket fire am...,negative,2011,True
3,3,3,22,N9786,news,newspolitics,Elijah Cummings to lie in state at US Capitol ...,"Cummings, a Democrat whose district included s...",https://assets.msn.com/labs/mind/AAJgNxm.html,"[{""Label"": ""Elijah Cummings"", ""Type"": ""P"", ""Wi...","[{""Label"": ""Elijah Cummings"", ""Type"": ""P"", ""Wi...",Elijah Cummings to lie in state at US C...,positive,9,True
5,5,5,36,N43620,news,newsworld,"Without help from US, UN climate fund struggle...",Rich countries gathered Thursday in France to ...,https://assets.msn.com/labs/mind/AAJgO34.html,"[{""Label"": ""United Nations"", ""Type"": ""O"", ""Wik...","[{""Label"": ""France"", ""Type"": ""G"", ""WikidataId""...","Without help from US, UN climate fund s...",negative,232,True
6,6,6,39,N61409,news,newsscienceandtechnology,Netflix hackers: Why they steal passwords,Netflix accounts don't really offer the sellin...,https://assets.msn.com/labs/mind/AAGBXF6.html,"[{""Label"": ""Netflix"", ""Type"": ""O"", ""WikidataId...","[{""Label"": ""Netflix"", ""Type"": ""O"", ""WikidataId...",Netflix hackers: Why they steal passwor...,negative,95,True
11,11,11,73,N21802,news,newspolitics,The Democratic candidates who want to face Trump,With Bill de Blasio's Sept. 19 decision to dro...,https://assets.msn.com/labs/mind/AAEmD6T.html,"[{""Label"": ""2020 Democratic Party presidential...","[{""Label"": ""2020 Democratic Party presidential...",The Democratic candidates who want to f...,negative,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13037,13037,13037,42399,N36658,news,newsus,Apartments for rent in Minneapolis: What will ...,Curious just how far your dollar goes in Minne...,https://assets.msn.com/labs/mind/BBWscWw.html,"[{""Label"": ""Minneapolis"", ""Type"": ""G"", ""Wikida...","[{""Label"": ""Minneapolis"", ""Type"": ""G"", ""Wikida...",Apartments for rent in Minneapolis: What...,positive,62,True
13038,13038,13038,42400,N32558,news,elections-2020-us,Trump campaign launching black outreach effort...,"WASHINGTON (AP) During the 2016 campaign, ca...",https://assets.msn.com/labs/mind/BBWsd7A.html,"[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...",Trump campaign launching black outreach ...,negative,4,True
13039,13039,13039,42401,N12470,news,newscrime,3 teens shot in Northeast DC Thursday,"Thursday was a violent night in the District, ...",https://assets.msn.com/labs/mind/BBWse0x.html,"[{""Label"": ""Northeast (Washington, D.C.)"", ""Ty...","[{""Label"": ""Metropolitan Police Department of ...",3 teens shot in Northeast DC Thursday ...,negative,16,True
13040,13040,13040,42402,N25642,news,newspolitics,Texas custody battle fuels debate over transge...,A custody battle between two parents has led t...,https://assets.msn.com/labs/mind/BBWseUG.html,"[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Ted Cruz"", ""Type"": ""P"", ""WikidataI...","img class=""image spi...",negative,878,True


In [15]:
# Give the 'Unnamed: 0' column a meaningful name.
# NOTE(review): presumably this is a row index written by an earlier
# to_csv() without index=False — confirm it really identifies the article.
final_df = final_df.rename(columns={'Unnamed: 0': 'article_id'})

In [20]:
# Drop the remaining 'Unnamed: …' columns.
# errors="ignore" keeps the cell re-runnable: once the columns are gone, a
# second execution would otherwise raise KeyError.
final_df = final_df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0.1.1'], errors="ignore")

In [22]:
final_df

Unnamed: 0,article_id,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,text,sentiment,recommended_article,keep
2,2,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,https://assets.msn.com/labs/mind/AAJ43pw.html,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI...",Chile: Three die in supermarket fire am...,negative,2011,True
3,3,N9786,news,newspolitics,Elijah Cummings to lie in state at US Capitol ...,"Cummings, a Democrat whose district included s...",https://assets.msn.com/labs/mind/AAJgNxm.html,"[{""Label"": ""Elijah Cummings"", ""Type"": ""P"", ""Wi...","[{""Label"": ""Elijah Cummings"", ""Type"": ""P"", ""Wi...",Elijah Cummings to lie in state at US C...,positive,9,True
5,5,N43620,news,newsworld,"Without help from US, UN climate fund struggle...",Rich countries gathered Thursday in France to ...,https://assets.msn.com/labs/mind/AAJgO34.html,"[{""Label"": ""United Nations"", ""Type"": ""O"", ""Wik...","[{""Label"": ""France"", ""Type"": ""G"", ""WikidataId""...","Without help from US, UN climate fund s...",negative,232,True
6,6,N61409,news,newsscienceandtechnology,Netflix hackers: Why they steal passwords,Netflix accounts don't really offer the sellin...,https://assets.msn.com/labs/mind/AAGBXF6.html,"[{""Label"": ""Netflix"", ""Type"": ""O"", ""WikidataId...","[{""Label"": ""Netflix"", ""Type"": ""O"", ""WikidataId...",Netflix hackers: Why they steal passwor...,negative,95,True
11,11,N21802,news,newspolitics,The Democratic candidates who want to face Trump,With Bill de Blasio's Sept. 19 decision to dro...,https://assets.msn.com/labs/mind/AAEmD6T.html,"[{""Label"": ""2020 Democratic Party presidential...","[{""Label"": ""2020 Democratic Party presidential...",The Democratic candidates who want to f...,negative,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13037,13037,N36658,news,newsus,Apartments for rent in Minneapolis: What will ...,Curious just how far your dollar goes in Minne...,https://assets.msn.com/labs/mind/BBWscWw.html,"[{""Label"": ""Minneapolis"", ""Type"": ""G"", ""Wikida...","[{""Label"": ""Minneapolis"", ""Type"": ""G"", ""Wikida...",Apartments for rent in Minneapolis: What...,positive,62,True
13038,13038,N32558,news,elections-2020-us,Trump campaign launching black outreach effort...,"WASHINGTON (AP) During the 2016 campaign, ca...",https://assets.msn.com/labs/mind/BBWsd7A.html,"[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...",Trump campaign launching black outreach ...,negative,4,True
13039,13039,N12470,news,newscrime,3 teens shot in Northeast DC Thursday,"Thursday was a violent night in the District, ...",https://assets.msn.com/labs/mind/BBWse0x.html,"[{""Label"": ""Northeast (Washington, D.C.)"", ""Ty...","[{""Label"": ""Metropolitan Police Department of ...",3 teens shot in Northeast DC Thursday ...,negative,16,True
13040,13040,N25642,news,newspolitics,Texas custody battle fuels debate over transge...,A custody battle between two parents has led t...,https://assets.msn.com/labs/mind/BBWseUG.html,"[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Ted Cruz"", ""Type"": ""P"", ""WikidataI...","img class=""image spi...",negative,878,True
