## Imports

In [1]:
import nltk
# Fetch the VADER lexicon into the local nltk_data directory (reported as already
# up-to-date when present); presumably required by the SentimentIntensityAnalyzer
# imported from nltk.sentiment.vader below.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer # For sentiment analysis
import cPickle as pickle # For loaded dataset from pickle file
import tqdm # Progress bar
from collections import Counter # Handy addon
from pprint import pprint # Useful to print JSON objects
import numpy as np

## Load the dataset of articles with introductions 

In [5]:
# This loads the file that you want, might take several seconds (up to a minute)

with open("news_sentiment.pickle", "r") as f:
    articles = pickle.load(f)
print len(articles), "articles were loaded"
print "Example article:"
pprint(articles[1040])


57767 articles were loaded
Example article:
{u'introductions': [{u'person': u'Bashar al-Assad',
                     u'text': u'President',
                     u'wdid': u'Q44329'},
                    {u'person': u'Emile Hokayem',
                     u'text': u'in Foreign Policy'},
                    {u'person': u'Ahrar al Sham',
                     u'text': u'the most important groups',
                     u'wdid': u'Q860943'},
                    {u'person': u'Vladimir Putin',
                     u'text': u'Russian President',
                     u'wdid': u'Q7747'},
                    {u'person': u'Barack Obama',
                     u'text': u'U.S. President',
                     u'wdid': u'Q76'},
                    {u'person': u'Osama Abu Zeid',
                     u'text': u'a senior adviser to the moderate Free Syrian Army'},
                    {u'person': u'Op-Ed',
                     u'text': u'for The Washington Post',
                     u'wdid': u'Q2602337'},
 

In [6]:
# separate articles from the two stories
ISIS_articles = []
Brexit_articles = []
for a in articles:
    if a["news_topic"] == 'ISIS War':
        ISIS_articles.append(a)
    else:
        Brexit_articles.append(a)
        
print len(ISIS_articles), " articles from ISIS War and ", len(Brexit_articles), "articles from Brexit were loaded"

39206  articles from ISIS War and  18561 articles from Brexit were loaded


In [7]:
# get only articles from one story, you can change this (e.g. to Brexit_articles).
# Note: this rebinds `articles`, so the full dataset is no longer reachable under
# that name once this cell has run.
articles = ISIS_articles

## Extract introductions, and obtain their sentiment

In [8]:
analyzer = SentimentIntensityAnalyzer()


def _introductions_with_source(article):
    """Yield each introduction of `article`, tagged in place with the article's source."""
    for mention in article.get('introductions', []):
        mention['source'] = article['source']
        yield mention


# Flatten the introductions of every article into a single list
total_introductions = []
for article in articles:
    total_introductions.extend(_introductions_with_source(article))

# Score each introduction text and keep only the compound polarity score
for mention in tqdm.tqdm_notebook(total_introductions):
    scores = analyzer.polarity_scores(mention['text'])
    mention['sentiment'] = scores['compound']

Widget Javascript not detected.  It may not be installed properly.





In [14]:
# Example some sentiment for some of the introductions

subsample = np.random.choice(total_introductions, 100)
for intro in subsample:
    if intro['sentiment'] != 0:
        print "---------------"
        print "Entity mentionned:", intro['person']
        print intro['text']
        print "Sentiment:", intro['sentiment']

---------------
Entity mentionned: Ash Carter
the U.S. defense secretary
Sentiment: 0.128
---------------
Entity mentionned: Saad Hariri
resigned Lebanese Prime Minister
Sentiment: -0.25
---------------
Entity mentionned: Putin
fundamentally misunderstood
Sentiment: -0.34
---------------
Entity mentionned: Steven Mnuchin
US Treasury Secretary
Sentiment: 0.2023
---------------
Entity mentionned: Sergei Shoigu
Russian Defense Minister
Sentiment: 0.128
---------------
Entity mentionned: Ahrar al-Sham
the Islamist group , as well as the Suqour al - Sham and Failaq al - Sham insurgent groups
Sentiment: 0.2732
---------------
Entity mentionned: Stephen Townsend
anti - IS coalition forces
Sentiment: -0.3182
---------------
Entity mentionned: Sarac Ozdemir
a 43-year - old shopkeeper who supports the AKP
Sentiment: 0.3612
---------------
Entity mentionned: Gebran Bassil
Lebanon 's Minister of Energy and Water
Sentiment: 0.2732
---------------
Entity mentionned: Colvin
who died trying to retriev

## Build a 2-dimensional object containing sentiment per entity, per source

In [10]:
# Nested mapping: entity name -> source -> list of sentiment scores
ent_source_sent = {}

for intro in total_introductions:
    person, outlet = intro['person'], intro['source']
    # setdefault creates the missing inner dict / list on first sight of a key
    ent_source_sent.setdefault(person, {}).setdefault(outlet, []).append(intro['sentiment'])

In [15]:
# An example of how one entity (a city) is described by different sources

print ent_source_sent['Aleppo']

{u'nytimes.com': [0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, -0.5574, 0.0, 0.0, 0.0, 0.0], u'allafrica.com': [-0.5994], u'bloomberg.com': [-0.5994, 0.0, 0.0, -0.2023, 0.0, -0.4404, -0.1531, -0.1531, 0.0, 0.0], u'bbc.co.uk': [0.0516, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, -0.3182, -0.5994, -0.5994, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0], u'theguardian.com': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.7096, 0.0, -0.1531, 0.0], u'telegraph.co.uk': [0.4019, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3182, 0.4404, -0.296, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3818, -0.1531, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.3182, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.2023, -0.1531, 0.0, 0.0, -0.1531, -0.1531, 

In [16]:
# We get rid of entities that don't contain enough data

entities_kept = []

for entity in ent_source_sent.keys():
    sentiments = ent_source_sent[entity]
    total_size = sum([len(sentiments[source]) for source in sentiments.keys()])
    if total_size >= 3:
        entities_kept.append(entity)
        
print "We will keep a total of %s / %s in our dataset" % (len(entities_kept), len(ent_source_sent.keys()))


sources = set([])
for entity in entities_kept:
    sources|= set(ent_source_sent[entity].keys())
sources = list(sources)

print "We have ", len(sources), "sources: ", sources

We will keep a total of 7852  /  25128 in our dataset
We will keep a total of 7852 / 25128 in our dataset
We have  22 sources:  [u'telegraph.co.uk', u'foxnews.com', u'ap.org', u'businessinsider.in', u'independent.co.uk', u'reuters.com', u'wikinews.org', u'cnn.com', u'techcrunch.com', u'aa.com.tr', u'allafrica.com', u'nytimes.com', u'bloomberg.com', u'bbc.co.uk', u'latimes.com', u'rt.com', u'france24.com', u'chinadaily.com.cn', u'theguardian.com', u'washingtonpost.com', u'middleeasteye.net', u'aljazeera.com']


## We create the array we will use in our sparse model

In [14]:
# Parameters: changing these affects the results you get
Pos_neg_ratio = 2.0
overall_ratio = 0.15
pos_threshold = 0.15
neg_threshold = -0.15

N = len(entities_kept)
M = len(sources)
A = np.zeros((N, M))

sentiment_counts = Counter()

source2j = {source: j for j, source in enumerate(sources)}

for i, entity in enumerate(entities_kept):
    for source in ent_source_sent[entity].keys():
        sent_array = np.array(ent_source_sent[entity][source])
        N_pos = float(len(np.where(sent_array > pos_threshold)[0]))
        N_neg = float(len(np.where(sent_array < neg_threshold)[0]))
        T = float(len(sent_array))
        aggregate_sentiment = 0
        if N_pos > Pos_neg_ratio*N_neg and N_pos > overall_ratio*T:
            aggregate_sentiment = 1
        elif N_neg > Pos_neg_ratio*N_pos and N_neg > overall_ratio*T:
            aggregate_sentiment = -1
        j = source2j[source]
        
        A[i,j] = aggregate_sentiment
        
        sentiment_counts[aggregate_sentiment] += 1

print "We allocated some sentiment in this matrix, the repartition is:", sentiment_counts

We allocated some sentiment in this matrix, the repartition is: Counter({0: 19061, 1: 3650, -1: 2670})


## Model source similarity

In [15]:
# Write code that uses this (entities x sources) matrix to compute how similar
# the sources are, based on the bias visible in the way they describe entities