## Imports

In [1]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer # For sentiment analysis
import cPickle as pickle # For loaded dataset from pickle file
import tqdm # Progress bar
from collections import Counter # Handy addon
from pprint import pprint # Useful to print JSON objects
import numpy as np

## Load the dataset of articles with introductions 

In [6]:
# This loads the file that you want, might take several seconds (up to a minute)

with open("news_sentiment.pickle", "r") as f:
    articles = pickle.load(f)
print len(articles), "articles were loaded"
print "Example article:"
pprint(articles[1040])


57767 articles were loaded
Example article:
{u'introductions': [{u'person': u'Bashar al-Assad',
                     u'text': u'President',
                     u'wdid': u'Q44329'},
                    {u'person': u'Emile Hokayem',
                     u'text': u'in Foreign Policy'},
                    {u'person': u'Ahrar al Sham',
                     u'text': u'the most important groups',
                     u'wdid': u'Q860943'},
                    {u'person': u'Vladimir Putin',
                     u'text': u'Russian President',
                     u'wdid': u'Q7747'},
                    {u'person': u'Barack Obama',
                     u'text': u'U.S. President',
                     u'wdid': u'Q76'},
                    {u'person': u'Osama Abu Zeid',
                     u'text': u'a senior adviser to the moderate Free Syrian Army'},
                    {u'person': u'Op-Ed',
                     u'text': u'for The Washington Post',
                     u'wdid': u'Q2602337'},
 

In [8]:
# separate articles from the two stories
ISIS_articles = []
Brexit_articles = []
for a in articles:
    if a["news_topic"] == 'ISIS War':
        ISIS_articles.append(a)
    else:
        Brexit_articles.append(a)
        
print len(ISIS_articles), " articles from ISIS War and ", len(Brexit_articles), "articles from Brexit were loaded"

39206  articles from ISIS War and  18561 articles from Brexit were loaded


In [9]:
# get only articles from one story, you can change this
articles = ISIS_articles

## Extract introductions, and obtain their sentiment

In [10]:
analyzer = SentimentIntensityAnalyzer()

total_introductions = []
for a in articles:
    for intro in a.get('introductions', []):
        intro['source'] = a['source']
        total_introductions.append(intro)

for intro in tqdm.tqdm_notebook(total_introductions):
    intro['sentiment'] = analyzer.polarity_scores(intro['text'])['compound']

Widget Javascript not detected.  It may not be installed properly.





In [11]:
# gets all of the introductions and splits on whitespace
intro_texts = map(lambda x: x[u'text'].split(), total_introductions)
# converts to lowercase and flattens list
words = []
for intro in intro_texts:
    for word in intro:
        words.append(word.lower())


In [12]:
# counts all of the words
word_counts = {}
for word in words:
    # word already seen
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1


In [13]:
# most common words found
word_counts_list = [el for el in word_counts.iteritems()]
word_counts_list.sort(key = lambda x: -x[1])

In [14]:
# prints most common words
n = 10
print '%s most common words' % n
for i in range(n):
    print '%s, %s' % (word_counts_list[i][0], word_counts_list[i][1])

10 most common words
the, 53207
of, 30492
president, 30058
minister, 23643
a, 22037
's, 20523
in, 18812
-, 17370
prime, 12534
who, 12132


In [15]:
# hyperparameters for selecting important words
word_pos_threshold = 0.1
word_neg_threshold = -0.1
word_freq_threshold = 100  # shows up at least this many times

charged_words = []
for word, count in word_counts_list:
    if count < word_freq_threshold:
        break  # ok because sorted by frequency already
    word_sentiment = analyzer.polarity_scores(word)['compound']
    if word_sentiment > word_pos_threshold or word_sentiment < word_neg_threshold:
        charged_words.append((word, word_sentiment, count))


In [16]:
charged_words_list = zip(*charged_words)[0]

In [None]:
# Example some sentiment for some of the introductions

subsample = np.random.choice(total_introductions, 100)
for intro in subsample:
    if intro['sentiment'] != 0:
        print "---------------"
        print "Entity mentionned:", intro['person']
        print intro['text']
        print "Sentiment:", intro['sentiment']

## Build a 2-dimensional object containing sentiment per entity, per source

In [17]:
# for indexing the charged words
charged_words_dict = dict(zip(charged_words_list, [i for i in range(len(charged_words_list))]))

In [18]:
# count number of unique sources
sources = set(map(lambda intro: intro['source'], total_introductions))
num_unique_sources = len(sources)
sources_dict = dict(zip(list(sources), [i for i in range(len(sources))]))

In [90]:
# Build word count matrix that is sources by charged_words

# initialize with zeros
data_matrix_sources = np.zeros((len(charged_words_list), num_unique_sources), dtype=np.float)

for intro in total_introductions:
    intro_text = intro[u'text'].split()
    
    # if word is a charged word, increment corresponding data cell by 1
    for word in intro_text:
        if word in charged_words_dict:
            s = intro['source']
            data_matrix_sources[charged_words_dict[word], sources_dict[s]] += 1

In [20]:
# count number of unique people
people = set(map(lambda intro: intro['person'], total_introductions))
num_unique_people = len(people)
people_dict = dict(zip(list(people), [i for i in range(len(people))]))

In [21]:
# Build word count matrix that is people by charged_words

# initialize with zeros
data_matrix_people = np.zeros((len(charged_words_list), num_unique_people), dtype=np.float)

for intro in total_introductions:
    intro_text = intro[u'text'].split()
    
    # if word is a charged word, increment corresponding data cell by 1
    for word in intro_text:
        if word in charged_words_dict:
            p = intro['person']
            data_matrix_people[charged_words_dict[word], people_dict[p]] += 1

In [25]:
# count number of unique people
source_people = set(map(lambda intro: (intro['source'], intro['person']), total_introductions))
num_unique_source_people = len(source_people)
source_people_dict = dict(zip(list(source_people), [i for i in range(len(source_people))]))

In [27]:
# Build word count matrix that is (source, people) by charged_words

# initialize with zeros
data_matrix_people = np.zeros((len(charged_words_list), num_unique_source_people), dtype=np.float)

for intro in total_introductions:
    intro_text = intro[u'text'].split()
    
    # if word is a charged word, increment corresponding data cell by 1
    for word in intro_text:
        if word in charged_words_dict:
            s = intro['source']
            p = intro['person']
            data_matrix_people[charged_words_dict[word], source_people_dict[(s, p)]] += 1

In [9]:
ent_source_sent = {}

for intro in total_introductions:
    p = intro['person']
    s = intro['source']
    if p not in ent_source_sent:
        ent_source_sent[p] = {}
    if s not in ent_source_sent[p]:
        ent_source_sent[p][s] = []
    ent_source_sent[p][s].append(intro['sentiment'])

In [10]:
# An example of how one entity (a city) is described by different sources

print ent_source_sent['Aleppo']

{u'nytimes.com': [0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, -0.5574, 0.0, 0.0, 0.0, 0.0], u'allafrica.com': [-0.5994], u'bloomberg.com': [-0.5994, 0.0, 0.0, -0.2023, 0.0, -0.4404, -0.1531, -0.1531, 0.0, 0.0], u'bbc.co.uk': [0.0516, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, -0.3182, -0.5994, -0.5994, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0], u'theguardian.com': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.7096, 0.0, -0.1531, 0.0], u'telegraph.co.uk': [0.4019, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3182, 0.4404, -0.296, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3818, -0.1531, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.3182, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.2023, -0.1531, 0.0, 0.0, -0.1531, -0.1531, 

In [11]:
# We get rid of entities that don't contain enough data

entities_kept = []

for entity in ent_source_sent.keys():
    sentiments = ent_source_sent[entity]
    total_size = sum([len(sentiments[source]) for source in sentiments.keys()])
    if total_size >= 3:
        entities_kept.append(entity)
        
print "We will keep a total of %s / %s in our dataset" % (len(entities_kept), len(ent_source_sent.keys()))


sources = set([])
for entity in entities_kept:
    sources|= set(ent_source_sent[entity].keys())
sources = list(sources)

print "We have ", len(sources), "sources: ", sources

We will keep a total of 7852 / 25128 in our dataset
We have  22 sources:  [u'telegraph.co.uk', u'foxnews.com', u'ap.org', u'businessinsider.in', u'independent.co.uk', u'reuters.com', u'wikinews.org', u'cnn.com', u'techcrunch.com', u'aa.com.tr', u'allafrica.com', u'nytimes.com', u'bloomberg.com', u'bbc.co.uk', u'latimes.com', u'rt.com', u'france24.com', u'chinadaily.com.cn', u'theguardian.com', u'washingtonpost.com', u'middleeasteye.net', u'aljazeera.com']


## We create the array we will use in our sparse model

In [12]:
# Parameters: changing these affects the results you get
Pos_neg_ratio = 2.0
overall_ratio = 0.15
pos_threshold = 0.15
neg_threshold = -0.15

N = len(entities_kept)
M = len(sources)
A = np.zeros((N, M))

sentiment_counts = Counter()

source2j = {source: j for j, source in enumerate(sources)}

for i, entity in enumerate(entities_kept):
    for source in ent_source_sent[entity].keys():
        sent_array = np.array(ent_source_sent[entity][source])
        N_pos = float(len(np.where(sent_array > pos_threshold)[0]))
        N_neg = float(len(np.where(sent_array < neg_threshold)[0]))
        T = float(len(sent_array))
        aggregate_sentiment = 0
        if N_pos > Pos_neg_ratio*N_neg and N_pos > overall_ratio*T:
            aggregate_sentiment = 1
        elif N_neg > Pos_neg_ratio*N_pos and N_neg > overall_ratio*T:
            aggregate_sentiment = -1
        j = source2j[source]
        
        A[i,j] = aggregate_sentiment
        
        sentiment_counts[aggregate_sentiment] += 1

print "We allocated some sentiment in this matrix, the repartition is:", sentiment_counts

We allocated some sentiment in this matrix, the repartition is: Counter({0: 19061, 1: 3650, -1: 2670})


## Model source similarity

In [108]:
#  Sparse PCA stuff
#  want to find what journals think of people overall.

from sklearn import decomposition

test_data = np.zeros((22, 22))
test_data[0, 0] = 100

n = 4 # extract 5 components
# alpha is a sparsity param
print data_matrix_sources.shape
estimator = decomposition.SparsePCA(n_components=n, alpha=1e-2)
#estimator.fit_transform(data_matrix_sources)  # for reduction on sources
estimator.fit_transform(data_matrix_sources)  # for reduction on words

components = estimator.components_
projected_data = estimator.fit_transform(data_matrix_sources)
print components.shape
#import pdb; pdb.set_trace()
print(components)
print(components <= 0)

import pdb; pdb.set_trace()
x=0


(66, 22)
(4, 22)
[[ -2.57161676e+02  -3.13124873e+01  -1.67514264e-01  -1.56728158e+02
   -7.57601502e+01  -2.88576253e+01  -4.47549446e+02  -4.75839309e+00
   -6.99757120e+02  -2.04463264e+02  -8.41720300e+01  -7.08626227e+01
   -2.02961059e+02  -8.23089470e+01  -5.92573446e+00  -2.02288184e+02
   -3.00643126e-01  -1.05936915e+00  -2.35632614e+02  -3.90852246e+01
   -1.03738181e+01  -7.10581633e+01]
 [  4.93739773e+01  -2.05719125e+00  -2.40631681e-01   7.01256579e+01
    3.25005574e+01   2.77506829e+00  -5.98477771e+01   1.22349241e+00
   -9.60657517e+01   7.72828526e+01  -2.62763204e+01   6.73159658e+00
    6.95906502e+00   2.35199193e+01   2.41944409e+00   1.14241671e+02
   -1.15557934e-01   2.83374284e-01   8.45980602e+01   3.12673349e-02
    7.24848682e+00   1.27662484e+01]
 [  1.52291873e+00   3.32105358e+00   4.80592117e-02  -3.28681992e+01
   -8.21196740e+01  -6.57151721e+00   4.53062675e+01   8.48634493e-01
   -5.07702394e+01  -7.27273414e+00   7.18504147e+00   2.10535145e+01

BdbQuit: 

In [63]:
# Write code that uses this matrix (entities, sources) to compute
# source similarity visible in bias of the way they describe entities

In [15]:
def soft_threshold (x, t):
    if x > 0:
        return max(x-t, 0)
    else:
        return min(x+t, 0)

In [None]:
def graphical_lasso(X, lambda_initial_guess = 1e-2, lambda_lasso = 1e-2
                    max_iteration_matrix=10, max_iteration_partition=10,
                    convergence_threshold_matrix = 1e-2, convergence_threshold_partition=1e-6):
    S = np.cov(X.T)  # sampled covariance S
    if lambda_lass == 0:
        return np.linalg.inv(S)
    
    p = S.shape[0]
    W = S + lambda_initial_guess * np.identity(p)
    precision = np.linalg.inv(W)
    index = np.arange(p)  # for ease of indexing later
    
    for i in range(max_iteration_matrix):
        beta_coefs = np.zeros((p-1, p), dtype=np.float)
        for j in range(p):
            # partition W and S
            
    
    

In [24]:
def graphical_lasso (X, lambda_parameter = 0.01, max_iteration = 10, threshold = 0.01):
    # X: the original data matrix
    # lambda_parameter: hyperparameter for L1 penalty
    # max_iteration: maximum iteration
    # threshold: convergence threshold
    
    # check if hyperparameter is zero
    if lambda_parameter == 0:  # when lambda = 0, simply reutrn the inverse of sample covariance matrix
        return np.linalg.inv(np.cov(X.T))
    
    # if hyperparameter is not zero, go on to do the following
    # step 1 in algorithm 9.1
    p = X.shape[1]                          # number of features
    S = np.cov(X.T)                         # sample covariance matrix
    W = np.cov(X.T)                         # set the initial W matrix
    precision = np.linalg.inv(np.cov(X.T))  # initialize the precision matrix
    index = np.arange(p)                    # index used to partition the matrix W and S
    
    # step 2 in algorithm 9.1
    for i in range(max_iteration):
        #W_old = W  # later used to check for convergence
        B = np.zeros((p - 1, p))                  # used to store the beta coefficients

        for j in range(p):
            
            # partition W and S
            W_11 = W[index != j].T[index != j]        # select W matrix's all but the jth row and column to form W_11
            s_12 = S[j, index != j]                   # select the s12 from S, I actually select s12.T for easier dimension
            beta_j = - precision[index != j, j] / precision[j, j]  ##??## is this the right way to define the initial beta_j?
            V = W[index != j].T[index != j]           # used in coordinate descent
            
            # pathwise coordinate descent
            for n in range(max_iteration):
                #beta_old = beta_j.copy()  # previous beta, used for checking convergence of beta_j
                for k in range(p - 1): 
                    ##!!## this is adopted from 17.26 in Elements of Statistical Learning by Hastie, Tishirani, and Friedman
                    beta_j[k] = soft_threshold(s_12[k] - np.dot(V[np.arange(p - 1) != j, k], beta_j[np.arange(p - 1) != j]), 
                                               lambda_parameter) / V[k, k]
                    
                #if np.abs(beta_j - beta_old).mean() < threshold: ####!!#### some better test for convergence of beta_j
                #    B[np.arange(p - 1), j] = beta_j
                #    break
            
            # store the beta coefficients for jth freature
            B[np.arange(p - 1), j] = beta_j
            
            #else:
            #    # this triggers if break command did not occur
            #    print "The coordinate descent did not converge. Try to increase the maximum number of iterations."
                
            # update the w_12
            W[index != j, j] = np.dot(W_11, beta_j)
            
        #if check_convergence(W_old, W, S, threshold):
        #    break
            
    #else:
    #    # this triggers if break command did not occur
    #    print "The algorithm did not converge. Try to increase the maximum number of iterations."
    
    # update the precision matrix
    # step 3 in algorithm 9.1
    for j in range(p):
        precision[j, j] = 1 / (W[j, j] - np.dot(W[index != j, j], B[np.arange(p - 1), j]))  # this is theta_hat_22
        precision[index != j, j] = - B[np.arange(p - 1), j] * precision[j, j]               # this is theta_hat_12
    
    return precision

In [25]:
result = graphical_lasso(A, lambda_parameter=1e-2) # try it with the A matrix specified in Sentiment.ipynb

In [26]:
x = 0

result_inv = np.linalg.inv(result)
print np.linalg.norm(A)
print np.linalg.norm(result_inv)

import pdb; pdb.set_trace()

x = 1



79.4984276574
0.234252614242
--Return--
> <ipython-input-26-0748940d0e2e>(7)<module>()->None
-> import pdb; pdb.set_trace()
(Pdb) result == result.T
array([[ True, False,  True,  True,  True, False,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True],
       [False,  True,  True,  True,  True, False,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True, False,  True,
         True, False,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

BdbQuit: 