In [1]:
% matplotlib inline

from __future__ import division

import numpy as np 
import pandas as pd 

import spacy
from spacy.tokens.doc import Doc
import inspect

from textacy.vsm import Vectorizer
import textacy.vsm

import scipy.sparse as sp

from tqdm import *

import re
import en_core_web_sm

### Loading Data

In [2]:
def get_tweets(name):
    tweet_data = pd.read_csv('Tweets//'+name+'_tweets.csv', encoding = 'utf-8')
    return tweet_data

In [65]:
tweets = get_tweets('ewarren')
tweets = tweets.dropna()

### Pre-processing Data

In [66]:
# extracting URLS
tweets['urls'] = tweets.text.apply(lambda x: re.findall('http\S+', x))
tweets['text_clean'] = tweets.text.apply(lambda x: re.sub(u'http\S+', u'', x)) 

# extracting @... 
tweets['mentions'] = tweets.text.apply(lambda x: re.findall(u'@(\w+)', x))
tweets.text_clean = tweets.text_clean.apply(lambda x: re.sub(u'@(\w+)', u'', x))

# extracting hashtags
tweets['hashtags'] = tweets.text.apply(lambda x: re.findall(u'#(\w+)', x))
tweets.text_clean = tweets.text_clean.apply(lambda x: re.sub(u'#', u'', x))

# Binary for Re-Tweets
tweets['retweet'] = tweets.text.apply(lambda x: 1 if len(re.findall(u'RT', x))>0 else 0)
tweets.text_clean = tweets.text_clean.apply(lambda x: re.sub(u'RT', u'', x))

In [67]:
tweets

Unnamed: 0,id,created_at,text,urls,text_clean,mentions,hashtags,retweet
0,1144276520875319296,2019-06-27 16:09:16,I'm done playing by a different set of rules. ...,[https://t.co/WV0BqmmKjU],I'm done playing by a different set of rules. ...,[],[],0
1,1144276520090918913,2019-06-27 16:09:16,Mitch McConnell says efforts to stop gerrymand...,[],Mitch McConnell says efforts to stop gerrymand...,[],[],0
2,1144276518786555906,2019-06-27 16:09:16,The Supreme Court's decision is an abomination...,[],The Supreme Court's decision is an abomination...,[],[],0
3,1144237476451639296,2019-06-27 13:34:07,You bet I have a plan to deal with Mitch McCon...,[https://t.co/X0ODfTwoRy],You bet I have a plan to deal with Mitch McCon...,[],[DemDebate],0
4,1144088262174228480,2019-06-27 03:41:12,The economy is doing great for a thinner and t...,[https://t.co/8oBa9a4TmB],The economy is doing great for a thinner and t...,[],[DemDebate],0
5,1144086118465515520,2019-06-27 03:32:41,I spent a big chunk of my life studying why fa...,[https://t.co/MhYfR235nG],I spent a big chunk of my life studying why fa...,[],"[MedicareForAll, DemDebate]",0
6,1144082731745366016,2019-06-27 03:19:13,RT @TeamWarren: .@ewarren can make our governm...,[],": . can make our government, our economy, and...","[TeamWarren, ewarren]",[DemDebate],1
7,1144080356729118721,2019-06-27 03:09:47,I just stepped off the #DemDebate stage. I hav...,[https://t.co/pSUwFnT5qq],I just stepped off the DemDebate stage. I have...,[],[DemDebate],0
8,1144079755987341312,2019-06-27 03:07:24,I’m in this fight for a government and an econ...,[],I’m in this fight for a government and an econ...,[],[DemDebate],0
9,1144074674898382848,2019-06-27 02:47:13,No one should be above the law—that includes t...,[https://t.co/v1pgkcdRDm],No one should be above the law—that includes t...,[],[DemDebate],0


In [6]:
nlp = en_core_web_sm.load()

In [7]:
spacy_tweets = []

for doc in nlp.pipe(tweets.text_clean, n_threads = -1):
    spacy_tweets.append(doc)

### Getting the tf-idf values of content words

In [39]:
useful_entities = [u'NORP', u'FACILITY', u'ORG', u'EVENT', u'DATE', u'TIME']

Healthcare = [u'care','affordable care act', u'medicare', u'obamacare',  u'benefits', u'opioid', u'abortion', u'medicaid', u'single payer', u'medication', u'prescription', u'drugs', u'hospital', u'health', u'care']
Terrorism = [u'al', u'terror', u'sanctions', u'drone', u'oil', u'gulf', u'intelligence', u'ISIS', u'military', u'nuclear']
Economy = [u'china', u'tariff', u'mexico', u'fed', u'sec', u'interest', u'rate', u'immigration', u'debt', u'unemployment', u'growth', u'inflation', u'trade war', u'dollar', u'gdp', u'g20', u'recession', u'stock', u'market']
lgbtq = [u'gay', u'lesbian', u'pride', u'community', u'lgbt', u'same-sex', u'marriage', u'orientation', u'parade', u'stonewall', u'riot', u'civil', u'homosexual', u'queer', u'conversion']


In [9]:
content_tweets = []
for single_tweet in tqdm(spacy_tweets):
    single_tweet_content = []
    for token in single_tweet: 
        if ((token.ent_type_ in useful_entities)  
            or (token.pos_ in [u'NOUN', u'ADJ', u'VERB']) 
            or (token.lower_ in Healthcare)):
            single_tweet_content.append(token)
    content_tweets.append(single_tweet_content)

100%|██████████| 295/295 [00:00<00:00, 11203.45it/s]


In [10]:
content_tweets

[[Tying,
  president,
  hands,
  midst,
  crisis,
  is,
  misguided,
  dangerous,
  NDAA,
  amendment,
  is,
  authorization,
  use,
  military,
  force,
  affirms,
  constitutional,
  authority,
  president,
  keep,
  military,
  citizens,
  allies,
  safe],
 [Live,
  be,
  speaking,
  NDAA,
  amendment,
  reaffirm,
  constitutional,
  authorities,
  president,
  must,
  have,
  protect,
  defend,
  country,
  Watch],
 [are,
  many,
  aspects,
  healthcare,
  system,
  could,
  use,
  dose,
  transparency,
  Finding,
  ways,
  reform,
  surprise,
  billing,
  patients,
  are,
  situation,
  receive,
  outsized,
  bill,
  emergency,
  services,
  is,
  top,
  priority,
  work],
 [Fostering,
  sustainable,
  forest,
  land,
  management,
  policies,
  impact,
  wildfires,
  finding,
  ways,
  reduce,
  fuel,
  load,
  are,
  issues,
  critical,
  importance,
  Discussed,
  items,
  more,
  today,
  friend],
 [Raising,
  smoking,
  age,
  to,
  21,
  will,
  help,
  keep,
  harmful,
  to

In [11]:
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')

In [12]:
term_matrix = vectorizer.fit_transform([tok.lemma_ for tok in doc] for doc in spacy_tweets)

In [13]:
np_matrix = term_matrix.todense()

In [14]:
np_matrix.shape

(295, 2001)

In [15]:
for key in sorted(vectorizer.vocabulary_terms)[1000:1015]:
    print(key, vectorizer.vocabulary_terms[key])

energy 1000
enforcement 1001
engagement 1002
engineering 1003
enjoy 1004
enormous 1005
enough 1006
enrol 1007
ensure 1008
enter 1009
enthusiastic 1010
entire 1011
entitlement 1012
entrance 1013
entrust 1014


In [16]:
for token in content_tweets[89]:
    print(token.lemma_, vectorizer.vocabulary_terms[token.lemma_], 
          np.max(np_matrix[:,vectorizer.vocabulary_terms[token.lemma_]]))

patient 1483 5.0809215418899605
face 1050 9.221835825288448
skyrocket 1713 5.997212273764115
prescription 1533 10.60813018640834
drug 965 15.242764625669881
cost 859 8.584928363051379
insurance 1243 11.183494331311902
company 820 5.0809215418899605
amp 648 7.964618506443701
drug 965 15.242764625669881
manufacturer 1340 5.997212273764115
benefit 716 5.591747165655951
complex 826 5.591747165655951
system 1785 9.221835825288448
rebate 1584 5.997212273764115
be 705 6.587737349619255
introduce 1251 4.125410096862524
bill 725 7.599975392855791
aim 632 5.0809215418899605
bring 744 4.387774361330015
transparency 1854 5.0809215418899605
prescription 1533 10.60813018640834
drug 965 15.242764625669881
pricing 1543 5.591747165655951
system 1785 9.221835825288448
amp 648 7.964618506443701
lower 1326 5.591747165655951
pocket 1515 5.591747165655951
cost 859 8.584928363051379
medication 1353 5.997212273764115


In [17]:
tfidf_dict = {}
content_vocab = []
for tweet in content_tweets: 
    for token in tweet: 
        if token.lemma_ not in tfidf_dict: 
            content_vocab.append(token.lemma_)
            tfidf_dict[token.lemma_] = np.max(np_matrix[:,vectorizer.vocabulary_terms[token.lemma_]])

In [18]:
content_vocab

['tie',
 'president',
 'hand',
 'midst',
 'crisis',
 'be',
 'misguided',
 'dangerous',
 'NDAA',
 'amendment',
 'authorization',
 'use',
 'military',
 'force',
 'affirm',
 'constitutional',
 'authority',
 'keep',
 'citizen',
 'ally',
 'safe',
 'live',
 'speak',
 'reaffirm',
 'must',
 'have',
 'protect',
 'defend',
 'country',
 'watch',
 'many',
 'aspect',
 'healthcare',
 'system',
 'could',
 'dose',
 'transparency',
 'find',
 'way',
 'reform',
 'surprise',
 'billing',
 'patient',
 'situation',
 'receive',
 'outsized',
 'bill',
 'emergency',
 'service',
 'top',
 'priority',
 'work',
 'foster',
 'sustainable',
 'forest',
 'land',
 'management',
 'policy',
 'impact',
 'wildfire',
 'reduce',
 'fuel',
 'load',
 'issue',
 'critical',
 'importance',
 'discuss',
 'item',
 'more',
 'today',
 'friend',
 'raise',
 'smoking',
 'age',
 'to',
 '21',
 'will',
 'help',
 'harmful',
 'tobacco',
 'product',
 'child',
 'pleased',
 'see',
 'bipartisan',
 'tobacco21',
 'legislation',
 'pass',
 'look',
 'cont

In [19]:
for key in sorted(tfidf_dict)[500:505]:
    print ("WORD:" + str(key) + " -- tf-idf SCORE:" +  str(tfidf_dict[key]))

WORD:crisis -- tf-idf SCORE:4.898599985096006
WORD:critical -- tf-idf SCORE:4.125410096862524
WORD:crossroad -- tf-idf SCORE:5.997212273764115
WORD:crucial -- tf-idf SCORE:5.591747165655951
WORD:culmination -- tf-idf SCORE:5.997212273764115


In [20]:
tfidf_dict['economy']

9.488898610537493

In [21]:
from pymprog import *
begin('COWTS')

model('COWTS') is the default model.

In [23]:
x = var('x', len(spacy_tweets), bool)

# Check this worked

In [24]:
y = var('y', len(content_vocab), bool)
len(y), y[0]

(1500, 0 <= y[0] <= 1 binary)

In [25]:
maximize(sum(x) + sum([tfidf_dict[content_vocab[j]]*y[j] for j in range(len(y))]));

In [26]:
## Maximum length of the entire tweet summary

# Was 150 for the tweet summary, 
# But generated a 1000 word summary for CONABS
L = 1000

# hiding the output of this line since its a very long sum 
sum([x[i]*len(spacy_tweets[i]) for i in range(len(x))]) <= L;


In [27]:
def content_words(i):
    '''Given a tweet index i (for x[i]), this method will return the indices of the words in the 
    content_vocab[] array
    Note: these indices are the same as for the y variable
    '''
    tweet = spacy_tweets[i]
    content_indices = []
    
    for token in tweet:
        if token.lemma_ in content_vocab:
            content_indices.append(content_vocab.index(token.lemma_))
    return content_indices

In [28]:
def tweets_with_content_words(j):
    '''Given the index j of some content word (for content_vocab[j] or y[j])
    this method will return the indices of all tweets which contain this content word
    '''
    content_word = content_vocab[j]
    
    index_in_term_matrix = vectorizer.vocabulary_terms[content_word]
    
    matrix_column = np_matrix[:, index_in_term_matrix]
    
    return np.nonzero(matrix_column)[0]

In [29]:
for j in range(len(y)):
    sum([x[i] for i in tweets_with_content_words(j)])>= y[j]

In [30]:
for i in range(len(x)):
    sum(y[j] for j in content_words(i)) >= len(content_words(i))*x[i]

In [31]:
solve()

(0,
 'The MIP problem instance has been successfully solved. (This code\ndoes {\\it not} necessarily mean that the solver has found optimal\nsolution. It only means that the solution process was successful.)')

In [32]:
result_x =  [value.primal for value in x]
result_y = [value.primal for value in y]

In [33]:
end()

model('COWTS') is not the default model.

In [34]:
chosen_tweets = np.nonzero(result_x)
chosen_words = np.nonzero(result_y)

In [35]:
len(chosen_tweets[0]), len(chosen_words[0])

(31, 466)

In [36]:
for i in chosen_tweets[0]:
    print('--------------')
    print(spacy_tweets[i])

--------------
Fostering sustainable forest and land management policies, the impact of wildfires, and finding ways to reduce fuel load are all issues of critical importance to Utah. Discussed these items and more today with my friend . 
--------------
The Senate border supplemental bill provides much-needed resources for  and . But we urgently need a long-term fix that discourages illegal immigration by securing the border, closing legal loopholes, and instituting mandatory &amp; permanent E-Verify. 
--------------
My thoughts on climate change and the need to encourage investment in breakthrough technology to help tackle this global problem. 
--------------
 : Sen. Mitt Romney warns of ’increasingly harmful consequence’ for American consumers if Trump imposes tariffs on imports…
--------------
By 2021, 90% of Russia’s nuclear fleet will be modernized. Our triad hasn't been modernized since the 1980s. Congress must make the modernization of our nuclear deterrent a high priority—which 