In [2]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag.stanford import StanfordNERTagger
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from  textacy.vsm import Vectorizer

from tqdm import *
from pprint import pprint

from pymprog import *

Getting all the tweets from the twitter api and then saving it here

In [3]:
# Load the collected tweets from the local CSV; the file is decoded as ISO-8859-1.
tweets = pd.read_csv('./tweet.csv',encoding='ISO-8859-1')

## Data Preprocessing

In [50]:
# Drop rows with any missing field, then keep only tweets whose retweet count
# is non-zero. `dropna()` is used without `inplace=True` so the cell rebinds
# `tweets` instead of mutating the frame in place.
tweets = tweets.dropna()
tweets = tweets[tweets.retweets != 0]
tweets.to_csv('./preprocessed_tweets.csv')

# A bare `tweets.shape` in the middle of a cell is never displayed, so print it
# explicitly and preview a few rows rather than rendering the whole frame.
print(tweets.shape)
tweets.head(10)

Unnamed: 0,date,username,retweets,text,mentions,hashtags
6,2018-08-16 23:57:13,VIVEKSINGHANIA2,1,The devastation caused in Kerala by the floods...,@SavitriJindal @sajjanjindal @MPNaveenJindal @...,#KeralaFloods
9,2018-08-16 23:55:28,JayGooner_,1,Arsenal Please help the people of Kerala who a...,@Arsenal @ArsenalKerala,#KeralaFloods #KeralaRains
18,2018-08-16 23:47:47,Dhanya28297502,1,KeralaFloods StandWithKerala Sir pls help my s...,@arunjaitley @ArvindKejriwal @PMOIndia @rajnat...,#KeralaFloods #StandWithKerala
23,2018-08-16 23:42:15,NishaPurushoth2,25,"KeralaFloods , What kind of journalism is this...",@timesofindia,#KeralaFloods
31,2018-08-16 23:39:50,shonali_16,4,The little what I could for keralites. Please ...,@reliancejio @airtelindia @BSNLCorporate,#KeralaFloodRelief #KeralaFloods
34,2018-08-16 23:38:14,Femin42943183,1,Please help kerala .. horrible flooding has ki...,@Oprah @BillGates,#Keralafloods
41,2018-08-16 23:34:09,samjoths,4,Pls share guys.. This is authentic.. KeralaFlo...,@dhanyarajendran @CMOKerala @ndmaindia @Kerala...,#KeralaFloods #KeralaFloodRelief #KeralaFloods...
42,2018-08-16 23:33:58,rameez6444,1,Please use this format Use this hashtag protoc...,@KeralaSDMA @cskerala @adgpi @SnehaMKoshy,#SOSKerala #KeralaFloods #KeralaFloods2018
61,2018-08-16 23:24:10,InCrisisRelief,3,"Hi , please share/ update your availability in...",@VolunteersOrg,#KeralaFloods2018 #KeralaFloods
66,2018-08-16 23:22:08,marcoajh1,3,hi follow me and give me a lot of love in the ...,@PlaticaPolinesi,#FelizJueves #16Ago #MadonnaAt60 #16agosto #Ma...


Extracting text from the tweets dataframe

Removing URLs, Removing @..., and the hashtags

In [86]:
# Building the corpus
tweet_text = []

# Every match of these patterns is deleted from the tweet text, in this order.
# Raw strings are used so that `\S`, `\w` and `\b` reach the regex engine as
# written instead of relying on Python passing unknown string escapes through
# (which raises a warning on recent Python versions).
_CLEANUP_PATTERNS = [
    re.compile(r'https:\S+'),             # https links
    re.compile(r'http:\S+'),              # http links
    re.compile(r'@\w+'),                  # @mentions
    re.compile(r'#'),                     # the hash sign only; the tag word is kept
    re.compile(r'[â¦:;\-\x80Âµà&/]'),     # stray symbols (presumably decoding residue) and punctuation
    # Whole-word match: a plain substring replace would also eat the "RT"
    # inside words such as "ALERT" or "SUPPORT".
    re.compile(r'\bRT\b'),                # retweet marker
]


def _strip_noise(text):
    """Return `text` with every cleanup pattern removed, applied in list order."""
    for pattern in _CLEANUP_PATTERNS:
        text = pattern.sub(u'', text)
    return text


tweets.text = tweets.text.apply(_strip_noise)
print(tweets.text[6])

The devastation caused in Kerala by the floods is gutwrenching. Whatever we can do, we must. KeralaFloods pic.twitter.comSAgPpZdS2c


## Tokenizing with nltk

In [85]:
tknzr = TweetTokenizer()

# One token list per tweet, in the same order as the rows of `tweets`.
nltk_tweets = [tknzr.tokenize(raw_text) for raw_text in tweets.text]
nltk_tweets[-68]

['We',
 'are',
 'providing',
 'assistance',
 ',',
 'entire',
 'Kerala',
 'cannot',
 'be',
 'evacuated',
 '.',
 'The',
 'state',
 'government',
 'is',
 'trying',
 'their',
 'best',
 ',',
 'says',
 'K',
 'J',
 'Alphons',
 'while',
 'speaking',
 'to',
 'TIMES',
 'NOW',
 '\x99',
 's',
 'KeralaFloods',
 'pic.twitter.comokLxRpfevI']

In [7]:
#nltk.download()

Using POS tagger to get the array of various part of speech in the tweet

In [53]:
# Part-of-speech tag every tokenized tweet; each entry is a list of (token, tag) pairs.
nltk_pos = [pos_tag(tokens) for tokens in nltk_tweets]

# Inspect one example: its tags, then the named-entity chunks NLTK derives from them.
sample_tagged = nltk_pos[-68]
pprint(sample_tagged)
print(ne_chunk(sample_tagged))

[('We', 'PRP'),
 ('are', 'VBP'),
 ('providing', 'VBG'),
 ('assistance', 'NN'),
 (',', ','),
 ('entire', 'JJ'),
 ('Kerala', 'NNP'),
 ('cannot', 'NN'),
 ('be', 'VB'),
 ('evacuated', 'VBN'),
 ('.', '.'),
 ('The', 'DT'),
 ('state', 'NN'),
 ('government', 'NN'),
 ('is', 'VBZ'),
 ('trying', 'VBG'),
 ('their', 'PRP$'),
 ('best', 'JJS'),
 (',', ','),
 ('says', 'VBZ'),
 ('K', 'NNP'),
 ('J', 'NNP'),
 ('Alphons', 'NNP'),
 ('while', 'IN'),
 ('speaking', 'VBG'),
 ('to', 'TO'),
 ('TIMES', 'NNP'),
 ('NOW', 'NNP'),
 ('\x99', 'NNP'),
 ('s', 'NN'),
 ('KeralaFloods', 'NNP'),
 ('pic.twitter.com/okLxRpfevI', 'NN')]
(S
  We/PRP
  are/VBP
  providing/VBG
  assistance/NN
  ,/,
  entire/JJ
  (GPE Kerala/NNP)
  cannot/NN
  be/VB
  evacuated/VBN
  ./.
  The/DT
  state/NN
  government/NN
  is/VBZ
  trying/VBG
  their/PRP$
  best/JJS
  ,/,
  says/VBZ
  (PERSON K/NNP J/NNP Alphons/NNP)
  while/IN
  speaking/VBG
  to/TO
  (ORGANIZATION TIMES/NNP)
  NOW/NNP
  /NNP
  s/NN
  (ORGANIZATION KeralaFloods/NNP)
  pic.twitt

I tried named entity recognition using NLTK, but the results were not accurate.

In [9]:
#pattern = 'NP: {<DT>?<JJ>*<NN>}'
#cp = nltk.RegexpParser(pattern)
#cs = cp.parse(nltk_pos[-68])
#print(cs)

In [10]:
#iob_tagged= tree2conlltags(cs)
#pprint(iob_tagged)

Now using the Stanford Named Entity Recognizer (Stanford NER) instead.
First, we will point nltk at the Java executable using config_java.

In [54]:
# Machine-specific locations, named once so they only need editing in one place
# when the notebook is run on another machine.
JAVA_BIN = "/usr/lib/jvm/java-11-openjdk-amd64/bin/java"
STANFORD_NER_DIR = '/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16'

# Tell nltk which Java executable to use, then build the tagger from the
# 3-class English model and the Stanford NER jar.
nltk.internals.config_java(JAVA_BIN)
st = StanfordNERTagger(
    STANFORD_NER_DIR + '/classifiers/english.all.3class.distsim.crf.ser.gz',
    STANFORD_NER_DIR + '/stanford-ner.jar',
    encoding='utf-8',
)

In [55]:
# Tag each tweet with the Stanford tagger and keep only the tokens that
# received an entity label (anything other than 'O').
nltk_ents = []
for tokens in tqdm(nltk_tweets):
    labelled = st.tag(tokens)
    entities_only = [pair for pair in labelled if pair[1] != 'O']
    nltk_ents.append(entities_only)

100%|██████████| 2725/2725 [1:15:42<00:00,  1.73s/it]


The Stanford Named Entity Recognition library labels the tokens in the tweets with one of 3 classes (PERSON, ORGANIZATION, LOCATION).<br>
As numerals are also significant in the tweets, we will append them to the entity tokens. Hence, from each tweet we keep only the entities and the numbers.<br>
I will name the resulting list content_tweets.

Again, entities that are labelled as PERSON tend to relate more to the feelings of a person, hence I will remove them as well.

In [56]:
content_tweets = []
for tagged_tokens, entities in zip(nltk_pos, nltk_ents):
    # entities come first, with anything labelled PERSON left out
    kept_entities = [text for text, label in entities if label != 'PERSON']

    # followed by every token the POS tagger marked as a cardinal number
    numerals = [text for text, pos in tagged_tokens if pos == u'CD']

    content_tweets.append(kept_entities + numerals)

for idx, content in enumerate(content_tweets):
    print(idx, content)

0 ['Kerala']
1 ['Kerala']
2 []
3 []
4 ['Kerala']
5 ['75', '10302327271', '7234176', '0']
6 ['2018']
7 ['2018']
8 ['2018']
9 ['16Ago', '60', '16agosto', '10301811551', '6805120', '1']
10 ['2018']
11 ['2018']
12 []
13 ['BJP', '4India', '2', '10300994565', '14056192']
14 []
15 ['Kerala', 'Kerala']
16 ['Kerala', 'India', '2018']
17 ['10', '3year', '91', '95261', '34317']
18 ['kerala', 'India']
19 ['2018']
20 ['3', '5226604321', '5']
21 ['John', 'Pennycuick', 'Charitable', 'Trust', 'Kallvi', 'Educational', 'and', 'Public', 'Charitable', 'Trust', 'two']
22 ['2018']
23 ['34hrs', '10', '18', '02.8', '76', '20', '04.0']
24 ['Time', 'Kallissery', 'Pathanamthitta', '5', '2', '17/8', '18', '225am', '3', '30', '2018']
25 ['Navy', 'Kerala', 'State', 'Disaster', 'Management', 'Authority', 'Navy', 'Rescue', 'Helichopter', '91', '94465', '68222', '91', '94465', '68222', '91', '82812', '92702', '91', '82812', '92702']
26 ['Time', 'CMI', '5', '1', '16/8', '18', '215am', '3', '2018']
27 ['Thiruvalla', 'Th

1671 ['Kerala']
1672 []
1673 ['Army', 'Navy', '1000']
1674 ['India', 'Kerala', '67', '25', '50,000']
1675 ['02964938695', '3822208']
1676 ['Kerala', 'Kerala']
1677 ['Kerala']
1678 ['Ernakulam']
1679 ['Pathanamthitta']
1680 ['India']
1681 ['keralarescue.in']
1682 ['Kerala', 'Kerala', '67', '5/00050173', '67statewideredalert']
1683 ['Kerala']
1684 ['Kerala', 'India']
1685 ['Kerala', '67', '7', '14', '1.5', '65415305.cms']
1686 ['Kerala']
1687 ['Kerala']
1688 ['Kerala', 'Kerala', 'Kerala', 'Delhi', '2018']
1689 ['2018']
1690 ['UPI', 'Kerala', '2018']
1691 ['Edanadu', 'Chengannur']
1692 ['Chengannur', 'Kerala', '5', '9187925784', '75']
1693 ['Kozhipalam', '9188027055']
1694 ['Kerala']
1695 ['Army', 'Thiruvananthapuram', 'Ranny', 'Kozhencherry', 'Aranmula', '2018']
1696 ['Kozhencherry', 'Ayroor', 'Kerala', '2018', '5']
1697 ['9946244241']
1698 ['Kerala', 'Gujarat', 'Kerala']
1699 ['2b', '2', '67']
1700 ['Kerala']
1701 ['Thiruvananthapuram', 'Kozhencheri', 'Aranmula', 'Pathanamthitta', 'Kera

# Getting the tf-idf score

Now, we will compute the tf-idf score of each word, which measures how important a word present in a tweet is.<br>
So, I will compute the tf-idf scores over all of the nltk_tweets.

I care about the tf-idf scores of the entire tweet, so I will compute the tf-idf scores across the entire corpus of original tweets.

In [57]:
# Fit textacy's Vectorizer on the tokenized tweets (one token list per tweet),
# configured with linear term frequency and smoothed idf weighting.
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')
term_matrix = vectorizer.fit_transform(nltk_tweets)
# Displayed as (number of tweets, number of distinct terms).
term_matrix.shape

(2725, 11578)

The term_matrix is a document-term matrix of shape (#document, #unique terms).<br>
Each row is a document and each column is a  unique word.<br>
The values of the matrix is the tf-idf score of the particular unique word in the column

In [58]:
# Densify the document-term matrix so whole columns can be sliced and reduced with numpy.
np_matrix = term_matrix.todense()
# NOTE(review): this bare expression is not the last line of the cell, so its value is not shown.
np_matrix.shape

# Largest value found in column 527 across all tweets.
np.max(np_matrix[:,527])

8.217443431696534

The goal is to make a dictionary, which map from tokens in the content_tweets to some tf-idf score.<br>
Each word has a unique tf-idf value<br>
In simple words, we will find the column number of each word from content_tweets in the document-term matrix; since the vectorizer is now fitted, we can look it up with vectorizer.vocabulary_terms[word].<br>
Each column is a distinct word

In [59]:
# Show a slice of the alphabetically sorted vocabulary with each term's column number.
term_to_column = vectorizer.vocabulary_terms
for term in sorted(term_to_column)[527:715]:
    print(term, term_to_column[term])

230cr 527
233682 528
23544 529
2382 530
2398.66 531
2399.60 532
24 533
24/7 534
2401.10 535
2401.7 536
24019 537
2403 538
24282 539
2436 540
243am 541
245am 542
247 543
24782 544
24X7 545
24x7 546
25 547
25,00 548
25,000 549
250 550
2500 551
2504611784 552
2511078 553
2546,76 554
25553 555
256 556
257pc 557
257pcmorerainfallnormal86684 558
2580044 559
25Kg 560
25L 561
25Lacks 562
25Lacs 563
25Lakh 564
25Lakhs 565
25lacs 566
25lacstokeralafloods 567
25lakh 568
25lakhs 569
25lakhsdonationtokeralarainsidukkidistricttamilnews218917 570
25lakhskeralarelieffund 571
25lakhstokeralafloodvictims 572
25th 573
26 574
26/211 575
26/314806 576
26/8 577
2608201 578
2610094 579
26291c 580
2649605 581
266 582
26flightsfromDohacancelledorrerouted 583
26idukkidamfulltothebrim 584
26livesndrfteamsdeployed777424 585
26th 586
27 587
27,000 588
27324 589
27429 590
2776243 591
28 592
2806657 593
2827929 594
2834371 595
2838912 596
2857 597
2883734 598
2891904 599
29 600
29.2 601
29.3 602
292 603
2959 604
296

Now, to get the tf-idf score of all entities related to a specific content tweet, we will use the np.max function.<br>
We will take all the words from a specific content tweet and then use vectorizer.vocabulary_terms to get the column number of each word.<br>
Then, we use np.max to take the maximum value in the entire column for the word, and that will be its tf-idf score.

In [61]:
# For one example tweet: each content token, its column number, and the column maximum.
for token in content_tweets[1190]:
    column = vectorizer.vocabulary_terms[token]
    column_max = np.max(np_matrix[:, column])
    print(token, column, column_max)

Pathanamthitta 3711 4.998567606828332
4Hrs 774 7.811978323588368
two 11101 6.345641254794941


## We will now go through all of the content_tweets to get the dictionary

In [62]:
# Map every distinct content token that the vectorizer knows to the maximum of
# its column in the dense matrix; `content_vocab` records those tokens in
# first-seen order.
tfidf_dict = {}
content_vocab = []
for tweet in content_tweets:
    for token in tweet:
        already_scored = token in tfidf_dict
        if already_scored or token not in vectorizer.vocabulary_terms:
            continue
        column = vectorizer.vocabulary_terms[token]
        content_vocab.append(token)
        tfidf_dict[token] = np.max(np_matrix[:, column])
print(len(tfidf_dict))

2155


In [63]:
# Print a small alphabetical slice of the token -> score mapping.
for word in sorted(tfidf_dict)[900:911]:
    score = tfidf_dict[word]
    print("WORD:" + str(word) + " -- tf-idf SCORE:" + str(score))

WORD:689123 -- tf-idf SCORE:6.831149070576642
WORD:689532 -- tf-idf SCORE:7.811978323588368
WORD:689532ftid -- tf-idf SCORE:8.217443431696534
WORD:689633f97ec54178bd163dc4b970bb35 -- tf-idf SCORE:8.217443431696534
WORD:689641 -- tf-idf SCORE:8.217443431696534
WORD:69 -- tf-idf SCORE:7.118831143028423
WORD:6912435 -- tf-idf SCORE:8.217443431696534
WORD:6967502 -- tf-idf SCORE:8.217443431696534
WORD:6980480 -- tf-idf SCORE:8.217443431696534
WORD:6BillionPeople -- tf-idf SCORE:8.217443431696534
WORD:6E -- tf-idf SCORE:7.811978323588368


## Content Word-based Tweet Summarization

To generate something that would be more useful to people and other volunteers, the summary has to be made of content words with high tf-idf scores.<br>
This can be done by using Integer Linear Programming(ILP) where we will maximize an equation given some constraints.<br>
Equation: Maximize the total score of content words in my summary<br>
\begin{equation}
\sum_{i=1}^n x_{i} + \sum_{j = 1}^{m} Score(j) \cdot y_{j}
\end{equation}
where, $x_{i}$ is 1 if the tweet is selected or 0 if the tweet is not selected, where $y_{j}$ is 1 or 0 if each content word is included (and Score(j) is that word's tf-idf score).<br>
\begin{equation}
\sum_{i=1}^n x_{i} \cdot Length(i) \leq L
\end{equation}
Constraint 1: The total length of all the selected tweets to be less than some value L, which will be the length of my summary, L. I can vary L depending on how long I want my summary to be. 
<br>
\begin{equation}
\sum_{i \in T_{j}} x_{i} \geq y_{j}, j = [1,...,m]
\end{equation}
Constraint 2: If I pick some content word $y_{j}$ (out of my $m$ possible content words), then I want to have at least one tweet from the set of tweets which contain that content word, $T_{j}$.<br>
\begin{equation}
\sum_{j \in C_{i}} y_{j} \geq |C_{i}| \times x_{i}, i = [1,...,n]
\end{equation}
Constraint 3: If I pick a tweet in my summary, then all the content words in that tweet should be present in the summary<br>

The variables that the equations depend on are binary integers: 1 if the tweet or word is included and 0 if it is not.

In [64]:
# Open a pymprog model named 'COWTS'; the recorded output reports it as the default model.
begin('COWTS')

model('COWTS') is the default model.

var function is used to create variables,  

In [65]:
# First group of decision variables, x: one boolean variable per tokenized tweet.
# x[i] encodes whether tweet i is selected for the summary.
x = var('x', len(nltk_tweets), bool)
# Display one variable to check how it was declared.
x[1000]

0 <= x[1000] <= 1 binary

In [66]:
# Second group of decision variables, y: one boolean variable per entry of content_vocab.
# y[j] encodes whether content word j is selected.
y = var('y', len(content_vocab), bool)

In [67]:
# Check how many word variables were created and how the first one is declared.
len(y), y[0]

(2155, 0 <= y[0] <= 1 binary)

In [68]:
# Objective: the number of selected tweets plus the tf-idf score of every
# selected content word, to be maximized.
tweet_term = sum(x)
word_term = sum([tfidf_dict[word] * y[j] for j, word in enumerate(content_vocab)])
maximize(tweet_term + word_term);

In [69]:
## Constraint 1: the token counts of all selected tweets together
# must not exceed L (150 here)

L = 150
sum([x[i] * len(tokens) for i, tokens in enumerate(nltk_tweets)]) <= L;

In [70]:
#Constraint 2: a selected content word needs at least one selected tweet that contains it
def tweet_with_content_words(j):
    """Return the row indices of the tweets whose matrix entry for content word j is non-zero."""
    column = vectorizer.vocabulary_terms[content_vocab[j]]
    nonzero_positions = np.nonzero(np_matrix[:, column])
    # first element of the result holds the row (tweet) indices
    return nonzero_positions[0]

In [71]:
# Add one constraint per content word: the selected tweets containing it must
# number at least y[j].
for word_idx, word_var in enumerate(y):
    covering_tweets = [x[i] for i in tweet_with_content_words(word_idx)]
    sum(covering_tweets) >= word_var

In [72]:
#Constraint 3: selecting a tweet forces every content word it contains to be selected
def content_words(i):
    """Return, for each token of tweet i that is a content word, its index in content_vocab.

    Tokens are visited in tweet order and repeated tokens yield repeated indices.
    """
    return [
        content_vocab.index(token)
        for token in nltk_tweets[i]
        if token in content_vocab
    ]

In [73]:
# Add one constraint per tweet: if x[i] is selected, every content word found
# in that tweet must be selected too.
for i in range(len(x)):
    # Look the tweet's content words up once; each lookup scans content_vocab,
    # so calling content_words(i) twice per tweet doubled the work.
    word_indices = content_words(i)
    sum(y[j] for j in word_indices) >= len(word_indices) * x[i]

In [74]:
# Run the solver on the model built above; the recorded output is a status code and message.
solve()

(0,
 'The MIP problem instance has been successfully solved. (This code\ndoes {\\it not} necessarily mean that the solver has found optimal\nsolution. It only means that the solution process was successful.)')

In [75]:
# Read the solved value of every tweet variable and every word variable.
result_x = [tweet_var.primal for tweet_var in x]
result_y = [word_var.primal for word_var in y]

In [76]:
# Close the model; the recorded output shows 'COWTS' is no longer the default model.
end()

model('COWTS') is not the default model.

In [77]:
# Positions with a non-zero solved value, i.e. the selected tweets and words.
# np.nonzero returns a tuple, so the index array is element [0].
chosen_tweets = np.nonzero(result_x)
chosen_words = np.nonzero(result_y)
chosen_tweets

(array([  51,  239, 1028, 1488, 2447, 2523, 2584, 2654]),)

In [78]:
# Number of selected tweets and number of selected content words.
len(chosen_tweets[0]), len(chosen_words[0])

(8, 76)

In [79]:
# Print the selected tweets, each rebuilt from its tokens, with a separator line before it.
for tweet_idx in chosen_tweets[0]:
    summary_line = " ".join(nltk_tweets[tweet_idx])
    print ('--------------')
    print (summary_line)

--------------
KeralaFloods KeralaFloodRelief KeralaSOS KeralaFloods 2018 + 91 94470 40337 Chalakudy , Pariyaram , near st george church 19 people Kannampilly house 19 people 1st floor vare vellam kayari Time 2 Am
--------------
District Control Room Numbers KeralaFloods pic.twitter.com/cuEQKKyGG2
--------------
KeralaFloods Emergency Help numbers pic.twitter.com/jmWqGrYzQe
--------------
KeralaFloods donation.cmdrf.Kerala.gov.in VPA keralacmdrfsbi For Bank Counter Payment Beneficiary Name Principal Secretary ( Fin ) , Treasurer CMDRF Bank Name State Bank of India ( SBI ) Account Number 6731994823 2 Branch City Branch , Thiruvananthapuram IFSC SBIN 0070028
--------------
Suriya_offl donated 25lakhs to Kerala Government for Flood relief .. KeralaFloods KeralaFloodRelief
--------------
India Wants To More On NationalHerald Case , Why Hide From Media Rahul ji ! KeralaFloods BJP Modi Chennai Bangalore Mumbai Pune Kolkata 12YearsOfKANK Delhi TripleTalaq Jaipur Bhopal Guwahati Hindus Temple 