In [1]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag.stanford import StanfordNERTagger
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from  textacy.vsm import Vectorizer

from tqdm import *
from pprint import pprint

from pymprog import *

Getting all the tweets from the twitter api and then saving it here

In [2]:
tweets = pd.read_csv('./tweet.csv',encoding='ISO-8859-1')

## Data Preprocessing

In [3]:
tweets.dropna(axis=0, subset=['retweets'],inplace=True)
tweets.head()

Unnamed: 0,date,username,retweets,text,mentions,hashtags
0,6/5/2019 3:26,mchellap,1,The western ghats policy that wasn't implement...,,#Keralafloods
1,6/4/2019 16:10,SRKKeralaFC,30,Schools are gonna open this week all over Kera...,@SRKCHENNAIFCpic,#KeralaFloods
2,6/4/2019 14:55,NewIndianXpress,2,Local self-government institutions and governm...,,#Wayanad #KeralaFloods
3,6/3/2019 6:55,JustOutNews,0,Govt to construct four new dams in Kerala; aim...,@CPIMKerala @keralagovernment,#kerala #keralafloods #StateNews #CurrentUpdat...
4,5/29/2019 16:22,Alonzo10541251,0,Thubten Chodron speaks against Dagri Rinpoche....,,#AwardWapsiExposed #SonOfTadipar #instagood #l...


Extracting text from the tweets dataframe

Removing URLs, Removing @..., and the hashtags

In [4]:
# Building the corpus
tweet_text = []
tweets.text = tweets.text.apply(lambda x: re.sub(u'https:\S+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'http:\S+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'(\s)@\w+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'#', u'', x))
for text in tweets.text:
    tweet_text.append(text)

## Tokenizing with nltk

In [5]:
tknzr = TweetTokenizer()

nltk_tweets = []
for text in tweets.text:
    nltk_tweets.append(tknzr.tokenize(text))
nltk_tweets[-68]

['Palakkad',
 'Thala',
 'Ajith',
 'fans',
 'Kodumbu',
 'unit',
 'provided',
 'relief',
 'materials',
 'to',
 'those',
 'who',
 'are',
 'affected',
 'by',
 'KeralaFloods',
 'pic.twitter.com/3sVnudB6Wt']

In [6]:
#nltk.download()

Using POS tagger to get the array of various part of speech in the tweet

In [7]:
nltk_pos = []

for text in nltk_tweets:
    nltk_pos.append(pos_tag(text))
pprint(nltk_pos[-68])
#print(ne_chunk(nltk_pos[-68]))

[('Palakkad', 'NNP'),
 ('Thala', 'NNP'),
 ('Ajith', 'NNP'),
 ('fans', 'NNS'),
 ('Kodumbu', 'NNP'),
 ('unit', 'NN'),
 ('provided', 'VBD'),
 ('relief', 'NN'),
 ('materials', 'NNS'),
 ('to', 'TO'),
 ('those', 'DT'),
 ('who', 'WP'),
 ('are', 'VBP'),
 ('affected', 'VBN'),
 ('by', 'IN'),
 ('KeralaFloods', 'NNP'),
 ('pic.twitter.com/3sVnudB6Wt', 'NN')]


Tried Named entity recognition using NLTK but not accurate

In [8]:
#pattern = 'NP: {<DT>?<JJ>*<NN>}'
#cp = nltk.RegexpParser(pattern)
#cs = cp.parse(nltk_pos[-68])
#print(cs)

In [9]:
#iob_tagged= tree2conlltags(cs)
#pprint(iob_tagged)

Now using Stanford Natural Processing!!
First, we will set the config_java file for nltk

In [10]:
nltk.internals.config_java("/usr/lib/jvm/java-11-openjdk-amd64/bin/java")
st = StanfordNERTagger('/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
           '/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/stanford-ner.jar', encoding='utf-8')

In [11]:
nltk_ents = []
for tweet in tqdm(nltk_tweets):
    entity_tagged_tweet = st.tag(tweet)
    nltk_ents.append([tag for tag in entity_tagged_tweet if tag[1] != 'O'])

100%|██████████| 1470/1470 [40:27<00:00,  1.64s/it]


The Standford Named Entity Recognition library labels the text in the tweets, particularly into 3 classes (PERSON, ORGANIZATION, LOCATION).<br>
As, numerals will also be significant in the tweets we will concatenate it to the entity text. Hence, from the text we will take care about the entities and numbers.<br>
I will name these array content_tweets

Again, entities that are labelled as PERSON tend to be related more to feelings of the person, hence I will remove them as well.

In [12]:
content_tweets = []
for pos_tweet, tweet_entity in zip(nltk_pos, nltk_ents):
    # starting by appending all of the entities
    tweet_content = [word[0] for word in tweet_entity if word[1] != 'PERSON']
    
    # next by appending all of the numerals
    for token in pos_tweet:
         if token[1] == u'CD':
            tweet_content.append(token[0])
    content_tweets.append(tweet_content)
for i in range(len(content_tweets)):
    print(i, content_tweets[i])

0 []
1 ['Kerala']
2 ['Wayanad']
3 ['Kerala', 'four']
4 ['2']
5 ['Pandhalam', 'Municipality', 'Kerala', 'Ward']
6 ['Kerala', 'Care', 'Today', 'Pandhalam', 'Pathanamthitta']
7 ['ActionAid', 'Pandhalam', 'Municipality', 'Patinamthitta', 'Kerala']
8 ['1']
9 []
10 ['Kerala', 'State', 'Television']
11 ['Wayanad']
12 ['Kerala', 'Pampa']
13 ['Kerala', '1', '1']
14 []
15 ['2']
16 ['Pandhalam', 'Municipality', 'Patinamthitta', 'Kerala']
17 []
18 ['Sabarimala', 'BJP', 'Kerala']
19 ['Kerala', 'Sabarimala', '2018']
20 ['Mumbai', 'India']
21 ['4', '2', '2']
22 []
23 ['2']
24 ['Kerala', '1']
25 ['Kerala', 'UAE']
26 []
27 ['Geneva', 'Chekutty']
28 []
29 ['Sabarimala']
30 ['Kerala', 'NDA', 'Kerala', '2019', '13']
31 ['Optume', 'Houselifting', 'Company', 'Kerala', '100']
32 ['Lama', 'Zopa', '&', 'FPMT', '2']
33 ['2']
34 ['Kerala']
35 []
36 ['Kerala', '244', '65', '10']
37 ['two', '2018', '1', '2']
38 ['BJP', 'Sabarimala']
39 ['Kerala']
40 ['Amicus', 'Curiae', 'Kerala', 'Kerala', 'Govt', '2018']
41 ['UDF

# Getting the tf-idf score

Now, we will take out tl-idf score for the tweet that will determine how much the word present in the tweet is importants.<br>
So, I will take out the tl-idf score of all of the nltk_tweets

I care about the tf-idf scores of the entire tweet, so will tf-idf score across the entire corpus of original tweets.

In [13]:
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')
term_matrix = vectorizer.fit_transform(nltk_tweets)
term_matrix.shape

(1470, 8513)

The term_matrix is a document-term matrix of shape (#document, #unique terms).<br>
Each row is a document and each column is a  unique word.<br>
The values of the matrix is the tf-idf score of the particular unique word in the column

In [14]:
np_matrix = term_matrix.todense()
np_matrix.shape
print(np_matrix.ndim)
np.max(np_matrix[:,527])

2


13.003876502746765

The goal is to make a dictionary, which map from tokens in the content_tweets to some tf-idf score.<br>
Each word has a unique tf-idf value<br>
In simple words, we will find the column number of each word from the content_tweets in the document-term matrix, as now the vectorizer model is trained, for this we will use vectorizer.vocablury_terms[word]<br>
Each column is a distinct word

In [15]:
for key in sorted(vectorizer.vocabulary_terms)[527:715]:
    print(key, vectorizer.vocabulary_terms[key])

Adoor 527
Advanced 528
Advertisement 529
Aerial 530
Affairs 531
Affected 532
Affecters 533
Afridi 534
After 535
Again 536
Agenda 537
Aggregated 538
Agreed 539
Agriculture 540
Ahmadi 541
Aid 542
Aina 543
Air 544
Airbase 545
Airbus 546
Airdropping 547
Airforce 548
Airforce..twitter.com/lTraR3JQvH 549
Airlift 550
Airlifting 551
Airport 552
Airticket 553
Ajith 554
Akhil 555
Akhila's 556
Akshay 557
Akshaya 558
Al 559
Alappuzha 560
Alappuzha's 561
Aleppey 562
Alert 563
Ali 564
All 565
AllIndiaSFC 566
Allah 567
Allah's 568
Allapphuzha 569
Allappujha 570
Alleppey 571
Alliance 572
AlluArjun 573
Almighty 574
Almost 575
Alphons 576
Alphonse 577
Already 578
Also 579
Although 580
Aluva 581
Aluva.twitter.com/KW0mw36eCu 582
Alwaye 583
Always 584
Am 585
Aman 586
Amaravila 587
Amarinder 588
Amarpreet 589
Ambani 590
Ambassador 591
Ambayathode 592
America 593
Amicus 594
Amidst 595
Amir 596
Amit 597
Amita 598
Amma 599
Among 600
Amount 601
Amrita 602
An 603
And 604
Andhra 605
AndhraPradesh 606
Andimukku 60

Now to get the tf-idf score of all entities related to a specific content_tweet, we will use np.max function <br>
We will taken all the words from a specific content tweet and then use the vectorizer.vocabulary_terms to get the column-number of the word<br>
Then, we use np.max to take out the maximum value in the entire column for the word and that will be its tf-idf score

In [16]:
for token in content_tweets[1190]:
    print(token, vectorizer.vocabulary_terms[token], np.max(np_matrix[:,vectorizer.vocabulary_terms[token]]))

Adoor 527 13.003876502746765
Kerala 1831 5.553472662399717
2000 175 6.096473143265218
8122809298 392 7.195085431933328
8907471707 404 7.195085431933328
9000620380 411 7.195085431933328


## We will now go through all of the content_tweets to get the dictionary

In [17]:
tfidf_dict = {}
content_vocab = []
for tweet in content_tweets: 
    for token in tweet: 
        if token not in tfidf_dict: 
            if token in vectorizer.vocabulary_terms:
                content_vocab.append(token)
                tfidf_dict[token] = np.max(np_matrix[:,vectorizer.vocabulary_terms[token]])
print(len(tfidf_dict))

1211


In [18]:
for key in sorted(tfidf_dict)[900:911]:
    print ("WORD:" + str(key) + " -- tf-idf SCORE:" +  str(tfidf_dict[key]))

WORD:North -- tf-idf SCORE:6.214256178921602
WORD:Numbers -- tf-idf SCORE:6.6842598081673374
WORD:OR -- tf-idf SCORE:6.907403359481547
WORD:Odisha -- tf-idf SCORE:11.982225255214784
WORD:Of -- tf-idf SCORE:10.698517482869994
WORD:Officers -- tf-idf SCORE:7.195085431933328
WORD:Oman -- tf-idf SCORE:7.6005505400414926
WORD:Onam -- tf-idf SCORE:6.3477875715461245
WORD:One -- tf-idf SCORE:11.171295038998455
WORD:Online -- tf-idf SCORE:6.6842598081673374
WORD:Operation -- tf-idf SCORE:13.814806718963094


## Content Word-based Tweet Summarization

To generate something that woul be more useful to people and other volunteers, It has to be something with content-words with high tf-idf scores.<br>
This can be done by using Integer Linear Programming(ILP) where we will maximize an equation given some constraints.<br>
Equation: Maximize the total score of content words in my summary<br>
\begin{equation}
\sum_{i=1}^n x_{i} + \sum_{j = 1}^{m} Score(j) \cdot y_{j}
\end{equation}
where, $x_{i}$ is 1 if the tweet is selected or 0 if the tweet is not selected, where $y_{j}$ is 1 or 0 if each content word is included (and Score(j) is that word's tf-idf score).<br>
\begin{equation}
\sum_{i=1}^n x_{i}.Length(i) <= L
\end{equation}
Constraint 1: The total length of all the selected tweets to be less than some value L, which will be the length of my summary, L. I can vary L depending on how long I want my summary to be. 
<br>
\begin{equation}
\sum_{i \in T_{j}} x_{i} \geq y_{j}, j = [1,...,m]
\end{equation}
Contraint 2:If I pick some content word $y_{j}$ (out of my $m$ possible content words) , then I want to have at least one tweet from the set of tweets which contain that content word, $T_{j}$.<br>
\begin{equation}
\sum_{j \in C_{i}} y_{j} \leq |C_{i}| \times x_{i}, i = [1,...,n]
\end{equation}
Constraint 3: If I pick a tweet in my summary, then all the content words in that tweet should be present in the summary<br>

Variables that the equation depend on are integers, 1 if the word is included and 0, if it is not

In [62]:
begin('COWTS')

model('COWTS') is the default model.

var function is used to create variables,  

In [63]:
# Defining the first variable x,
# This definies whether or not a tweet is selected
x = var('x', len(nltk_tweets), bool)
x[1000]

0 <= x[1000] <= 1 binary

In [64]:
# Defining the second variable y,
# This defines whether or not a content word is selected
y = var('y', len(content_vocab), bool)

In [65]:
len(y), y[0]

(1211, 0 <= y[0] <= 1 binary)

In [66]:
# Defining the equation that needs to be maximized
maximize(sum(x) + sum([tfidf_dict[content_vocab[j]]*y[j] for j in range(len(y))]));

In [67]:
## Constraint 1: Maximum length of entire tweet summary
# should be less than or equal to 150

L = 150
sum([x[i] * len(nltk_tweets[i]) for i in range(len(x))]) <= L;

In [68]:
#Constraint 2: If I pick a content word then I have to pick a tweet that contains the content word
def tweet_with_content_words(j):
    content_word = content_vocab[j]
    index_term_matrix = vectorizer.vocabulary_terms[content_word]
    matrix = np_matrix[:, index_term_matrix]
    
    return np.nonzero(matrix)[0]

In [69]:
for j in range(len(y)):
    sum([x[i] for i in tweet_with_content_words(j)]) >= y[j]

In [70]:
#Constraint 3: If i pick a tweet, then all of the content words from the tweet must be selected
def content_words(i):
    tweet = nltk_tweets[i]
    content_indices = []
    
    for token in tweet:
        if token in content_vocab:
            content_indices.append(content_vocab.index(token))
    
    return content_indices

In [71]:
for i in range(len(x)):
    sum(y[j] for j in content_words(i)) >= len(content_words(i)) * x[i]

In [72]:
solve()

(0,
 'The MIP problem instance has been successfully solved. (This code\ndoes {\\it not} necessarily mean that the solver has found optimal\nsolution. It only means that the solution process was successful.)')

In [73]:
result_x =  [value.primal for value in x]
result_y = [value.primal for value in y]

In [74]:
end()

model('COWTS') is not the default model.

In [76]:
chosen_tweets = np.nonzero(result_x)
chosen_words = np.nonzero(result_y)
chosen_tweets

(array([ 170,  328,  590,  603,  685,  733,  869, 1090, 1366]),)

In [77]:
len(chosen_tweets[0]), len(chosen_words[0])

(9, 78)

In [78]:
for i in chosen_tweets[0]:
    print ('--------------')
    print (" ".join(nltk_tweets[i]))

--------------
Congress President Rahul Gandhi to visit Kerala on August 28 KeralaFloods pic.twitter.com/IM8plyvgnp
--------------
Modi Blocked UAE and UN Aid to Kerala ... Double National Disaster for Kerala Keralafloods RebuildKerala KeralaFloodRelief trollsanghpic.twitter.com/71goYFnSsV
--------------
Helpings Provided By " STUDENTS WING " Trivandrum District Committee Thalapathy Vijay Fans For People Affected By Flood . KeralaFloodRelief KeralaFloods DoForKerala KeralaReliefFundpic.twitter.com/vkTXKZqosF
--------------
â  ¢ Our SFC Brothers Doing Rescue Works KeralaFloods KeralaFloodRescuepic.twitter.com/bC03144tp3
--------------
Details of rescue and relief granted by Central Government to Kerala.Released by Ministry of Home Affairs through Press Information Bureau . KeralaFloods IndiaForKeralapic.twitter.com/AB120WPvVE
--------------
Visuals of rescue operations conducted by Indian Navy in flood-affected Kochi . KeralaFloods ( 18.8 . 18 ) pic.twitter.com/5stGlJ3Dkj
-------------