In [78]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag.stanford import StanfordNERTagger
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from  textacy.vsm import Vectorizer

from tqdm import *
from pprint import pprint

The tweets were fetched from the Twitter API and saved to a CSV file, which is loaded here.

In [82]:
# Load the saved tweets from disk, decoding the CSV as ISO-8859-1.
tweets = pd.read_csv(
    './tweet.csv',
    encoding='ISO-8859-1',
)

## Data Preprocessing

In [4]:
# Drop every row that has no value in the `retweets` column.
# Rebinding the name (instead of `inplace=True`) keeps the cell an explicit
# "input -> output" step and avoids mutating a frame behind the reader's back.
tweets = tweets.dropna(axis=0, subset=['retweets'])
tweets.head()

Unnamed: 0,date,username,retweets,text,mentions,hashtags
0,6/5/2019 3:26,mchellap,1,The western ghats policy that wasn't implement...,,#Keralafloods
1,6/4/2019 16:10,SRKKeralaFC,30,Schools are gonna open this week all over Kera...,@SRKCHENNAIFCpic,#KeralaFloods
2,6/4/2019 14:55,NewIndianXpress,2,Local self-government institutions and governm...,,#Wayanad #KeralaFloods
3,6/3/2019 6:55,JustOutNews,0,Govt to construct four new dams in Kerala; aim...,@CPIMKerala @keralagovernment,#kerala #keralafloods #StateNews #CurrentUpdat...
4,5/29/2019 16:22,Alonzo10541251,0,Thubten Chodron speaks against Dagri Rinpoche....,,#AwardWapsiExposed #SonOfTadipar #instagood #l...


Extracting text from the tweets dataframe

Removing URLs, @mentions, and the `#` symbol of the hashtags

In [73]:
# Building the corpus
# Raw-string patterns: `\S`, `\s` and `\w` are regex escapes, not Python string
# escapes, so they must live in r'...' literals to avoid invalid-escape warnings.
URL_RE = re.compile(r'https?:\S+')
# A mention is removed together with the whitespace in front of it; `^` also
# catches a mention that is the very first thing in the tweet.
MENTION_RE = re.compile(r'(?:^|\s)@\w+')


def clean_tweet(text):
    """Return `text` without URLs, @mentions and '#' characters.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with every `http:`/`https:` URL and every @mention removed,
        and with '#' stripped (the hashtag word itself is kept).
    """
    text = URL_RE.sub('', text)
    text = MENTION_RE.sub('', text)
    return text.replace('#', '')


# NOTE(review): scheme-less links such as `pic.twitter.com/...` are not matched
# by URL_RE and therefore stay in the text.
tweets.text = tweets.text.apply(clean_tweet)

# Plain Python list of the cleaned tweet strings.
tweet_text = tweets.text.tolist()

## Tokenizing with nltk

In [46]:
# Split every tweet into a list of tokens with NLTK's TweetTokenizer.
tknzr = TweetTokenizer()
nltk_tweets = [tknzr.tokenize(tweet) for tweet in tweets.text]

# Show one tokenized tweet as a quick sanity check.
nltk_tweets[-68]

['Palakkad',
 'Thala',
 'Ajith',
 'fans',
 'Kodumbu',
 'unit',
 'provided',
 'relief',
 'materials',
 'to',
 'those',
 'who',
 'are',
 'affected',
 'by',
 'KeralaFloods',
 'pic.twitter.com/3sVnudB6Wt']

In [7]:
#nltk.download()

Using the POS tagger to get the part-of-speech tag of every token in each tweet

In [47]:
# Tag the tokens of every tweet with their part of speech; each entry of
# `nltk_pos` is the list of (token, tag) pairs for one tweet.
nltk_pos = [pos_tag(tokens) for tokens in nltk_tweets]

# Pretty-print the tags of one tweet as a sanity check.
pprint(nltk_pos[-68])
#print(ne_chunk(nltk_pos[-68]))

[('Palakkad', 'NNP'),
 ('Thala', 'NNP'),
 ('Ajith', 'NNP'),
 ('fans', 'NNS'),
 ('Kodumbu', 'NNP'),
 ('unit', 'NN'),
 ('provided', 'VBD'),
 ('relief', 'NN'),
 ('materials', 'NNS'),
 ('to', 'TO'),
 ('those', 'DT'),
 ('who', 'WP'),
 ('are', 'VBP'),
 ('affected', 'VBN'),
 ('by', 'IN'),
 ('KeralaFloods', 'NNP'),
 ('pic.twitter.com/3sVnudB6Wt', 'NN')]


Tried named entity recognition using NLTK, but it was not accurate

In [9]:
#pattern = 'NP: {<DT>?<JJ>*<NN>}'
#cp = nltk.RegexpParser(pattern)
#cs = cp.parse(nltk_pos[-68])
#print(cs)

In [10]:
#iob_tagged= tree2conlltags(cs)
#pprint(iob_tagged)

Now using Stanford Natural Language Processing (the Stanford NER tagger)!!
First, we will point nltk to the Java executable with `config_java`

In [11]:
# Locations of the Java executable and of the Stanford NER model and jar.
# NOTE(review): these are absolute, machine-specific paths; adjust them locally.
JAVA_BIN = "/usr/lib/jvm/java-11-openjdk-amd64/bin/java"
NER_MODEL = '/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
NER_JAR = '/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/stanford-ner.jar'

# Tell nltk which Java binary to use, then build the tagger.
nltk.internals.config_java(JAVA_BIN)
st = StanfordNERTagger(NER_MODEL, NER_JAR, encoding='utf-8')

In [17]:
# Tag every tokenized tweet with the Stanford NER tagger and keep only the
# (token, label) pairs whose label is not 'O', i.e. tokens that were assigned
# an entity class.
# This cell has to run: the following cell zips `nltk_pos` with `nltk_ents`,
# so leaving it commented out raises a NameError on a fresh kernel.
nltk_ents = []
for tweet in tqdm(nltk_tweets):
    entity_tagged_tweet = st.tag(tweet)
    nltk_ents.append([tag for tag in entity_tagged_tweet if tag[1] != 'O'])

The Stanford Named Entity Recognition library labels the text in the tweets with one of 3 classes (PERSON, ORGANIZATION, LOCATION).<br>
As numerals will also be significant in the tweets, we will append them to the entity text. Hence, from the text we will keep only the entities and the numbers.<br>
I will name this array content_tweets

Again, entities that are labelled as PERSON tend to relate more to the feelings of a person, so I will remove them as well.

In [49]:
# For every tweet, collect its "content" words: the recognised entities
# (minus PERSON ones) followed by the numerals found by the POS tagger.
content_tweets = []
for tagged_tokens, entities in zip(nltk_pos, nltk_ents):
    # entity words first, skipping anything labelled PERSON
    kept_entities = [word for word, label in entities if label != 'PERSON']

    # then every token whose POS tag is CD
    numerals = [token for token, pos in tagged_tokens if pos == u'CD']

    content_tweets.append(kept_entities + numerals)

# Getting the tf-idf score

Now, we will compute the tf-idf score for the tweets, which determines how important each word present in a tweet is.<br>
So, I will compute the tf-idf score of all of the tweets (`tweet_text`)

In [84]:
# `vectorizer` must be defined in this cell; otherwise the call below only
# works when a stale kernel still holds one. scikit-learn's TfidfVectorizer
# (imported at the top of the notebook) takes raw strings, which is what
# `tweet_text` contains.
vectorizer = TfidfVectorizer()

# Rows are tweets, columns are vocabulary terms, values are tf-idf weights.
text_matrix = vectorizer.fit_transform(tweet_text)
text_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])