# Notebook for Sentiment Analysis

In [30]:
# NLTK functions
import nltk
from nltk.corpus import stopwords
from nltk import tokenize as tok
from nltk.stem.snowball import SnowballStemmer # load nltk's SnowballStemmer as variabled 'stemmer'
import lda # topic modeling -NMF & LDA
import string
from nltk.tag import StanfordNERTagger

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Tf-Idf and Clustering packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pandas as pd
import numpy as np

import re
import json

from datetime import date
from dateutil import parser

import keras
import h5py
from keras.models import model_from_json
from keras.models import load_model
from nltk.tokenize import RegexpTokenizer

Using TensorFlow backend.


## Import Tweets

In [31]:
# Read the scraped tweets and discard the index column that was written into the CSV
df = pd.read_csv('FebDebate#3.csv').drop(columns=['Unnamed: 0'])
# Force every tweet body to str so the regex / tokenizer steps never see a float
df['text'] = df['text'].astype(str)

### Check NaNs

In [32]:
# Row labels of the tweets whose text is the literal string 'nan'
# (presumably missing values stringified by the earlier astype(str) cast)
indexNames = df[df.text == 'nan'].index
 
# Delete these row indexes from dataFrame
df.drop(indexNames, inplace=True)

# Sanity check: number of 'nan' texts still present in the frame
(df.text == 'nan').sum()

0

### Check Data

In [33]:
# Non-null value count of every column, per search term
df.groupby('search_term').count()

Unnamed: 0_level_0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls
search_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
biden,9846,9846,9846,9846,9846,9846,9846,9846,1930,1093,0,2268
bloomberg,9975,9975,9975,9975,9975,9975,9975,9975,1324,1055,0,3011
buttigieg,9612,9612,9612,9612,9612,9612,9612,9612,2377,1257,0,2263
klobuchar,9897,9897,9897,9897,9897,9897,9897,9897,3323,1512,0,2614
sanders,9624,9624,9624,9624,9624,9624,9624,9624,1536,1074,0,1893
warren,9702,9702,9702,9702,9702,9702,9702,9702,1706,1146,0,1723


In [34]:
# Preview the first two rows
df.head(2)

Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls,search_term
0,1232455231251730432,1049108900618940416,No wonder he wants to be president!!,0,https://twitter.com/rebellious_yell/status/123...,2020-02-25 23:59:59+00:00,Tue Feb 25 23:59:59 +0000 2020,1,,,,,warren
1,1232455227548192769,489991648,@realDonaldTrump Mike Bloomberg’s New TV Ad: I...,0,https://twitter.com/jjsmaga7/status/1232455227...,2020-02-25 23:59:58+00:00,Tue Feb 25 23:59:58 +0000 2020,0,@realDonaldTrump @MikeBloomberg,,,,warren


In [35]:
# Column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58656 entries, 0 to 59993
Data columns (total 13 columns):
id                58656 non-null int64
author_id         58656 non-null int64
text              58656 non-null object
retweets          58656 non-null int64
permalink         58656 non-null object
date              58656 non-null object
formatted_date    58656 non-null object
favorites         58656 non-null int64
mentions          12196 non-null object
hashtags          7137 non-null object
geo               0 non-null float64
urls              13772 non-null object
search_term       58656 non-null object
dtypes: float64(1), int64(4), object(8)
memory usage: 6.3+ MB


### Adding day/month/year/hour as columns

In [36]:
# Parse each `formatted_date` string once, then split the resulting timestamps
# into their day / month / year / hour components
parsed_dates = [parser.parse(stamp) for stamp in df.formatted_date]
day = [moment.day for moment in parsed_dates]
month = [moment.month for moment in parsed_dates]
year = [moment.year for moment in parsed_dates]
hour = [moment.hour for moment in parsed_dates]

# Attach the components to the dataframe as new columns
df['day'] = day
df['month'] = month
df['year'] = year
df['hour'] = hour

In [37]:
#df.groupby(['day', 'search_term']).count()

### Functions for text data and cleaning

In [38]:
# Compile all regular expressions once, up front.
# NOTE: re.VERBOSE makes the regex engine ignore unescaped whitespace inside a
# pattern, so the blank after '://' in isURL is not part of what gets matched.
# isURL: 'http://' or 'https://' followed by a run of URL characters / %xx escapes
isURL = re.compile(r'http[s]?:// (?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', re.VERBOSE | re.IGNORECASE)
# isRTusername: retweet prefix anchored at the start of the text, e.g. 'RT @user:'
isRTusername = re.compile(r'^RT+[\s]+(@[\w_]+:)',re.VERBOSE | re.IGNORECASE)
# isEntity: any @mention ('@' followed by word characters)
isEntity = re.compile(r'@[\w_]+', re.VERBOSE | re.IGNORECASE)

def clean_tweet(row):
    """Return the tweet text with URLs, a leading retweet marker and @mentions removed.

    The three module-level patterns are applied in order (isURL, isRTusername,
    isEntity); every match is replaced by the empty string, so surrounding
    whitespace is left as-is.
    """
    for pattern in (isURL, isRTusername, isEntity):
        row = pattern.sub("", row)
    return row

def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens that contain at least one letter.

    The text is tokenized by sentence first and then by word, so punctuation
    ends up as its own token; tokens without any ASCII letter (numbers, raw
    punctuation) are then dropped.
    """
    lowered_tokens = (
        word.lower()
        for sentence in tok.sent_tokenize(text)
        for word in tok.word_tokenize(sentence)
    )
    # keep only tokens that contain a letter
    return [token for token in lowered_tokens if re.search('[a-zA-Z]', token)]

In [39]:
# remove urls, retweet markers and @mentions from the text
df['text_clean'] = df['text'].apply(clean_tweet)

# remove punctuation
# RE_PUNCTUATION is an alternation of every (escaped) ASCII punctuation character.
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])
# regex=True must be passed explicitly: the default of Series.str.replace became
# literal matching in pandas 2.0, which would make this step silently do nothing.
df['text_clean'] = df['text_clean'].str.replace(RE_PUNCTUATION, "", regex=True)
df.head(2)

Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls,search_term,day,month,year,hour,text_clean
0,1232455231251730432,1049108900618940416,No wonder he wants to be president!!,0,https://twitter.com/rebellious_yell/status/123...,2020-02-25 23:59:59+00:00,Tue Feb 25 23:59:59 +0000 2020,1,,,,,warren,25,2,2020,23,No wonder he wants to be president
1,1232455227548192769,489991648,@realDonaldTrump Mike Bloomberg’s New TV Ad: I...,0,https://twitter.com/jjsmaga7/status/1232455227...,2020-02-25 23:59:58+00:00,Tue Feb 25 23:59:58 +0000 2020,0,@realDonaldTrump @MikeBloomberg,,,,warren,25,2,2020,23,Mike Bloomberg’s New TV Ad I Will Get It Done...


### Load Pre Trained Neural Net

In [40]:
# Load the trained Keras model and print its layer summary
weight_path = '../SentimentAnalysis/model/best_model.hdf5'
prd_model = load_model(weight_path)
prd_model.summary()
# Load the word -> integer id vocabulary; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open("../SentimentAnalysis/Data/word_idx.txt") as word_idx_file:
    word_idx = json.load(word_idx_file)



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 56, 100)           40000100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_1 (Dense)              (None, 512)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                5130      
Total params: 40,371,310
Trainable params: 371,210
Non-trainable params: 40,000,100
_________________________________________________________________


### Function to get sentiment score using trained weights from NN

In [41]:
def get_sentiment_DL(prd_model, text_data, word_idx, max_len=56):
    r"""Score every row of ``text_data`` with the pre-trained network.

    Each ``text`` value is split into ``\w+`` tokens, lower-cased and mapped to
    an integer id through ``word_idx`` (words missing from ``word_idx`` map to
    0). The id sequence is truncated / right-padded with zeros to ``max_len``
    and the whole batch is passed to ``prd_model.predict``. For each row the
    three highest-scoring output classes are reduced to a single number: their
    class indices are averaged with weights proportional to their scores,
    divided by 10 and rounded to 2 decimals.

    Parameters
    ----------
    prd_model : object
        Anything exposing ``predict(x, batch_size=..., verbose=...)`` that
        returns one score vector per input row (here: the loaded Keras model).
    text_data : pandas.DataFrame
        Must contain a ``'text'`` column of strings. Any index is accepted.
    word_idx : dict
        Maps a lower-cased word to its integer id.
    max_len : int, default 56
        Length every id sequence is padded / truncated to. Must equal the
        sequence length the model expects.

    Returns
    -------
    pandas.DataFrame
        The same ``text_data`` object, modified in place, with a new
        ``'Sentiment_Score'`` column.
    """
    texts = text_data['text']
    batchSize = len(texts)
    if batchSize == 0:
        # Nothing to predict; still add the column so the result has the usual schema.
        text_data['Sentiment_Score'] = np.array([], dtype=float)
        return text_data

    # Equivalent to nltk's RegexpTokenizer(r'\w+'): keeps runs of word
    # characters and thereby drops punctuation. Compiled once for all rows.
    word_pattern = re.compile(r'\w+')

    # One zero-initialised row per tweet; zeros double as padding / unknown-word id.
    live_list_np = np.zeros((batchSize, max_len), dtype=int)
    for position, text_data_sample in enumerate(texts):
        data_index = [
            word_idx.get(word.lower(), 0)
            for word in word_pattern.findall(text_data_sample)
        ][:max_len]
        if data_index:
            live_list_np[position, :len(data_index)] = data_index

    # The whole frame is sent as a single batch, as before.
    score = prd_model.predict(live_list_np, batch_size=batchSize, verbose=0)

    score_all = []
    for each_score in score:
        top_3_index = np.argsort(each_score)[-3:]
        top_3_scores = each_score[top_3_index]
        top_3_weights = top_3_scores / np.sum(top_3_scores)
        # NOTE(review): this weights the 0-based class *indices*, not class
        # labels starting at 1, so with 10 output classes the score can never
        # exceed 0.9. Left unchanged to keep scores comparable with earlier
        # runs -- confirm whether labels were intended.
        single_score_dot = np.round(np.dot(top_3_index, top_3_weights) / 10, decimals=2)
        score_all.append(single_score_dot)

    # Assign positionally. Assigning a DataFrame here would align on index
    # labels and scatter / drop scores for any frame not indexed 0..n-1.
    text_data['Sentiment_Score'] = score_all

    return text_data

### Data Clean after Sentiment Included

In [68]:
# Renumber rows 0..n-1 after the earlier row drops; the old labels are kept in a new 'index' column
text_data = df.reset_index()

In [69]:
# Preview the first rows of the frame that will be scored
text_data.head()

Unnamed: 0,index,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,urls,search_term,day,month,year,hour,text_clean
0,0,1207812848254115847,1055637768485908485,"Yeah, @JoeBiden?!?!?!?! #DemDebate",0,https://twitter.com/NinesMarie/status/12078128...,2019-12-19 23:59:57+00:00,Thu Dec 19 23:59:57 +0000 2019,0,@JoeBiden,#DemDebate,https://twitter.com/mcpli/status/1207763584907...,biden,19,12,2019,23,Yeah DemDebate
1,1,1207812833414647808,1189920349082783745,Stay positive President Trump. We the people b...,0,https://twitter.com/LloydPe73431132/status/120...,2019-12-19 23:59:54+00:00,Thu Dec 19 23:59:54 +0000 2019,0,,,,biden,19,12,2019,23,Stay positive President Trump We the people be...
2,2,1207812831887818752,223017524,Just have someone in the audience call Biden o...,0,https://twitter.com/bradakibasama/status/12078...,2019-12-19 23:59:53+00:00,Thu Dec 19 23:59:53 +0000 2019,0,,,,biden,19,12,2019,23,Just have someone in the audience call Biden o...
3,3,1207812820081029127,1908816246,How many people of color did your 1994 crimina...,0,https://twitter.com/joeraddi/status/1207812820...,2019-12-19 23:59:51+00:00,Thu Dec 19 23:59:51 +0000 2019,0,,,,biden,19,12,2019,23,How many people of color did your 1994 crimina...
4,4,1207812804209717248,940743747612172288,Which administration built the “cages???”,0,https://twitter.com/KenJ91572854/status/120781...,2019-12-19 23:59:47+00:00,Thu Dec 19 23:59:47 +0000 2019,6,,,,biden,19,12,2019,23,Which administration built the “cages”


In [None]:
# Run the sentiment model over every tweet. get_sentiment_DL adds a
# 'Sentiment_Score' column to text_data itself and returns that same frame,
# so text_datat_out and text_data refer to one object.
text_datat_out = get_sentiment_DL(prd_model, text_data, word_idx)

In [62]:
# Order tweets from lowest to highest sentiment score, keeping only the columns needed for inspection
test = text_datat_out.sort_values(by='Sentiment_Score')[['text', 'search_term', 'Sentiment_Score', 'retweets']]

In [63]:
# Renumber rows in sorted order; the previous row labels move into an 'index' column
test = test.reset_index()

In [67]:
# Full text of the row labelled 0, i.e. the lowest-scoring tweet after the ascending sort
test.text[0]

'Am I really going to get my insomniac ass out of my lovely bed to find stupid CNN on my telly and watch the Democratic debate just to take the piss out of Biden? Yeap!'

In [66]:
# First 1000 rows of the score-sorted frame (the lowest scores come first)
test.head(1000)

Unnamed: 0,index,text,search_term,Sentiment_Score,retweets
0,405,Am I really going to get my insomniac ass out ...,biden,0.05,0
1,871,Obama is dirty as the day is long. Why does no...,biden,0.06,0
2,205,Can you get me a job like you got hunter in Uk...,biden,0.06,1
3,120,"ACA is crap, good riddance to bad garbage.",biden,0.07,0
4,603,Oh please. Remember Jared and Ivanka who made ...,biden,0.07,0
...,...,...,...,...,...
995,576,Hey Joe we wanted to keep our Doctor and we co...,biden,0.71,0
996,387,Trump can’t afford to wait til Biden is the no...,biden,0.71,0
997,874,That trailer looks amazing! I have seen many t...,biden,0.72,0
998,638,Wow I can’t believe you tweeted me! Thank you ...,biden,0.72,0
