In [45]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Reading the data

In [17]:
# Reading the csv file into a dataframe
data = pd.read_csv('tweets_company.csv' ,encoding='latin-1')
data.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


As we want a model that can rate the sentiment of a tweet based on its content we will drop any coulmns that are not relevenat to this goal in this case being column 'emotion_in_tweet_is_directed_at'

In [18]:
# Let us keep the necessary columns
data =  data[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']]
data

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion
...,...,...
9088,Ipad everywhere. #SXSW {link},Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product


In [19]:
# These columns names are quite bulky so let us rename them
data.rename(columns = {'tweet_text' : 'text'}, inplace=True)
data.rename(columns = {'is_there_an_emotion_directed_at_a_brand_or_product' : 'sentiment'}, inplace=True)

In [20]:
data.head()

Unnamed: 0,text,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


Great! now our columns have simpler column names which makes it easier to work with.

In [21]:
data = data.copy()
data

Unnamed: 0,text,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion
...,...,...
9088,Ipad everywhere. #SXSW {link},Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product


To be able to work with column text we need to be able to clean it in that there are no characters eg.@,?,# and so we proceed to preprocessing the data that is column text which is the company's tweets. 

#### Data Preprocessing

Let us first check for missing values in text.

In [22]:
data['text'].isna().sum()

1

So the column has one missing value let us deal with it by dropping the row with missing data.

In [23]:
data.dropna(subset=['text'],inplace=True)

In [24]:
data['text'].isna().sum() # Great no more missing values let us continue

0

For the fact that we are dealing with text we will not check for duplicate values as it is more than evident to have duplicate values in text curated by humans via the internet on social media platforms such as Twitter.

Let us start by converting column text to lowercase to ensure consistency that all words upper and lower case are treated same.

In [25]:
data['text'] = data['text'].apply(lambda x: x.lower())

The data has special characters and punctuation that any model will have difficulty dealing with so we need to remove them as they also do not contribute meaning to the text.

In [26]:
data['text'] = data['text'].apply((lambda x: re.sub(r'[^a-zA-Z0-9]', '',x)))

In [27]:
data['text']

0       wesley83ihavea3giphoneafter3hrstweetingatrisea...
1       jessedeeknowaboutfludappawesomeipadiphoneappth...
2       swonderlincannotwaitforipad2alsotheyshouldsale...
3       sxswihopethisyearsfestivalisntascrashyasthisye...
4       sxtxstategreatstuffonfrisxswmarissamayergoogle...
                              ...                        
9088                               ipadeverywheresxswlink
9089    wavebuzzrtmentionweinterruptyourregularlysched...
9090    googleszeigeraphysicianneverreportedpotentiala...
9091    someverizoniphonecustomerscomplainedtheirtimef...
9092          rtmentiongoogletestscheckinoffersatsxswlink
Name: text, Length: 9092, dtype: object

Great! Now our text does not have special characters and punctuation thanks to regular expressions that allow us to specify the pattern we want the text we have to follow which is one with no special characters and punctuation.

Next step is to breakdown the cleaned text into individual words or tokens. This provides basis for further analysis.

In [36]:
tokens = data['text'].apply(word_tokenize)
tokens

0       [wesley83ihavea3giphoneafter3hrstweetingatrise...
1       [jessedeeknowaboutfludappawesomeipadiphoneappt...
2       [swonderlincannotwaitforipad2alsotheyshouldsal...
3       [sxswihopethisyearsfestivalisntascrashyasthisy...
4       [sxtxstategreatstuffonfrisxswmarissamayergoogl...
                              ...                        
9088                             [ipadeverywheresxswlink]
9089    [wavebuzzrtmentionweinterruptyourregularlysche...
9090    [googleszeigeraphysicianneverreportedpotential...
9091    [someverizoniphonecustomerscomplainedtheirtime...
9092        [rtmentiongoogletestscheckinoffersatsxswlink]
Name: text, Length: 9092, dtype: object

Each row now contains individual tokens/words separated by commas.

Next step is Stopword Removal. Stopwords are common words that do not carry much signifance in sentiment analysis such as 'the','is'.

Removal of stopwords helps reduce noise in the data and improve efficiency of the analysis.

In [41]:
stop_words = set(stopwords.words('english'))

filtered_text = [[token for token in token_list if token.lower() not in stop_words]
                 for token_list in tokens]
for token_list in filtered_text:
    print(token_list)


['wesley83ihavea3giphoneafter3hrstweetingatriseaustinitwasdeadineedtoupgradepluginstationsatsxsw']
['jessedeeknowaboutfludappawesomeipadiphoneappthatyoulllikelyappreciateforitsdesignalsotheyregivingfreetsatsxsw']
['swonderlincannotwaitforipad2alsotheyshouldsalethemdownatsxsw']
['sxswihopethisyearsfestivalisntascrashyasthisyearsiphoneappsxsw']
['sxtxstategreatstuffonfrisxswmarissamayergoogletimoreillytechbooksconferencesampmattmullenwegwordpress']
['teachntech00newipadappsforspeechtherapyandcommunicationareshowcasedatthesxswconferencehttphtly49n4miearedchatasd']
['sxswisjuststartingctiaisaroundthecornerandgoogleioisonlyahopskipandajumpfromtheregoodtimetobeanandroidfan']
['beautifullysmartandsimpleideartmadebymanythenextwebwroteaboutourhollergramipadappforsxswhttpbitlyieavob']
['countingdownthedaystosxswplusstrongcanadiandollarmeansstockuponapplegear']
['excitedtomeetthesamsungmobileusatsxswsoicanshowthemmysprintgalaxysstillrunningandroid21fail']
['findampstartimpromptupartiesatsxswwithh

Now we have a series with no stopwords! 

The first model I want to use is the Recurrent Neural Network which requires tokens to be a sequence of tokens to be able to be fed into an LSTM network which is in full (Long Short Term Memory Cells) which is a type of RNN specifically designed to overcome the Vanishing Gradient Problem in RNN.

In [54]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Padding
max_sequence_length = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [55]:
labels = np.array(data['sentiment'])