In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('./dataset/Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
data = data[['airline_sentiment', 'text']]

In [5]:
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [6]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [7]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [8]:
data_p = data[data.airline_sentiment == 'positive']   # positive comments
data_n = data[data.airline_sentiment == 'negative']   # negative comments

In [9]:
data_n = data_n.iloc[: len(data_p)]

In [10]:
len(data_n), len(data_p)

(2363, 2363)

In [11]:
data = pd.concat([data_p, data_n])

In [12]:
data.head()

Unnamed: 0,airline_sentiment,text
1,positive,@VirginAmerica plus you've added commercials t...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
8,positive,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,positive,"@VirginAmerica it was amazing, and arrived an ..."
11,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...


In [13]:
# shuffle
data = data.sample(len(data))

In [14]:
data.head()

Unnamed: 0,airline_sentiment,text
11604,positive,@USAirways Thank you!
5968,positive,@SouthwestAir thank you!!! #bringbacktheluvtor...
14000,positive,"@AmericanAir Thanks for the reply, but a funct..."
6502,positive,@SouthwestAir About time...and just in time fo...
1322,negative,@united why do I check in online if I still ha...


In [15]:
data['review'] = (data.airline_sentiment == 'positive').astype('int')

In [16]:
data.head()

Unnamed: 0,airline_sentiment,text,review
11604,positive,@USAirways Thank you!,1
5968,positive,@SouthwestAir thank you!!! #bringbacktheluvtor...,1
14000,positive,"@AmericanAir Thanks for the reply, but a funct...",1
6502,positive,@SouthwestAir About time...and just in time fo...,1
1322,negative,@united why do I check in online if I still ha...,0


In [17]:
del data['airline_sentiment']

In [18]:
data.head()

Unnamed: 0,text,review
11604,@USAirways Thank you!,1
5968,@SouthwestAir thank you!!! #bringbacktheluvtor...,1
14000,"@AmericanAir Thanks for the reply, but a funct...",1
6502,@SouthwestAir About time...and just in time fo...,1
1322,@united why do I check in online if I still ha...,0


In [19]:
# vectorize the text

In [20]:
import re

In [21]:
token = re.compile('[A-Za-z]+|[!?,.()]')
def reg_text(text):
    re_text = token.findall(text)
    lower_text = [word.lower() for word in re_text]
    return lower_text

In [22]:
data['text'] = data.text.apply(reg_text)

In [23]:
data.head()

Unnamed: 0,text,review
11604,"[usairways, thank, you, !]",1
5968,"[southwestair, thank, you, !, !, !, bringbackt...",1
14000,"[americanair, thanks, for, the, reply, ,, but,...",1
6502,"[southwestair, about, time, ., ., ., and, just...",1
1322,"[united, why, do, i, check, in, online, if, i,...",0


In [24]:
word_set = set()
for text in data.text:
    for word in text: 
        word_set.add(word)

In [25]:
# need another 1 position for padding number refers to <UNK> and padding number itself
word_size = len(word_set) + 1

In [26]:
word_list = list(word_set)

In [27]:
# 0 for padding
word_dict = dict((word, word_list.index(word) + 1) for word in word_list)

In [28]:
data_to_index = data.text.apply(lambda x: [word_dict.get(word, 0) for word in x])

In [29]:
# data.review.values
data_to_index.values

array([list([5245, 2035, 4834, 2591]),
       list([2141, 2035, 4834, 2591, 2591, 2591, 6918, 5069, 2280]),
       list([5452, 101, 6997, 2043, 3654, 2786, 1635, 2471, 1801, 3640, 5164, 5431, 496, 1070, 2043, 1490, 4477, 4783, 1600, 2821, 440, 2043, 1042, 1070, 4848, 2786, 5731, 440]),
       ...,
       list([31, 1492, 3123, 5523, 440, 6899, 6109, 2301, 1231, 4783, 179, 3640, 2786, 6441, 6538, 99, 2743, 2497, 3430, 2751, 3833, 5882, 5077, 3442, 6997, 3121, 3550, 440, 4477, 4783, 3088, 2591]),
       list([31, 5358, 2281, 6287, 1897, 4867, 5776, 204, 3493, 2591]),
       list([31, 1007, 837, 3667, 6920, 2884, 5115, 934, 6095, 629, 1117, 3673])],
      dtype=object)

In [30]:
maxlen = max(len(x) for x in data_to_index )

In [31]:
maxlen

40

In [32]:
data_to_index = keras.preprocessing.sequence.pad_sequences(data_to_index, maxlen=maxlen)

In [33]:
data_to_index.shape

(4726, 40)

In [34]:
# data preprocessing complete

In [35]:
# model building...

In [36]:
model = keras.Sequential()
model.add(layers.Embedding(word_size, 50, input_length=maxlen))
model.add(layers.LSTM(64))
model.add(layers.Dense(1, activation='sigmoid'))

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 50)            355050    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['acc'])

In [41]:
model.fit(data_to_index, data.review.values, epochs=10, batch_size=128, validation_split=0.2)

Train on 3780 samples, validate on 946 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b520731160>