In [2]:
import pandas as pd
df= pd.read_csv('../data/Tweets.csv', sep=',')
df.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)


In [3]:
# we are interested just by the phrases and the sentiment shown in the airline_sentiment and the text column
data = df[['text','airline_sentiment']]
data.head(10)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive
7,@VirginAmerica Really missed a prime opportuni...,neutral
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",positive
9,"@VirginAmerica it was amazing, and arrived an ...",positive


In [4]:
data.shape

(14640, 2)

In [5]:
# convert airline_sentiment column to numeric
sentiment_label = data.airline_sentiment.factorize()
sentiment_label

(array([0, 1, 0, ..., 0, 2, 0], dtype=int64),
 Index(['neutral', 'positive', 'negative'], dtype='object'))

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data_phrases = data.text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_phrases)

vocab_size = len(tokenizer.word_index) + 1
encoded_phrases = tokenizer.texts_to_sequences(data_phrases)
padded_sequence = pad_sequences(encoded_phrases, maxlen=200)

In [7]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
print(tokenizer.word_index)

In [9]:
print(data_phrases[0])
print(encoded_phrases[0])
print(padded_sequence[0])

@VirginAmerica What @dhepburn said.
[81, 62, 226]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  81
  62 226]


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

model = Sequential() 
model.add(Embedding(5000, 200))
model.add(LSTM(50, dropout=0.5))
model.add(Dense(3, activation='softmax')) 
model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop', metrics=['accuracy'])  
print(model.summary()) 


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 200)         1000000   
                                                                 
 lstm (LSTM)                 (None, 50)                50200     
                                                                 
 dense (Dense)               (None, 3)                 153       
                                                                 
Total params: 1,050,353
Trainable params: 1,050,353
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

X_train, X_test, y_train, y_test = train_test_split(padded_sequence, sentiment_label[0], test_size=0.20, random_state=42)

In [13]:
checkpoint1 = ModelCheckpoint("./best_model.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)

history = model.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=32, shuffle=True, callbacks=[checkpoint1], validation_data=(X_test, y_test))

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.78831, saving model to .\best_model.hdf5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.78831 to 0.80751, saving model to .\best_model.hdf5
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.80751
Epoch 4/5
Epoch 00004: val_accuracy did not improve from 0.80751
Epoch 5/5
Epoch 00005: val_accuracy did not improve from 0.80751


In [15]:
from tensorflow.keras.models import load_model

trained_model = load_model("../model/best_model.hdf5")

In [16]:
test_word ="titanic is a great movie"

with open('../model/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

encoded = tokenizer.texts_to_sequences([test_word])
padded = pad_sequences(encoded, maxlen=200)
padded

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [17]:
prediction = trained_model.predict(padded)
prediction

array([[0.08358422, 0.89145106, 0.02496476]], dtype=float32)