In [212]:
import keras
import time
import os
import gc
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional
from keras.utils import np_utils

In [85]:
def clean_str(string):  
    """ 
    Tokenization/string cleaning for dataset 
    Every dataset is lower cased except 
    """  
    sens = word_tokenize(string.lower())
    sens = [word for word in sens if not word in english_stopwords]
    sens = [word for word in sens if not word in english_punctuations]
    sens = [lemmatizer.lemmatize(word) for word in sens]
    sens = [word for word in sens if word.isalpha()]
    sens = ' '.join(sens)
    return sens
  

In [4]:
df = pd.read_csv('../data/Tweets.csv')

In [145]:
X = df['text']
y = df['airline_sentiment']
y.replace({'neutral':'0', 'positive':'1', 'negative':'-1'}, inplace = True)

In [182]:
# preprocssing, stopwords and rare words, tokenization and vectorizing
lemmatizer=WordNetLemmatizer()
english_stopwords = stopwords.words('english')
english_punctuations = [',', '.','\'s', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
X = X.apply(clean_str)

# vectorizing using tfidf
vectorizer = TfidfVectorizer(ngram_range = (1,2), max_features = 2000)
X_tfidf = vectorizer.fit_transform(X)

In [178]:
a = ['asshole','b','c','d']
b = ['a','curvy','e']
k = vectorizer.fit_transform()

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf,y,test_size = 0.2, random_state = 517)
X_train = X_train.toarray()
X_train = np.reshape(X_train,(X_train.shape[0], 1, 2000))
X_test = X_test.toarray()
X_test = np.reshape(X_test, (X_test.shape[0], 1, 2000))

y_train = np_utils.to_categorical(y_train, num_classes=3)
y_test = np_utils.to_categorical(y_test, num_classes=3)

In [197]:
X_train.shape

(11712, 1, 2000)

In [209]:
bi_lstm = Sequential()
bi_lstm.add(Bidirectional(LSTM(100 , return_sequences=False), input_shape = (1, 2000)))
bi_lstm.add(Dense(100, activation = 'tanh'))
bi_lstm.add(Dense(3,activation = 'softmax'))
bi_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_18 (Bidirectio (None, 200)               1680800   
_________________________________________________________________
dense_29 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_30 (Dense)             (None, 3)                 303       
Total params: 1,701,203
Trainable params: 1,701,203
Non-trainable params: 0
_________________________________________________________________


In [213]:
bi_lstm.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr = 0.01), metrics=['accuracy'])
bi_lstm.fit(X_train, y_train, epochs=50, batch_size=16, verbose=2)

Epoch 1/50
 - 35s - loss: 0.6065 - acc: 0.7539
Epoch 2/50
 - 33s - loss: 0.4390 - acc: 0.8212
Epoch 3/50
 - 33s - loss: 0.3212 - acc: 0.8667
Epoch 4/50
 - 32s - loss: 0.1713 - acc: 0.9352
Epoch 5/50
 - 32s - loss: 0.0724 - acc: 0.9728
Epoch 6/50
 - 33s - loss: 0.0473 - acc: 0.9825
Epoch 7/50
 - 32s - loss: 0.0362 - acc: 0.9860
Epoch 8/50
 - 32s - loss: 0.0306 - acc: 0.9879
Epoch 9/50
 - 33s - loss: 0.0327 - acc: 0.9877
Epoch 10/50
 - 33s - loss: 0.0311 - acc: 0.9881
Epoch 11/50
 - 35s - loss: 0.0298 - acc: 0.9886
Epoch 12/50
 - 35s - loss: 0.0306 - acc: 0.9882
Epoch 13/50
 - 39s - loss: 0.0307 - acc: 0.9882
Epoch 14/50
 - 36s - loss: 0.0265 - acc: 0.9895
Epoch 15/50
 - 35s - loss: 0.0281 - acc: 0.9884
Epoch 16/50
 - 37s - loss: 0.0307 - acc: 0.9885
Epoch 17/50
 - 35s - loss: 0.0269 - acc: 0.9899
Epoch 18/50
 - 36s - loss: 0.0265 - acc: 0.9887
Epoch 19/50
 - 34s - loss: 0.0258 - acc: 0.9898
Epoch 20/50
 - 34s - loss: 0.0279 - acc: 0.9886
Epoch 21/50
 - 35s - loss: 0.0270 - acc: 0.9889
E

<keras.callbacks.History at 0x24f87013e10>

In [207]:
bi_lstm.evaluate(X_test, y_test)



[0.89473610595275799, 0.74180327868852458]

In [208]:
bi_lstm.metrics_names

['loss', 'acc']