In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

import re

from sklearn.model_selection import train_test_split

import gensim

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
# Load the tab-separated training tweets into a DataFrame.
train_path = "Resources/CL_data/train_K.tsv"
data = pd.read_csv(train_path, sep="\t")

In [4]:
# Preview the first rows (up to 20) of the raw training frame.
data.head(20)

Unnamed: 0,id,tweet,label
0,264183816548130816,Gas by my house hit $3.39!!!! I'm going to Cha...,1
1,264249301910310912,Iranian general says Israel's Iron Dome can't ...,-1
2,264105751826538497,with J Davlar 11th. Main rivals are team Polan...,1
3,264094586689953794,"Talking about ACT's && SAT's, deciding where I...",-1
4,254941790757601280,"They may have a SuperBowl in Dallas, but Dalla...",-1
5,264169034155696130,Im bringing the monster load of candy tomorrow...,0
6,263192091700654080,"Apple software, retail chiefs out in overhaul:...",0
7,263398998675693568,@oluoch @victor_otti @kunjand I just watched i...,1
8,260200142420992000,#Livewire Nadal confirmed for Mexican Open in ...,0
9,264087629237202944,@MsSheLahY I didnt want to just pop up... but ...,1


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(7589, 3)

In [6]:
# Count how many tweets carry each label value to check the class balance.
data["label"].value_counts()

 0    3691
 1    2834
-1    1064
Name: label, dtype: int64

## Cleaning the data

In [7]:
# Drop the tweet id column; only the tweet text and label are used below.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because "id" has already been removed.

data = data.drop(columns="id", errors="ignore")

In [8]:
# English stop words consumed by tweet_cleaner.
# The corpus is read through `nltk.corpus` rather than through the imported
# `stopwords` name: this cell rebinds `stopwords`, so calling
# `stopwords.words(...)` on it failed with AttributeError on a second run.
# A set gives constant-time membership tests in the per-token filter.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [9]:
def tweet_cleaner(tweet):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: lower-case the text, strip links, strip @mentions,
    replace every non-letter character with a space, then drop stop words.

    Parameters
    ----------
    tweet : object
        Raw tweet; coerced with ``str()`` so non-string values do not raise.

    Returns
    -------
    str
        Remaining lower-case tokens joined by single spaces (may be empty).

    Notes
    -----
    Reads the module-level ``stopwords`` collection.
    """
    text = str(tweet).lower()
    # Links go first and are matched up to the next whitespace. The previous
    # pattern stopped at characters such as "_", "-", "?" or "=", which left
    # URL fragments behind as bogus tokens, and stripping mentions first
    # could cut a URL that contains "@" in two.
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"@\w*", " ", text)    # user mentions
    text = re.sub(r"[^a-z]", " ", text)  # digits, punctuation, non-ASCII
    return " ".join(word for word in text.split() if word not in stopwords)

In [10]:
# Replace every raw tweet with its cleaned form.
data["tweet"] = data["tweet"].apply(tweet_cleaner)

### word2vec

In [11]:
# One whitespace-split token list per cleaned tweet; used below as the Word2Vec corpus.
documents = list(map(str.split, data.tweet))

In [12]:
# Sanity check: number of tokenised tweets.
len(documents)

7589

In [13]:
# Untrained Word2Vec model; the vocabulary and weights are filled in by the next cells.
w2v_model = gensim.models.Word2Vec(size=256, window=7, min_count=5)

In [14]:
# Build the model vocabulary from the tokenised tweets.
w2v_model.build_vocab(documents)

In [15]:
# Train the embeddings on the tokenised tweets for 32 epochs.
w2v_model.train(documents, total_examples=len(documents), epochs=32)

(1997973, 2751136)

In [16]:
# Spot-check: show the learned vector for one word.
w2v_model.wv["books"]

array([-0.06739452,  0.05951492, -0.00569145,  0.00796658,  0.00845119,
       -0.02411447, -0.0907554 , -0.08264431, -0.22629194, -0.00480192,
        0.17816222, -0.0105978 , -0.10060362,  0.12056576, -0.07532194,
       -0.09386944,  0.18856499,  0.10251729,  0.14567699,  0.04933336,
        0.19712687, -0.14026083,  0.04449678,  0.09807208,  0.1154101 ,
       -0.1918768 ,  0.01988804, -0.02788014,  0.02990635,  0.05029344,
       -0.04509725,  0.13051184,  0.00924684,  0.04302948, -0.15820096,
       -0.15352693,  0.02446632, -0.01765246, -0.09593899, -0.07071105,
       -0.21922775, -0.04387697, -0.09720156, -0.0910012 , -0.1579234 ,
        0.14327405,  0.09468402, -0.11714992, -0.07078959, -0.1449274 ,
       -0.0346597 , -0.04150938, -0.04687542,  0.06827463, -0.09273379,
       -0.18003286,  0.01957406, -0.25248638, -0.06319299,  0.13643973,
       -0.12050783,  0.11652208,  0.05853823,  0.17713295, -0.16392364,
       -0.05741556, -0.05238053, -0.02707237, -0.12682949, -0.03

### Converting tweets to vectors

In [17]:
# Fit a Keras tokenizer on the cleaned tweets to build its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.tweet)

In [18]:
# Number of entries in the tokenizer's word index.
len(tokenizer.word_index)

14849

In [19]:
# Encode each tweet as word indices, then pad/truncate at the end to 256 ids.
tweet_sequences = tokenizer.texts_to_sequences(data.tweet)
x_train = pad_sequences(
    tweet_sequences,
    maxlen=256,
    padding="post",
    truncating="post",
)

In [20]:
# Inspect the padded index matrix.
x_train

array([[ 2448,    95,   259, ...,     0,     0,     0],
       [ 6518,  2183,   143, ...,     0,     0,     0],
       [ 1003,  6520,     2, ...,     0,     0,     0],
       ...,
       [  822,   181,  2377, ...,     0,     0,     0],
       [   54,    15,  6516, ...,     0,     0,     0],
       [14847,   432,   555, ...,     0,     0,     0]], dtype=int32)

In [21]:
y_train = data.label

# Keras class ids: neutral (0) -> 0, positive (1) -> 1, negative (-1) -> 2.
label_to_class = {0: 0, 1: 1, -1: 2}

# Fail loudly on an unmapped label. The previous loop silently skipped such
# rows, which left the targets shorter than, and misaligned with, x_train.
unknown_labels = set(y_train.unique()) - set(label_to_class)
if unknown_labels:
    raise ValueError(
        "Unexpected label values: %r" % sorted(unknown_labels, key=repr)
    )

# num_classes is pinned so the one-hot width always matches the 3-unit
# softmax output, even if one class happens to be absent from the data.
y_train_f = to_categorical(y_train.map(label_to_class), num_classes=3)

In [22]:
# Inspect the one-hot encoded targets.
y_train_f

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

### Model

In [23]:
# One row per tokenizer index plus row 0 (the padding id seen in x_train),
# one column per Word2Vec dimension. The shape is derived from the fitted
# objects instead of the previous hard-coded 14850 x 256, so the matrix stays
# correct when the data or the Word2Vec settings change.
embedding_matrix = np.zeros(
    (len(tokenizer.word_index) + 1, w2v_model.wv.vector_size)
)

In [24]:
# Copy the Word2Vec vector of every tokenizer word that the embedding model
# knows into its row; rows of unknown words keep their initial values.
word_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token in word_vectors:
        embedding_matrix[row] = word_vectors[token]

In [25]:
# Frozen embedding layer initialised with the Word2Vec vectors.
# Vocabulary size and vector width come from embedding_matrix, and the
# sequence length from x_train, instead of repeating the literals
# 14850 / 256 / 256, so the layer cannot drift out of sync with its inputs.
vocab_size, embedding_dim = embedding_matrix.shape
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=x_train.shape[1],
    trainable=False,
)

W0727 11:47:31.614658 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [26]:
# Classifier stack: frozen embeddings -> per-timestep dense projection ->
# LSTM -> dense head ending in a 3-way softmax.
model = Sequential([
    embedding_layer,
    Dropout(0.25),
    Dense(200, activation="relu"),
    Dropout(0.25),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(50, activation="relu"),
    Dense(3, activation="softmax"),
])

W0727 11:47:31.713744 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 11:47:31.719173 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 11:47:31.760576 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0727 11:47:31.763450 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf

In [27]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 256, 256)          3801600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256, 256)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 256, 200)          51400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256, 200)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 153       
Total para

In [28]:
# One-hot targets with a softmax output: categorical cross-entropy loss,
# Adam optimiser, accuracy reported during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

W0727 11:47:32.555874 4636145088 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [None]:
# Train for 4 epochs in batches of 32, holding out 10% of the rows for validation.
# NOTE(review): presumably Keras takes the held-out rows from the end of the
# arrays without shuffling first — confirm the file is not ordered by label.
model.fit(x_train, y_train_f, batch_size=32, epochs=4, validation_split=0.1, verbose=1)

W0727 11:47:33.235931 4636145088 deprecation.py:323] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 6830 samples, validate on 759 samples
Epoch 1/4

### Testing

In [None]:
def sentiment(text):
    """Print the predicted class probabilities for a single piece of text.

    Parameters
    ----------
    text : str
        Raw text to classify.

    Returns
    -------
    None
        The scores are printed, not returned.

    Notes
    -----
    Relies on the module-level ``tweet_cleaner``, ``tokenizer`` and ``model``.
    """
    # Run the same cleaning that was applied to the training tweets so the
    # model sees inference text in the form it was trained on.
    cleaned = tweet_cleaner(text)

    # Pad and truncate exactly as x_train was built. The previous call used
    # the pad_sequences defaults (pad/truncate at the front), so the tokens
    # sat at the opposite end of the sequence from where they were in training.
    x_test = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=256,
        padding="post",
        truncating="post",
    )
    score = model.predict(x_test)[0]

    # Column order follows the training targets:
    # 0 = neutral, 1 = positive, 2 = negative.
    final = "Positive = %f ,Negative = %f, Neutral = %f" % (score[1], score[2], score[0])
    print(final)

In [None]:
# Smoke-test the classifier on a short example sentence.
sentiment("I like reading books.")