In [1]:
import pandas as pd
import numpy as np

import nltk
# nltk.download()
from nltk.corpus import stopwords

import re

from sklearn.model_selection import train_test_split
# !pip install -U gensim
import gensim

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
data = pd.read_csv("Resources/CL_data/train.csv")

In [3]:
data.head()

Unnamed: 0,Id,Category,Tweet
0,639976673685962000,0,And @whytology article 3 gives SCOTUS no legis...
1,641539752177586000,0,And 2nd it wasnt harrys fault that he had ran ...
2,639767721073315000,0,And I don't know about TGIF. Fridays are terri...
3,638850695349800000,0,"And if my day wasn't bad enough already, Snoop..."
4,638783969568366000,0,And in comes Sony with the letdown. Only one c...


In [4]:
data.rename(columns={'Id':'id',
                          'Category':'label',
                          'Tweet':'tweet'}, 
                 inplace=True)

In [5]:
data.head()

Unnamed: 0,id,label,tweet
0,639976673685962000,0,And @whytology article 3 gives SCOTUS no legis...
1,641539752177586000,0,And 2nd it wasnt harrys fault that he had ran ...
2,639767721073315000,0,And I don't know about TGIF. Fridays are terri...
3,638850695349800000,0,"And if my day wasn't bad enough already, Snoop..."
4,638783969568366000,0,And in comes Sony with the letdown. Only one c...


In [6]:
data.shape

(2742, 3)

In [7]:
data["label"].value_counts()

1    2165
0     577
Name: label, dtype: int64

## Cleaning the data

In [8]:
# dropping id column

data = data.drop("id", axis=1)

In [9]:
stopwords = stopwords.words('english')

In [10]:
def tweet_cleaner(tweet):
    tweet = re.sub(r"@\w*", " ", str(tweet).lower()).strip() #removing username
    tweet = re.sub(r'https?://[A-Za-z0-9./]+', " ", str(tweet).lower()).strip() #removing links
    tweet = re.sub(r'[^a-zA-Z]', " ", str(tweet).lower()).strip() #removing sp_char
    tw = []
    
    for text in tweet.split():
        if text not in stopwords:
            tw.append(text)
    
    return " ".join(tw)

In [11]:
data.tweet = data.tweet.apply(lambda x: tweet_cleaner(x))

### word2vec

In [12]:
documents = [text.split() for text in data.tweet]

In [13]:
len(documents)

2742

In [14]:
w2v_model = gensim.models.word2vec.Word2Vec(size = 256, window = 7, min_count = 5)

In [15]:
w2v_model.build_vocab(documents)

In [16]:
w2v_model.train(documents, total_examples=len(documents), epochs=32)

(593893, 967424)

In [17]:
w2v_model.wv["happy"]

array([-0.00598909, -0.4722668 , -0.09655412, -0.37244913, -0.08486858,
       -0.20167777,  0.20023277,  0.08242609,  0.09362113, -0.12624973,
       -0.48497325,  0.34036568,  0.38601735, -0.06259313, -0.2781593 ,
       -0.09938415,  0.17153692,  0.20689152, -0.06500493, -0.14593157,
       -0.00103552,  0.19058743,  0.12887733,  0.2932889 ,  0.3671027 ,
       -0.5019751 , -0.11377428,  0.34662455, -0.45808923, -0.47487524,
       -0.23297955,  0.40128368, -0.21937132, -0.12481028,  0.20255196,
       -0.33321992,  0.46738693,  0.10047664,  0.1675784 , -0.24538265,
        0.08389262, -0.11780208, -0.28360167,  0.308576  ,  0.08450785,
        0.1820194 ,  0.04597469,  0.04638813, -0.28724763,  0.29620442,
        0.35297784, -0.08294714, -0.29473817, -0.52131206, -0.47529128,
        0.14494878,  0.21032952, -0.13607447,  0.0051568 ,  0.15447526,
        0.05043015,  0.30377173,  0.143796  , -0.04939755,  0.3214892 ,
        0.6144994 , -0.2673427 ,  0.13147832, -0.04759571, -0.01

### Converting tweets to vectors

In [34]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.tweet)

In [35]:
len(tokenizer.word_index)

7011

In [36]:
x_train = pad_sequences(tokenizer.texts_to_sequences(data.tweet), maxlen=256, padding="post", truncating="post")

In [37]:
x_train

array([[1045, 1046,  363, ...,    0,    0,    0],
       [  28, 1779, 2769, ...,    0,    0,    0],
       [  33,   37, 2772, ...,    0,    0,    0],
       ...,
       [ 110,  220,    2, ...,    0,    0,    0],
       [ 110, 1262,    2, ...,    0,    0,    0],
       [ 110, 7009, 7010, ...,    0,    0,    0]], dtype=int32)

In [38]:
y_train = data.label

y_train_f = []
for x in y_train:
    if x == 1:
        y_train_f.append(1)
    elif x == 0:
        y_train_f.append(0)
    elif x == -1:
        y_train_f.append(2)
        
y_train_f = to_categorical(y_train_f)

In [23]:
y_train_f

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

### Model

In [24]:
embedding_matrix = np.zeros((14850,256))

In [25]:
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [26]:
embedding_layer = Embedding(14850, 256, weights=[embedding_matrix], input_length=256, trainable=False)

W0726 16:27:07.154638 4562326976 deprecation_wrapper.py:119] From /Users/valarmathipukuraj/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [44]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.25))
model.add(Dense(200, activation="relu"))
model.add(Dropout(0.25))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(50, activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(2, activation="softmax"))

In [45]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 256, 256)          3801600   
_________________________________________________________________
dropout_3 (Dropout)          (None, 256, 256)          0         
_________________________________________________________________
dense_4 (Dense)              (None, 256, 200)          51400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256, 200)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_6 (Dense)              (None, 100)               5100      
__________

In [46]:
model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [47]:
model.fit(x_train, y_train_f, batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Train on 2467 samples, validate on 275 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a350c5e48>

# Testing

In [42]:
def sentiment(text):
    
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=256)
    score = model.predict([x_test])[0]
    
    final = "Accuracy = %f ,value loss = %f" % (score[1], score[0])
    return print(final)

In [43]:
sentiment(" It's grotesque, it's barbaric")

Accuracy = 0.764084 ,value loss = 0.235916
