In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation, Embedding, CuDNNLSTM, Bidirectional
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.initializers import glorot_uniform

# Loading the Dataset

In [2]:
df = pd.read_csv('data/data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head(10)


Unnamed: 0,Handle,Party,Tweet
0,RepDarrenSoto,Democrat,Gov. @ricardorossello's comments degrading wom...
1,RepDarrenSoto,Democrat,Looks like Trump will end his discriminatory c...
2,RepDarrenSoto,Democrat,For several years we sought to replace our sta...
3,RepDarrenSoto,Democrat,Proud to announce that @fema awarded @PolkCoun...
4,RepDarrenSoto,Democrat,STURDY Bill passed @EnergyCommerce Cmte today ...
5,RepDarrenSoto,Democrat,Enough is enough! Billionaire super predator J...
6,RepDarrenSoto,Democrat,We continue our efforts to provide American Ci...
7,RepDarrenSoto,Democrat,We're committed to defending quality &amp; aff...
8,RepDarrenSoto,Democrat,Robocalls aren’t just annoying. Many are outri...
9,RepDarrenSoto,Democrat,This is why we continue to fight @jediabetical...


# Pre-processing

In [3]:
df['Party'] = pd.Categorical(df.Party)
df['Party'] = pd.get_dummies(df['Party'], drop_first=True)
# df[df['Party'] == 0]
df.head()


Unnamed: 0,Handle,Party,Tweet
0,RepDarrenSoto,0,Gov. @ricardorossello's comments degrading wom...
1,RepDarrenSoto,0,Looks like Trump will end his discriminatory c...
2,RepDarrenSoto,0,For several years we sought to replace our sta...
3,RepDarrenSoto,0,Proud to announce that @fema awarded @PolkCoun...
4,RepDarrenSoto,0,STURDY Bill passed @EnergyCommerce Cmte today ...


<b>0</b> - Democrat <br>
<b>1</b> - Republican

In [4]:
X = df['Tweet']
Y = df['Party']
print(X.shape)
print(Y.shape)

(90000,)
(90000,)


In [5]:
print(X.iloc[0])

Gov. @ricardorossello's comments degrading women, including my dear friend @MMViverito, are unacceptable. I condemn these demeaning words. Now more than ever, Puerto Rico is in need of strong leadership. I urge the Governor to use appropriate language &amp; always respect women.


### Removing special characters

In [10]:
import re
from string import punctuation
punctuation = punctuation + "—\n\t"
regex = re.compile('[%s]' % re.escape(punctuation))

def remove_punctuations(sentence):
    return regex.sub('', sentence)

def remove_link(sentence):
    sentence = str(sentence)
    return sentence[:sentence.find("https://")]

def pre_processing(X, **kwargs):
    # Replaces special characters
    X = X.str.replace('&amp;', "and")
    X = X.str.replace('\xa0', " ")
    X = X.str.replace('\u2003', " ")
    
    #Removes links
    X = X.apply(remove_link)
    
    #Removes punctuations and converts into lowercase
    X = X.apply(remove_punctuations)
    X = X.apply(str.lower)
    
    #Removes null values
    ind = list(X[X==""].index)
    x = X.drop(ind)
    
    if 'Y' in kwargs:
        y = kwargs['Y'].drop(ind)
        return x, y
        
    return x    

In [12]:
x, y = pre_processing(X, Y=Y)
print(X.shape)
print(Y.shape)

(90000,)
(90000,)


In [13]:
#Example
d = pre_processing(X)
d

0        gov ricardorossellos comments degrading women ...
1        looks like trump will end his discriminatory c...
2        for several years we sought to replace our sta...
3        proud to announce that fema awarded polkcounty...
4        sturdy bill passed energycommerce cmte today w...
5        enough is enough billionaire super predator je...
6        we continue our efforts to provide american ci...
7        were committed to defending quality and afford...
8        robocalls aren’t just annoying many are outrig...
9        this is why we continue to fight jediabetical ...
10       trio of fla congressional members wrap up over...
11       freedom isn’t free it’s protected by the coura...
12       major victory days after ussupremecourt decide...
13       passed ✅ proud to join my colleagues from flor...
14       success housedemocrats passed my proposal to a...
15       2 yrs after hurricane maria and one yr after w...
16       our congressional blockchain caucus is meeting.

# Modelling
<b><hr>

## Splitting into training and testing sets


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=420)

## Using Tokenizer and creating a vocabulary

In [15]:
t = Tokenizer(oov_token="UNK")
a = list(x)

In [16]:
print("Len of a: {}".format(len(a)))

Len of a: 89541


### Creating a vocabulary

In [17]:
t.fit_on_texts(a)

In [18]:
vocab_size = len(t.word_index) + 1
print("Vocabulary size: {}".format(vocab_size))

Vocabulary size: 89911


In [19]:
max_sent_len = len(max(x, key=len).split()) + 1
print("Maximum sentence length: {}".format(max_sent_len))

Maximum sentence length: 74


### Encoding the sentences

In [20]:
def convert_sentences(sentence, word_to_index):
    encoded = np.zeros((1, max_sent_len))
    sentence_words = sentence.lower().split()
    j = 0
    for w in sentence_words:
        encoded[0, j] = word_to_index[w]
        j += 1
    
    return np.float32(encoded)

def encode_and_pad(X):
    encoded_x = t.texts_to_sequences(X)
    padded_x = pad_sequences(encoded_x, maxlen=max_sent_len, padding='post')
    return padded_x

In [21]:
padded_X_train = encode_and_pad(X_train)
padded_X_train

array([[  99,   93, 1529, ...,    0,    0,    0],
       [   2,  839,   10, ...,    0,    0,    0],
       [1126, 1192, 5953, ...,    0,    0,    0],
       ...,
       [   2,   41,  271, ...,    0,    0,    0],
       [  26,   10, 2651, ...,    0,    0,    0],
       [ 128,   13,  590, ...,    0,    0,    0]])

In [22]:
padded_X_train.shape

(71632, 74)

## Embedding Layer

In [21]:
emb_dim = 50

In [22]:
embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=max_sent_len, trainable=False)
trained_embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=max_sent_len, trainable=True)

W0719 00:23:53.289809 13600 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\keras\initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
# encoded_X_train1 = tf.convert_to_tensor(encoded_X_train, np.float32)
layer_1 = embedding_layer(padded_X_train)

In [24]:
trained_layer_1 = trained_embedding_layer(padded_X_train)

In [25]:
n_classes = 2
learning_rate = 0.05
epochs = 50
num_hidden_units = 128
timesteps = 28

In [26]:
model = Sequential([
    embedding_layer,
    Bidirectional(CuDNNLSTM(300, return_sequences=False)),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

W0719 00:26:17.933421 13600 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0719 00:26:17.937412 13600 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0719 00:26:17.938408 13600 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_

In [None]:
model.fit(padded_X_train, y_train, epochs=200, batch_size = 400, shuffle=True)

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 74, 50)            4495550   
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               844800    
_________________________________________________________________
dropout (Dropout)            (None, 600)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 1202      
Total params: 5,341,552
Trainable params: 846,002
Non-trainable params: 4,495,550
_________________________________________________________________


In [148]:
model1 = Sequential([
    embedding_layer,
    Bidirectional(CuDNNLSTM(128, return_sequences=True)),
    Dropout(0.5),
    CuDNNLSTM(128),
    Dropout(0.5),
    Dense(2),
    Activation('softmax')
])

W0718 00:53:54.259601 20456 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 00:53:54.263591 20456 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 00:53:54.264588 20456 deprecation.py:506] From C:\Users\tshan\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_

In [149]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 74, 50)            4495550   
_________________________________________________________________
bidirectional (Bidirectional (None, 74, 256)           184320    
_________________________________________________________________
dropout (Dropout)            (None, 74, 256)           0         
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               197632    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0

In [152]:
model1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [154]:
model1.fit(padded_X_train, y_train, epochs=50, batch_size = 400, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1a9009e8cc0>

In [165]:
model2 = Sequential([
    embedding_layer,
    Bidirectional(CuDNNLSTM(256, return_sequences=False)),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.fit(padded_X_train, y_train, epochs=150, batch_size = 400, shuffle=True)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1a91139d550>

In [38]:
encoded_X_test = t.texts_to_sequences(X_test)
padded_X_test = pad_sequences(encoded_X_test, maxlen=max_sent_len, padding='post')
padded_X_test

array([[14982, 26770, 26771, ...,     0,     0,     0],
       [   60,     2,   563, ...,     0,     0,     0],
       [    2,    59,     4, ...,     0,     0,     0],
       ...,
       [  111,     3, 47046, ...,     0,     0,     0],
       [ 1300,     0,     0, ...,     0,     0,     0],
       [   15,     2,   219, ...,     0,     0,     0]])

In [40]:
padded_X_test.shape

(17909, 74)

In [43]:
pred_y_test = model4.predict_classes(padded_X_test)
pred_y_test

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [44]:
pred_y_test.shape

(17909,)

In [1]:
y_test.shape

NameError: name 'y_test' is not defined

In [47]:
from tensorflow.train import Saver

In [None]:
saver = Saver(max_to_keep=1) 
with tf.Session() as sess:
    # train your model, then:
    savePath = saver.save(sess, 'model/trained_model.ckpt')