In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from unicodedata import normalize
np.random.seed(1)
#import dill

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = normalize('NFKD',line.strip()).split(' ')
            curr_word = ' '.join(line[:-300])
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[-300:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [3]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../data/glove.840B.300d.txt')

In [4]:
word = "nyu"
index = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])

the index of nyu in the vocabulary is 1934360
the 289846th word in the vocabulary is 256-byte


In [5]:
data = pd.read_csv('../data/output_1.csv',index_col=0).reset_index(drop=True)

In [6]:
train = pd.concat([data[data['label']==0].sample(400),data[data['label']==1].sample(400)])

In [7]:
X_train, Y_train = np.array(train.tweet_content),np.array(train.label)

In [8]:
test = data[~data.index.isin(list(train.index))]

In [9]:
X_test, Y_test = np.array(test.tweet_content),np.array(test.label)

In [20]:
def find_len(sentence):
    sen = sentence.split(' ')
    sen = [i.strip() for i in sen if 'http' not in i]
    sen = [i for i in sen if len(i)>0]
    return(len(sen))
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [11]:
maxLen = max([find_len(i) for i in X_train])

In [12]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros([m,max_len])
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].split(' ')
        sentence_words = [i.strip() for i in sentence_words if 'http' not in i]
        sentence_words = [i for i in sentence_words if len(i)>0]
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            if w not in word_to_index:
                word_to_vec_map[w] = np.random.normal(0,0.5,300)
                word_to_index[w] = len(word_to_index)+1
                index_to_word[len(word_to_index)+1] = w
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            X_indices[i, j] = word_to_index[w]
            # Increment j to j + 1
            j = j+1
            
    ### END CODE HERE ###
    
    return X_indices

In [13]:
X1 = np.array(data['tweet_content'][:3].tolist())
X1_indices = sentences_to_indices(X1,word_to_index, max_len = 30)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['.@FloydMayweather just stepped up in a BIG way and donated $200 MILLION to #Houston 🙌🏽🙏🏽 #Harvey'
 'Hey! @FloydMayweather social media is saying you donated $200 million to Houston for victims of Harvey. Is it true? #harvey #HopeForHouston'
 '#Harvey2017 #Antifa #BlackLivesMatter #HarveyLootcrew https://t.co/R3PCR44Amf']
X1_indices = [[2.195412e+06 1.834088e+06 2.084350e+06 2.145677e+06 1.810830e+06
  1.540021e+06 5.279720e+05 2.166160e+06 1.561756e+06 1.694663e+06
  2.195413e+06 1.027925e+06 2.121635e+06 1.319000e+03 2.195414e+06
  2.195415e+06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00]
 [2.195416e+06 2.195417e+06 2.071175e+06 1.883765e+06 1.823739e+06
  2.041490e+06 2.189551e+06 1.694663e+06 2.195413e+06 1.893035e+06
  2.121635e+06 8.848700e+05 1.746336e+06 2.155398e+06 1.936716e+06
  2.195418e+06 9.181830e+05 1.825179e+06 2.195419e+06

In [14]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors 
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros([vocab_len,emb_dim])
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. 
    embedding_layer = Embedding(vocab_len, emb_dim,trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [15]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = -0.40237


In [16]:
def Harvey_Fake(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Harvey_Fake model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 300-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (2.2m words)

    Returns:
    model -- a model instance in Keras
    """
    
    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(input_shape,dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map,word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128,return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(2)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=[sentence_indices],outputs=[X])
    
    ### END CODE HERE ###
    
    return model

In [17]:
model = Harvey_Fake((maxLen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 29)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 29, 300)           658627800 
_________________________________________________________________
lstm_1 (LSTM)                (None, 29, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 29, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
__________

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(Y_train, C = 2)

In [23]:
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f7e184302b0>

In [24]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = convert_to_one_hot(Y_test, C = 2)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.8219895291078777


In [25]:
X_play = np.array(['There is a crocodile in the flood','Donald Trump donated 1 billion'])

In [29]:
C = 2
X_play_indices = sentences_to_indices(X_play, word_to_index, maxLen)
pred = model.predict(X_play_indices)
for i in range(len(X_play)):
    x = X_play_indices
    num = np.argmax(pred[i])
    print(X_play[i] +' prediction: ' + str(num))

There is a crocodile in the flood prediction: 1
Donald Trump donated 1 billion prediction: 0
