In [None]:
pip install -q tf-models-official==2.3.0

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt


from official import nlp
from official.nlp import bert
from official.nlp import optimization

In [None]:
# Forward slashes are accepted by Python on every platform (Windows included),
# whereas the escaped-backslash form "data\\data.csv" only resolves on Windows.
tweets_df = pd.read_csv("data/data.csv")
tweets_df

In [None]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


def remove_url(text):
    """Delete every ``http...`` token (up to the next whitespace) from ``text``."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table mapping a code point to None removes that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

stop = set(stopwords.words("english"))
def remove_stopwords(text):
    """Lower-case each whitespace-separated token of ``text`` and drop the ones in ``stop``.

    The surviving tokens are returned joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

def remove_numbers(text):
    """Drop whitespace-separated tokens made up only of digits; rejoin the rest with single spaces."""
    kept = []
    for token in text.split():
        if token.isdigit():
            continue
        kept.append(token)
    return " ".join(kept)

stop_words = ["ed","rt","tweet","tweeted"]
def remove_freq(text):
    """Lower-case each token of ``text`` and drop those listed in ``stop_words``.

    The remaining tokens are returned joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return " ".join(kept)

def remove_non_latin(text):
    """Delete every run of characters outside the ASCII range (0x00-0x7F) from ``text``."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)


In [None]:
# Cleaning helpers applied, in this order, to every entry of the 'Tweets' column.
cleaning_steps = (
    remove_url,
    remove_punctuation,
    remove_stopwords,
    remove_numbers,
    remove_freq,
    remove_non_latin,
)
for step in cleaning_steps:
    tweets_df['Tweets'] = tweets_df['Tweets'].apply(step)

# Strip digits that are still embedded inside tokens. The pattern has to be a raw
# string ('\d' is an invalid escape otherwise) and `regex=True` has to be explicit:
# from pandas 2.0 on `Series.str.replace` treats the pattern as a literal string by
# default, which would turn this line into a silent no-op.
tweets_df['Tweets'] = tweets_df['Tweets'].str.replace(r'\d+', '', regex=True)


In [None]:
# Both classes are used below but are not imported anywhere else in the notebook.
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# `nltk.word_tokenize` needs the Punkt tokenizer data.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - adjust if a
# LookupError names that resource.
nltk.download('punkt')

# Use English stemmer.
stemmer = SnowballStemmer("english")
detokenizer = TreebankWordDetokenizer()

# Tokenize the cleaned text, stem every token, then join the stems back into one
# string per tweet. Column-wise `Series.apply` replaces the row-wise
# `DataFrame.apply(..., axis=1)`, which only ever read a single column.
tweets_df['tokenized'] = tweets_df['Tweets'].apply(nltk.word_tokenize)
tweets_df['stemmed'] = tweets_df['tokenized'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
tweets_df['stemmed'] = tweets_df['stemmed'].apply(detokenizer.detokenize)

In [None]:
from collections import Counter

# Tally how often each whitespace-separated word occurs in a text column; the
# number of distinct keys is used afterwards as the vocabulary size.
def counter_word(text_col):
    """Return a ``Counter`` mapping each word in ``text_col`` to its number of occurrences.

    ``text_col`` must expose ``.values`` yielding strings; each string is split
    on whitespace.
    """
    return Counter(word for text in text_col.values for word in text.split())


# Count over the 'stemmed' column: it is the text the tokenizer is fitted on
# (X = tweets_df['stemmed']), and no other cell reads or creates a 'text' column.
# NOTE(review): confirm the CSV has no separate 'text' column that was intended here.
counter = counter_word(tweets_df['stemmed'])

In [None]:
# Number of distinct words counted above.
len(counter)

In [None]:
# NOTE(review): displays the whole Counter; with a large vocabulary this output gets very long.
counter

In [None]:
# Distinct-word count; passed to the Keras Tokenizer as `num_words` further down.
num_unique_words = len(counter)

In [None]:
# The ten most frequent words together with their counts.
counter.most_common(10)

In [None]:
# Model input: the cleaned, stemmed tweet text.
X = tweets_df['stemmed']
# One-hot targets, one column per distinct 'Feeling' value. The explicit float32
# cast keeps the label dtype stable across pandas versions (`get_dummies` yields
# bool columns on pandas >= 2.0), which matters because the custom metrics
# multiply y_true by the float predictions.
y = pd.get_dummies(tweets_df['Feeling']).to_numpy(dtype="float32")


In [None]:
from sklearn.model_selection import train_test_split

# First cut: 80% of all rows stay in the training pool, 20% become the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=101,
)

# Second cut: 87.5% of the training pool is kept for training and the remaining
# 12.5% is held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    train_size=0.875,
    random_state=101,
)

In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the word -> integer-id vocabulary on the training texts only; the other
# splits are encoded later with this same fitted tokenizer.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(texts=X_train)

In [None]:
# Word -> integer-id mapping learned by the tokenizer.
index_of_words = tokenizer.word_index
n_known_words = len(index_of_words)

# Vocab size is the number of unique words plus the reserved index 0 used for padding.
vocab_size = n_known_words + 1

print('Number of unique words: {}'.format(n_known_words))

In [None]:
# Encode every split as lists of integer word ids using the fitted tokenizer.
# The three names are rebound, so this cell is meant to run once per split.
X_train, X_valid, X_test = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_valid, X_test)
)

In [None]:
# Peek at five encoded training tweets (positions 10-14).
print(X_train[10:15])

In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length (in tokens) fed to the models: shorter tweets are padded
# at the end, longer ones are cut at the end.
max_length = 500

# Diagnostic only: length of the longest tweet measured in whitespace-separated
# tokens of the 'stemmed' column, i.e. the text the tokenizer was fitted on.
# Counting characters (as `len(str)` does) cannot tell whether `max_length`,
# which is a token count, is large enough.
token_counts = tweets_df['stemmed'].str.split().str.len()
max_len = int(token_counts.max()) if len(token_counts) else 0
print(max_len)

X_train = pad_sequences(X_train, maxlen=max_length, padding="post", truncating="post")
X_valid = pad_sequences(X_valid, maxlen=max_length, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=max_length, padding="post", truncating="post")
X_train.shape,  X_valid.shape, X_test.shape

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text word-vector file.

    Args:
        filepath: Path to a UTF-8 text file with one ``word v1 v2 ...`` entry per line.
        word_index: Mapping word -> row index in the returned matrix.
        embedding_dim: Number of vector components to keep per word.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        that do not occur in the file (and row 0) stay all-zero.
    """
    # One extra row because index 0 is reserved (padding).
    num_rows = len(word_index) + 1
    embedding_matrix = np.zeros((num_rows, embedding_dim))
    with open(filepath, encoding="utf8") as vec_file:
        for line in vec_file:
            parts = line.split()
            # Skip anything that cannot be "word + embedding_dim values": blank
            # lines (which used to crash the unpacking) and the "<count> <dim>"
            # header of .vec files (whose single value would otherwise be
            # broadcast over a whole row if the count happened to be a known word).
            if len(parts) <= embedding_dim:
                continue
            row = word_index.get(parts[0])
            if row is None:
                continue
            embedding_matrix[row] = np.asarray(
                parts[1:embedding_dim + 1], dtype=np.float32)
    return embedding_matrix

In [None]:
import urllib.request
import zipfile
import os

fname = 'embeddings/wiki-news-300d-1M.vec'
archive_name = 'wiki-news-300d-1M.vec.zip'

# Fetch and unpack the vectors only when the .vec file is not already on disk.
if not os.path.isfile(fname):
    try:
        print('Downloading word vectors...')
        urllib.request.urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip',
            archive_name)
        print('Unzipping...')
        with zipfile.ZipFile(archive_name, 'r') as zip_ref:
            zip_ref.extractall('embeddings')
        print('done.')
    finally:
        # Remove the archive whether or not the steps above succeeded, so an
        # interrupted download or extraction never leaves the zip behind.
        if os.path.exists(archive_name):
            os.remove(archive_name)

In [None]:
# Width of each word vector kept in the embedding matrix.
embed_num_dims = 300

embedd_matrix = create_embedding_matrix(
    filepath=fname,
    word_index=index_of_words,
    embedding_dim=embed_num_dims,
)
embedd_matrix.shape

In [None]:
# Count the vocabulary words whose embedding row is still all zeros, i.e. the
# words for which no pretrained vector was copied in.
new_words = sum(
    1 for row_index in index_of_words.values() if not embedd_matrix[row_index].any()
)
known_words = len(index_of_words) - new_words

print('Words found in wiki vocab: ' + str(known_words))
print('New words found: ' + str(new_words))

In [None]:
# Take the backend from `tensorflow.keras`, the same Keras implementation the
# models in this notebook are built with; the standalone `keras` package may be
# missing or a different version.
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the batch: true positives / (actual positives + epsilon).

    Products and labels are clipped to [0, 1] and rounded before being summed;
    epsilon guards against division by zero.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the batch: true positives / (predicted positives + epsilon).

    Products and predictions are clipped to [0, 1] and rounded before being summed.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m`` (epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Convolution hyper-parameters
kernel_size = 3
filters = 32

# Embedding layer before the actual CNN: pretrained vectors, kept frozen.
# `input_length` follows `max_length` (the padded sequence length) instead of
# repeating the literal 500, so the two cannot drift apart.
embedd_layer = tf.keras.layers.Embedding(vocab_size,
                         embed_num_dims,
                         input_length = max_length,
                         weights = [embedd_matrix],
                         trainable=False)

model2 = tf.keras.models.Sequential()
model2.add(embedd_layer)
model2.add(tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'))
model2.add(tf.keras.layers.BatchNormalization())
model2.add(tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'))
model2.add(tf.keras.layers.BatchNormalization())
model2.add(tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'))
model2.add(tf.keras.layers.BatchNormalization())
# Global max pooling already yields one (batch, filters) vector per tweet, so no
# Flatten layer is needed after it.
model2.add(tf.keras.layers.GlobalMaxPooling1D())

model2.add(tf.keras.layers.Dense(64, activation='relu'))
# One output unit per one-hot label column, taken from the targets rather than
# hard-coded.
model2.add(tf.keras.layers.Dense(y_train.shape[1], activation='softmax'))

epochs = 10
batch_size = 128


opt = tf.keras.optimizers.Adam(learning_rate=0.01)

model2.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics=['accuracy',f1_m,precision_m, recall_m])

hist = model2.fit(X_train, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(X_valid,y_valid))

In [None]:
# Score the CNN on the test split. The values come back as the loss followed by
# the metrics in the order given to `compile`: accuracy, f1_m, precision_m, recall_m.
cnn_scores = model2.evaluate(X_test, y_test, verbose=0)
loss, accuracy, f1_score, precision, recall = cnn_scores
print(f1_score)

In [None]:
# Model architecture: frozen pretrained embeddings -> two stacked bidirectional
# GRU layers -> dense classifier.

# Embedding layer before the actual bidirectional GRU stack. `input_length`
# follows `max_length` (the padded sequence length) instead of the literal 500.
embedd_layer = tf.keras.layers.Embedding(vocab_size,
                         embed_num_dims,
                         input_length = max_length,
                         weights = [embedd_matrix],
                         trainable=False)


model3 = tf.keras.models.Sequential()
model3.add(embedd_layer)
# The first recurrent layer returns the full sequence so the second one can consume it.
model3.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=128,
                              dropout=0.2,
                              recurrent_dropout=0.2, return_sequences=True)))
# The last recurrent layer returns only its final output: one vector per tweet.
# With return_sequences=True here the network would emit one prediction per time
# step, (batch, max_length, classes), which cannot be matched against the
# (batch, classes) one-hot labels in fit/evaluate.
model3.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=128,
                              dropout=0.2,
                              recurrent_dropout=0.2)))
model3.add(tf.keras.layers.Dense(128,activation = tf.nn.relu))
# One output unit per one-hot label column, taken from the targets rather than
# hard-coded.
model3.add(tf.keras.layers.Dense(y_train.shape[1], activation='softmax'))



model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m])
hist = model3.fit(X_train, y_train, epochs=10, validation_data=(X_valid,y_valid),batch_size=128)

In [None]:
# Score the recurrent model on the test split. The values come back as the loss
# followed by the metrics in the order given to `compile`.
gru_scores = model3.evaluate(X_test, y_test, verbose=0)
loss, accuracy, f1_score, precision, recall = gru_scores
print(f1_score)

In [None]:
import time

message = ['I am really happy that you won']

# Label names in the column order `pd.get_dummies` produced for the training
# targets, so position i of a prediction maps to class_names[i].
class_names = pd.get_dummies(tweets_df['Feeling']).columns.tolist()


def prepare_for_model(text):
    """Clean and stem one raw string the same way the training text was prepared."""
    for step in (remove_url, remove_punctuation, remove_stopwords,
                 remove_numbers, remove_freq, remove_non_latin):
        text = step(text)
    text = re.sub(r'\d+', '', text)
    # The tokenizer was fitted on stemmed text, so unstemmed words would be
    # looked up under the wrong form (or not found at all).
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))


seq = tokenizer.texts_to_sequences([prepare_for_model(text) for text in message])
# Pad/truncate at the end, exactly like the training sequences (the default is "pre").
padded = pad_sequences(seq, maxlen=max_length, padding="post", truncating="post")

start_time = time.time()
pred = model2.predict(padded)
elapsed = time.time() - start_time

print('Message: ' + str(message))
print('predicted: {} ({:.2f} seconds)'.format(class_names[np.argmax(pred[0])], elapsed))

In [None]:
# Print the layer-by-layer summary of the CNN (`model2`).
model2.summary()