<a href="https://colab.research.google.com/github/tomfirer/NLP_Assignment3/blob/main/NLP_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
#imports
import pandas as pd
import numpy as np

from scipy import sparse

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from gensim.models import Word2Vec

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

import re
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
### functions ###
def clean_text(corpus: list[str]) -> list[str]:
  new_corpus = []
  for document in corpus:
    document = re.sub(r'[^\w\s]', '', document)
    document = document.lower()
    new_corpus.append(document)
  return new_corpus


def tokenize_nltk(corpus: list[str]) -> list[list[str]]:
  token_mat = []
  for document in corpus:
    token_mat.append(nltk.word_tokenize(document))
  return token_mat


def remove_stopwords(token_mat: list[list[str]]) -> list[list[str]]:
  stop_words = stopwords.words('english')
  filtered_tokens = [[token for token in token_arr if token not in stop_words and token.isalpha()] for token_arr in token_mat]
  return filtered_tokens


def lemmatize_nltk(token_mat: list[list[str]]) -> list[list[str]]:
  lemmatizer = WordNetLemmatizer()
  lemmatized_res = []
  for token_arr in token_mat:
    lemmatized_res.append([lemmatizer.lemmatize(token) for token in token_arr])
  return lemmatized_res


#"untokenizes" a matrix of tokens back into an array of strings
def token_matrix_to_string_array(token_mat: list[list[str]]) -> list[str]:
  return [' '.join([str(x) for x in token_arr]) for token_arr in token_mat]


def get_word2vec_model(token_mat: list[list[str]], vector_size: int) -> Word2Vec:
  model = Word2Vec(
    sentences=token_mat,      # The corpus to train the model on
    vector_size=vector_size,  # The size of the word vectors to be learned
    window=5,                 # The size of the window of words to be considered
    min_count=5,              # The minimum frequency required for a word to be included in the vocabulary
    sg=0,                     # 0 for CBOW, 1 for skip-gram
    negative=5,               # The number of negative samples to use for negative sampling
    ns_exponent=0.75,         # The exponent used to shape the negative sampling distribution
    alpha=0.03,               # The initial learning rate
    min_alpha=0.0007,         # The minimum learning rate to which the learning rate will be linearly reduced
    epochs=30,                # The number of epochs (iterations) over the corpus
    workers=4,                # The number of worker threads to use for training the model
    seed=42,                  # The seed for the random number generator
    max_vocab_size=None       # The maximum vocabulary size (None means no limit)
  )
  return model


def get_recurrent_model(model_type: str, unit_num: int, vocab_size: int, vector_size: int, max_length: int, embedding_matrix: list[list[float]]) -> Sequential:
  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
  if model_type == 'LSTM':
    model.add(LSTM(unit_num))
  elif model_type == 'RNN':
    model.add(SimpleRNN(unit_num))
  else:
    return None
  model.add(Dense(vocab_size, activation='softmax'))
  return model


# Predict the next word
def predict_next_word(model: Sequential, tokenizer: Tokenizer, text: str, max_sequence_length: int) -> str:
    sequence = tokenizer.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='pre')
    predicted_probabilities = model.predict(sequence)
    predicted_word_index = np.argmax(predicted_probabilities, axis=-1)
    predicted_word = tokenizer.index_word.get(predicted_word_index[0], 'Unknown')
    return predicted_word


def generate_completion(model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer, sentence: str, max_length=30) -> str:
    #encode input
    inputs = tokenizer.encode(sentence, return_tensors='pt')
    #generate completion
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    #decode the generated text
    completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return completion


def predict_sentiment(sentence: str, tokenizer: Tokenizer, model: Sequential, maxlen: int, label_encoder: LabelEncoder):
    tokens = tokenizer.texts_to_sequences([sentence])
    tokens_padded = pad_sequences(tokens, maxlen=maxlen)
    #predict
    prediction = model.predict(tokens_padded)
    sentiment_idx = np.argmax(prediction)
    sentiment = label_encoder.inverse_transform([sentiment_idx])[0]

    return sentiment, prediction[0][sentiment_idx]

In [62]:
messages_df = pd.read_excel('office_messages.xlsx')
messages_df['Messages'] = clean_text(messages_df['Messages'])
messages_df.head()

Unnamed: 0,Messages,Sentiment
0,hey team quick reminder todays meeting is at 1...,Positive
1,does anyone have the latest sales report need ...,Positive
2,happy friday everyone any plans for the weekend,Positive
3,congrats to the marketing team for the success...,Positive
4,happy birthday to sarah from hr cake in the br...,Positive


In [63]:
### Data Preprocessing ###
token_matrix = tokenize_nltk(messages_df['Messages'])
token_matrix = remove_stopwords(token_matrix)
token_matrix = lemmatize_nltk(token_matrix)
print(token_matrix)

[['hey', 'team', 'quick', 'reminder', 'today', 'meeting', 'conference', 'room'], ['anyone', 'latest', 'sale', 'report', 'need', 'presentation'], ['happy', 'friday', 'everyone', 'plan', 'weekend'], ['congrats', 'marketing', 'team', 'successful', 'campaign', 'launch'], ['happy', 'birthday', 'sarah', 'hr', 'cake', 'break', 'room', 'pm'], ['got', 'call', 'client', 'loved', 'proposal'], ['weather', 'update', 'snow', 'expected', 'tomorrow', 'morning', 'plan', 'commute', 'accordingly'], ['kudos', 'team', 'fixing', 'server', 'issue', 'quickly'], ['reminder', 'team', 'lunch', 'noon', 'going', 'new', 'place', 'downtown'], ['ceo', 'visiting', 'office', 'next', 'week', 'let', 'ensure', 'everything', 'ready'], ['quick', 'poll', 'team', 'lunch', 'option', 'friday', 'mexican', 'italian'], ['hr', 'update', 'new', 'health', 'insurance', 'option', 'available', 'starting', 'next', 'month'], ['congratulation', 'john', 'promotion', 'senior', 'analyst'], ['team', 'let', 'brainstorm', 'idea', 'upcoming', 'pr

In [64]:
### Vectore Embeddings ###
vector_size = 20
word2vec_model = get_word2vec_model(token_matrix, vector_size=vector_size)

In [65]:
### Creating Input for Models ###
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_matrix_to_string_array(token_matrix))
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(token_matrix_to_string_array(token_matrix))
max_length = max([len(seq) for seq in sequences])

#create embedding matrix
embedding_matrix = np.zeros((vocab_size, vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

sentences_before, words_after = [], []
for seq in sequences:
    for i in range(1, len(seq)):
        sentences_before.append(seq[:i])
        words_after.append(seq[i])

sentences_before = pad_sequences(sentences_before, maxlen=max_length, padding='pre')
words_after = np.array(words_after)

sentences_before_train, sentences_before_test, words_after_train, words_after_test = train_test_split(sentences_before, words_after, test_size=0.2, random_state=42)
sentences_before_train = np.array(sentences_before_train)
sentences_before_test = np.array(sentences_before_test)
words_after_train = np.array(words_after_train).reshape(-1)
words_after_test = np.array(words_after_test).reshape(-1)

In [46]:
#define model
rnn_model = get_recurrent_model(model_type='RNN', unit_num=128, vocab_size=vocab_size, vector_size=vector_size, max_length=max_length, embedding_matrix=embedding_matrix)
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#train model
rnn_model.fit(sentences_before_train, words_after_train, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78849923f070>

In [47]:
#define model
lstm_model = get_recurrent_model(model_type='LSTM', unit_num=128, vocab_size=vocab_size, vector_size=vector_size, max_length=max_length, embedding_matrix=embedding_matrix)
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#train model
lstm_model.fit(sentences_before_train, words_after_train, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78849b8355a0>

In [48]:
#evaluate
rnn_loss = rnn_model.evaluate(sentences_before_test, words_after_test, verbose=0)[0]
rnn_perplexity = np.exp(rnn_loss)
lstm_loss = lstm_model.evaluate(sentences_before_test, words_after_test, verbose=0)[0]
lstm_perplexity = np.exp(lstm_loss)
print(f'RNN loss={rnn_loss}, perplexity={rnn_perplexity}')
print(f'LSTM loss={lstm_loss}, perplexity={lstm_perplexity}')

RNN loss=6.7658820152282715, perplexity=867.7312225152211
LSTM loss=6.874927043914795, perplexity=967.7047630581189


In [49]:
### GPT-2 ###

#define model
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

#generate completions for the sentences
sentences = token_matrix_to_string_array([arr[:len(arr)//2] for arr in token_matrix[:4]])
completions = [generate_completion(gpt_model, gpt_tokenizer, sentence) for sentence in sentences]

print('\n\n')
# Print the completions
for i, (sentence, completion) in enumerate(zip(sentences, completions)):
    print(f"Original Sentence {i+1}: {sentence}")
    print(f"Completion {i+1}: {completion}\n")
    print('-----------------------------------------------------------------------------------------------')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati




Original Sentence 1: hey team quick reminder
Completion 1: hey team quick reminder:

The team is not responsible for any damage caused by the use of this product.
.

-----------------------------------------------------------------------------------------------
Original Sentence 2: anyone latest sale
Completion 2: anyone latest sale.

The company has been in the news recently for its controversial decision to sell its own mobile phone business to Google. The

-----------------------------------------------------------------------------------------------
Original Sentence 3: happy friday
Completion 3: happy friday, and I'm going to be back in the studio with you guys.

I'm gonna be in a lot of different places

-----------------------------------------------------------------------------------------------
Original Sentence 4: congrats marketing team
Completion 4: congrats marketing team.

"We're excited to be working with you on this project," said the company's CEO, John D. D

-----

In [69]:
### Sentiment Analysis ###
texts = messages_df['Messages'].tolist()
labels = messages_df['Sentiment'].tolist()

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

x_data = pad_sequences(sequences, maxlen=max_length)

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
encoded_labels = tf.keras.utils.to_categorical(encoded_labels)

x_train, x_test, y_train, y_test = train_test_split(x_data, encoded_labels, test_size=0.2, random_state=42)

In [70]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(set(labels)), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 10, 20)            8880      
                                                                 
 lstm_7 (LSTM)               (None, 128)               76288     
                                                                 
 dense_12 (Dense)            (None, 3)                 387       
                                                                 
Total params: 85555 (334.20 KB)
Trainable params: 76675 (299.51 KB)
Non-trainable params: 8880 (34.69 KB)
_________________________________________________________________


In [76]:
#train
model.fit(x_train, y_train, batch_size=10, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
