<a href="https://colab.research.google.com/github/tomfirer/NLP_Assignment2/blob/main/NLP_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#imports
import pandas as pd
import numpy as np

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models import Word2Vec

import nltk
from nltk import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
### functions ###
def tokenize_nltk(sentence_arr: list[str]) -> list[list[str]]:
  result_tokens_array = []

  for sentence in sentence_arr:
    result_tokens_array.append(nltk.word_tokenize(sentence))

  return result_tokens_array


def stem_nltk(token_mat: list[list[str]]) -> list[list[str]]:
  stemming_result = []
  ls = LancasterStemmer()

  for token_arr in token_mat:
    stemming_result.append([ls.stem(token) for token in token_arr])

  return stemming_result


def remove_stopwords(token_mat: list[list[str]]) -> list[list[str]]:
  #stopwords remove determiners and preposiitons that are important for the cyk analysis, so i decided to leave them here.
  #stopwords = stopwords.words('english')
  #filtered_tokens = [[token for token in token_arr if token not in stopwords and token.isalpha()] for token_arr in token_mat]
  filtered_tokens = [[token for token in token_arr if token.isalpha()] for token_arr in token_mat]
  return filtered_tokens


def bow_feature_extraction(corpus: list[str]):
  vectorizer = CountVectorizer()
  bow_mat = vectorizer.fit_transform(corpus)
  return (vectorizer, bow_mat)


def tfidf_feature_extraction(corpus: list[str]):
  vectorizer = TfidfVectorizer()
  tfidf_mat = vectorizer.fit_transform(corpus)
  return (vectorizer, tfidf_mat)


#"untokenizes" a matrix of tokens back into an array of strings
def token_matrix_to_string_array(token_mat: list[list[str]]) -> list[str]:
  return [' '.join([str(x) for x in token_arr]) for token_arr in token_mat]

In [29]:
### Loading data from txt file ###
with open('whatsapp_messages.txt', 'r') as file:
    messages = file.readlines()

whatsapp_df = pd.DataFrame(messages, columns=['Messages'])
whatsapp_df

Unnamed: 0,Messages
0,"Hey team, quick reminder: today's meeting is a..."
1,Does anyone have the latest sales report? Need...
2,Happy Friday everyone! Any plans for the weeke...
3,"Just a heads up, the printer on the 3rd floor ..."
4,Can someone review the draft proposal I just s...
...,...
72,IT update: network upgrade scheduled for this ...
73,Congrats to the accounting team for closing th...
74,Reminder: company-wide town hall meeting next ...
75,HR update: wellness program launches next week...


In [30]:
tokens = tokenize_nltk(whatsapp_df['Messages'])
print(tokens)

[['Hey', 'team', ',', 'quick', 'reminder', ':', 'today', "'s", 'meeting', 'is', 'at', '10', 'AM', 'in', 'Conference', 'Room', 'A', '.'], ['Does', 'anyone', 'have', 'the', 'latest', 'sales', 'report', '?', 'Need', 'it', 'for', 'the', 'presentation', '.'], ['Happy', 'Friday', 'everyone', '!', 'Any', 'plans', 'for', 'the', 'weekend', '?'], ['Just', 'a', 'heads', 'up', ',', 'the', 'printer', 'on', 'the', '3rd', 'floor', 'is', 'out', 'of', 'toner', 'again', '.'], ['Can', 'someone', 'review', 'the', 'draft', 'proposal', 'I', 'just', 'sent', 'over', '?'], ['Reminder', ':', 'team', 'lunch', 'at', 'noon', ',', 'we', "'re", 'going', 'to', 'that', 'new', 'place', 'downtown', '!'], ['Has', 'anyone', 'seen', 'my', 'blue', 'notebook', '?', 'I', 'think', 'I', 'left', 'it', 'in', 'the', 'meeting', 'room', '.'], ['Important', ':', 'The', 'deadline', 'for', 'project', 'submissions', 'has', 'been', 'extended', 'to', 'Friday', '.'], ['Who', "'s", 'attending', 'the', 'client', 'meeting', 'tomorrow', '?', '

In [31]:
stem = stem_nltk(tokens)
print(stem)

[['hey', 'team', ',', 'quick', 'remind', ':', 'today', "'s", 'meet', 'is', 'at', '10', 'am', 'in', 'conf', 'room', 'a', '.'], ['doe', 'anyon', 'hav', 'the', 'latest', 'sal', 'report', '?', 'nee', 'it', 'for', 'the', 'pres', '.'], ['happy', 'friday', 'everyon', '!', 'any', 'plan', 'for', 'the', 'weekend', '?'], ['just', 'a', 'head', 'up', ',', 'the', 'print', 'on', 'the', '3rd', 'flo', 'is', 'out', 'of', 'ton', 'again', '.'], ['can', 'someon', 'review', 'the', 'draft', 'propos', 'i', 'just', 'sent', 'ov', '?'], ['remind', ':', 'team', 'lunch', 'at', 'noon', ',', 'we', "'re", 'going', 'to', 'that', 'new', 'plac', 'downtown', '!'], ['has', 'anyon', 'seen', 'my', 'blu', 'notebook', '?', 'i', 'think', 'i', 'left', 'it', 'in', 'the', 'meet', 'room', '.'], ['import', ':', 'the', 'deadlin', 'for', 'project', 'submit', 'has', 'been', 'extend', 'to', 'friday', '.'], ['who', "'s", 'attend', 'the', 'cli', 'meet', 'tomorrow', '?', 'nee', 'to', 'fin', 'the', 'agend', '.'], ['good', 'morn', '!', 'cou

In [32]:
filtered = remove_stopwords(stem)
print(filtered)

[['hey', 'team', 'quick', 'remind', 'today', 'meet', 'is', 'at', 'am', 'in', 'conf', 'room', 'a'], ['doe', 'anyon', 'hav', 'the', 'latest', 'sal', 'report', 'nee', 'it', 'for', 'the', 'pres'], ['happy', 'friday', 'everyon', 'any', 'plan', 'for', 'the', 'weekend'], ['just', 'a', 'head', 'up', 'the', 'print', 'on', 'the', 'flo', 'is', 'out', 'of', 'ton', 'again'], ['can', 'someon', 'review', 'the', 'draft', 'propos', 'i', 'just', 'sent', 'ov'], ['remind', 'team', 'lunch', 'at', 'noon', 'we', 'going', 'to', 'that', 'new', 'plac', 'downtown'], ['has', 'anyon', 'seen', 'my', 'blu', 'notebook', 'i', 'think', 'i', 'left', 'it', 'in', 'the', 'meet', 'room'], ['import', 'the', 'deadlin', 'for', 'project', 'submit', 'has', 'been', 'extend', 'to', 'friday'], ['who', 'attend', 'the', 'cli', 'meet', 'tomorrow', 'nee', 'to', 'fin', 'the', 'agend'], ['good', 'morn', 'could', 'someon', 'ad', 'me', 'to', 'the', 'calend', 'invit', 'for', 'next', 'week', 'workshop'], ['emerg', 'maint', 'in', 'the', 'buil

In [33]:
corpus = token_matrix_to_string_array(filtered)
print(corpus)

['hey team quick remind today meet is at am in conf room a', 'doe anyon hav the latest sal report nee it for the pres', 'happy friday everyon any plan for the weekend', 'just a head up the print on the flo is out of ton again', 'can someon review the draft propos i just sent ov', 'remind team lunch at noon we going to that new plac downtown', 'has anyon seen my blu notebook i think i left it in the meet room', 'import the deadlin for project submit has been extend to friday', 'who attend the cli meet tomorrow nee to fin the agend', 'good morn could someon ad me to the calend invit for next week workshop', 'emerg maint in the build today expect som disrupt', 'just a remind to submit yo expens report by end of day', 'the on the flo seem to be down it is look into it', 'can someon bring an extr laptop charg to the meet room', 'congr to the market team for the success campaign launch', 'do forget to upd yo project stat on the shar driv', 'happy birthday to sarah from hr cak in the break ro

In [34]:
### bow ###
bow_vectorizer, bow_mat = bow_feature_extraction(corpus)
#print(bow_mat.toarray())
print(bow_vectorizer.get_feature_names_out())

['access' 'accord' 'account' 'achiev' 'act' 'ad' 'adv' 'afternoon' 'again'
 'agend' 'ahead' 'al' 'am' 'an' 'analyst' 'and' 'annount' 'any' 'anyon'
 'apolog' 'apply' 'approv' 'are' 'at' 'attend' 'audit' 'avail' 'await'
 'badg' 'bak' 'be' 'been' 'bin' 'birthday' 'blu' 'bonus' 'book'
 'brainstorm' 'break' 'brief' 'bring' 'budget' 'build' 'by' 'cafeter'
 'cak' 'cal' 'calend' 'campaign' 'can' 'cas' 'caus' 'ceo' 'chang' 'char'
 'charg' 'check' 'cle' 'cleanup' 'cli' 'clos' 'cod' 'coff' 'comfort'
 'commit' 'commut' 'company' 'complet' 'conduc' 'conf' 'confus' 'congr'
 'congrat' 'contact' 'correct' 'could' 'cov' 'cybersec' 'dat' 'day'
 'deadlin' 'dec' 'depart' 'deploy' 'detail' 'direct' 'dish' 'disrupt'
 'distribut' 'do' 'doe' 'don' 'down' 'downtim' 'downtown' 'draft' 'dress'
 'dril' 'driv' 'due' 'each' 'ear' 'east' 'effect' 'elev' 'els' 'emerg'
 'employ' 'end' 'ens' 'eod' 'ev' 'everyon' 'everyth' 'expect' 'expens'
 'expery' 'extend' 'extr' 'facil' 'famy' 'feedback' 'fin' 'fir' 'fix'
 'flex' 'f

In [35]:
### tf-idf ###
(tfidf_vectorizer, tfidf_mat) = tfidf_feature_extraction(corpus)
#print(tfidf_mat.toarray())

print('idf values:')
for x, y in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_):
    print(x, ':', y)

idf values:
access : 4.663561646129646
accord : 4.258096538021482
account : 4.663561646129646
achiev : 4.663561646129646
act : 4.663561646129646
ad : 4.663561646129646
adv : 4.663561646129646
afternoon : 3.970414465569701
again : 4.663561646129646
agend : 4.258096538021482
ahead : 4.258096538021482
al : 4.663561646129646
am : 3.970414465569701
an : 4.258096538021482
analyst : 4.663561646129646
and : 4.258096538021482
annount : 4.663561646129646
any : 4.258096538021482
anyon : 3.5649493574615367
apolog : 4.663561646129646
apply : 4.663561646129646
approv : 4.663561646129646
are : 4.663561646129646
at : 3.4107986776342782
attend : 4.663561646129646
audit : 4.258096538021482
avail : 3.970414465569701
await : 4.663561646129646
badg : 4.663561646129646
bak : 4.663561646129646
be : 3.159484249353372
been : 3.970414465569701
bin : 4.663561646129646
birthday : 4.663561646129646
blu : 4.663561646129646
bonus : 4.663561646129646
book : 4.663561646129646
brainstorm : 4.663561646129646
break : 3.9

In [36]:
### word2vec ###
model = Word2Vec(
    sentences=filtered,      # The corpus to train the model on
    vector_size=100,       # The size of the word vectors to be learned
    window=5,              # The size of the window of words to be considered
    min_count=5,           # The minimum frequency required for a word to be included in the vocabulary
    sg=0,                  # 0 for CBOW, 1 for skip-gram
    negative=5,            # The number of negative samples to use for negative sampling
    ns_exponent=0.75,      # The exponent used to shape the negative sampling distribution
    alpha=0.03,            # The initial learning rate
    min_alpha=0.0007,      # The minimum learning rate to which the learning rate will be linearly reduced
    epochs=30,             # The number of epochs (iterations) over the corpus
    workers=4,             # The number of worker threads to use for training the model
    seed=42,               # The seed for the random number generator
    max_vocab_size=None    # The maximum vocabulary size (None means no limit)
)

In [37]:
# Get the vector representation of a word
vector = model.wv['meet']
print(vector)

# Find the most similar words to a given word
similar_words = model.wv.most_similar('meet')
print(similar_words)

[ 1.28116994e-03 -3.88933299e-03 -1.65316940e-03 -2.67273327e-03
  9.30857903e-04 -1.91852741e-03  4.88758367e-03  8.13926163e-04
  3.26252496e-03  8.33176833e-04 -7.78434984e-03 -2.63418857e-04
  8.95656087e-03  1.99302493e-04 -6.34416565e-03  9.79908928e-03
  1.20929116e-03 -1.55783538e-02 -2.05054274e-03  5.74439357e-04
  4.09227004e-03  2.82573327e-03 -8.59381910e-03 -7.36961199e-04
  7.55546615e-03 -6.58677379e-03 -5.31418622e-03 -3.85440327e-03
 -1.46412002e-02  8.62685125e-03 -6.86553307e-03 -7.69572984e-03
  5.24776010e-03 -5.11310203e-03  8.03278200e-03 -1.45847015e-02
 -3.43723502e-03 -1.43447332e-02 -4.32930574e-05  5.18594589e-03
  9.30086389e-05 -6.87286723e-03 -4.30223532e-03 -2.25415290e-03
 -8.50134157e-03  5.03185811e-03  4.99610882e-03  1.73931830e-02
  7.63225788e-03 -9.22123063e-03 -2.42345012e-03  2.82681477e-03
 -1.99511973e-03  1.26010645e-03  2.73712003e-03 -1.02822306e-02
  1.01002799e-02  8.99000559e-03  1.08920420e-02  7.89088663e-03
  2.73213838e-04 -1.45674

In [38]:
filtered_token_mat = filtered[0:5]
token_mat = tokens[0:5]
print(filtered_token_mat)
print(token_mat)

[['hey', 'team', 'quick', 'remind', 'today', 'meet', 'is', 'at', 'am', 'in', 'conf', 'room', 'a'], ['doe', 'anyon', 'hav', 'the', 'latest', 'sal', 'report', 'nee', 'it', 'for', 'the', 'pres'], ['happy', 'friday', 'everyon', 'any', 'plan', 'for', 'the', 'weekend'], ['just', 'a', 'head', 'up', 'the', 'print', 'on', 'the', 'flo', 'is', 'out', 'of', 'ton', 'again'], ['can', 'someon', 'review', 'the', 'draft', 'propos', 'i', 'just', 'sent', 'ov']]
[['Hey', 'team', ',', 'quick', 'reminder', ':', 'today', "'s", 'meeting', 'is', 'at', '10', 'AM', 'in', 'Conference', 'Room', 'A', '.'], ['Does', 'anyone', 'have', 'the', 'latest', 'sales', 'report', '?', 'Need', 'it', 'for', 'the', 'presentation', '.'], ['Happy', 'Friday', 'everyone', '!', 'Any', 'plans', 'for', 'the', 'weekend', '?'], ['Just', 'a', 'heads', 'up', ',', 'the', 'printer', 'on', 'the', '3rd', 'floor', 'is', 'out', 'of', 'toner', 'again', '.'], ['Can', 'someone', 'review', 'the', 'draft', 'proposal', 'I', 'just', 'sent', 'over', '?']

In [39]:
### cyk ###
# Define the grammar in Chomsky Normal Form (CNF)
grammar = CFG.fromstring("""
    S -> NP VP | VP
    NP -> N | Det N | NP NP | NP PP | Pro | Adj NP
    VP -> V | V NP | V PP | Adv VP | VP Adv
    PP -> Prep NP
    Det -> 'the' | 'a'
    Adj -> 'quick' | 'latest' | 'happy' | 'any' | 'just'
    Adv -> 'just' | 'up' | 'out' | 'again' | 'ov'
    N -> 'hey' | 'team' | 'remind' | 'today' | 'meet' | 'am' | 'conf' | 'room' | 'a' | 'doe' | 'sal' | 'report' | 'nee' | 'pres' | 'friday' | 'plan' | 'weekend' | 'head' | 'print' | 'flo' | 'out' | 'ton' | 'can' | 'review' | 'draft' | 'propos'
    V -> 'team' | 'remind' | 'meet' | 'is' | 'am' | 'doe' | 'hav' | 'sal' | 'report' | 'nee' | 'plan' | 'head' | 'print' | 'can' | 'review' | 'draft' | 'sent'
    Prep -> 'through' | 'bef' | 'at' | 'in' | 'for' | 'on' | 'of'
    Pro -> 'i' | 'anyon' | 'it' | 'everyon' | 'any' | 'someon'
""")

In [40]:
# Initialize the parser
parser = ChartParser(grammar)

# Set data to parse
filtered_token_mat = filtered[0:5]

res_parses = []

# Parse each sentence in the
for token_arr in filtered_token_mat:
  parses = list(parser.parse(token_arr))
  res_parses.append(parses)

In [41]:
#@title Next Word Generator
sentene_dictionary = {'sentence 1':0, 'sentence 2':1, 'sentence 3':2, 'sentence 4':3, 'sentence 5':4}

sentence = 'sentence 5' #@param ['sentence 1', 'sentence 2', 'sentence 3', 'sentence 4', 'sentence 5']
i = sentene_dictionary[sentence]

print(token_matrix_to_string_array([filtered[i]]))

# Check if the sentence is in the language and print the parse trees
if res_parses[i]:
  print("Sentence is in the language.")
  print(f"Found {len(res_parses[i])} different parse trees.")
  for tree in res_parses[i]:
    tree.pretty_print()
else:
  print("Sentence is not in the language.")

['can someon review the draft propos i just sent ov']
Sentence is in the language.
Found 84 different parse trees.
                                      S                         
                                 _____|__________________        
                                NP                       |      
                        ________|___________             |       
                       NP                   |            |      
                  _____|______________      |            |       
                 NP                   |     |            |      
           ______|_________           |     |            |       
          NP               |          |     |            VP     
      ____|______          |          |     |         ___|____   
     NP          |         |          |     |        VP       | 
  ___|____       |         |          |     |    ____|___     |  
 NP       NP     NP        NP         NP    NP  |        VP   | 
 |        |      |      ___|____  