<a href="https://colab.research.google.com/github/tomfirer/NLP_Assignment1/blob/main/NLP_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#imports
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import time

import nltk
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import requests
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
def print_basic_statistics_from_df(df):
  """Print per-label message counts followed by word-level statistics.

  Args:
    df: DataFrame whose 'v1' column holds the label ('ham' / 'spam') and
      whose 'v2' column holds the message text.

  A label that never occurs in 'v1' is reported as 0 rather than raising
  KeyError.
  """
  print('----- Basic Statistics -----')
  print('Number of messages: {}'.format(df.shape[0]))
  # Count the labels once and reuse the result for both lines.
  label_counts = df['v1'].value_counts()
  print('\tHam messages: {}'.format(label_counts.get('ham', 0)))
  print('\tSpam messages: {}'.format(label_counts.get('spam', 0)))
  print_basic_statistics_from_sentences(df['v2'])


def print_basic_statistics_from_sentences(string_arr):
  """Print word-level statistics for a collection of messages.

  Reports the average number of words per message, the five most frequent
  words, and how many words occur exactly once.

  Args:
    string_arr: iterable of strings, one per message.
  """
  average_length = get_average_number_of_words(string_arr)
  print('Average number of words per message: {}'.format(average_length))

  # The (words, frequencies) pair is computed once and shared by both reports.
  frequency_info = get_word_frequency_info(string_arr)
  top_five = get_five_most_frequent_words(frequency_info)
  print('Five most frequent words: {}'.format(top_five))
  singletons = get_number_of_unique_words(frequency_info)
  print('Number of words that appear once: {}'.format(singletons))


def get_average_number_of_words(string_arr):
  """Return the mean number of whitespace-separated words per message.

  Args:
    string_arr: iterable of strings, one per message.

  Returns:
    The average word count as a float; 0.0 when `string_arr` is empty
    (instead of raising ZeroDivisionError).
  """
  num_of_words = [count_num_of_words(s) for s in string_arr]
  if not num_of_words:
    return 0.0
  return sum(num_of_words) / len(num_of_words)


def get_word_frequency_info(string_arr):
  """Count how often each whitespace-separated token occurs in the corpus.

  Args:
    string_arr: iterable of strings, one per message.

  Returns:
    Tuple (words, frequencies): `words` is the vocabulary array returned by
    CountVectorizer.get_feature_names_out() and `frequencies` is a 1-D array
    holding the total count of each word, in the same order.
  """
  corpus = np.array(string_arr)
  # Tokens come from str.split(); token_pattern is not used when a custom
  # tokenizer is supplied, so it is set to None explicitly.
  vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
  X = vectorizer.fit_transform(corpus)
  words = vectorizer.get_feature_names_out()
  # Sum the columns on the sparse matrix directly; X.toarray() would first
  # materialise a dense (documents x vocabulary) array only to reduce it.
  frequencies = np.asarray(X.sum(axis=0)).ravel()
  return (words, frequencies)


def get_number_of_unique_words(word_frequency_info):
  """Return how many words have a frequency of exactly 1.

  Args:
    word_frequency_info: (words, frequencies) pair; only the frequencies
      (index 1) are inspected.
  """
  frequencies = word_frequency_info[1]
  return sum(1 for count in frequencies if count == 1)


def get_five_most_frequent_words(word_frequency_info):
  """Return the (up to) five words with the highest frequency.

  Args:
    word_frequency_info: (words, frequencies) pair of equal-length sequences.

  Returns:
    List of words ordered by ascending frequency, so the most frequent word
    comes last. Words with equal frequency keep their original relative order.
  """
  words, frequencies = word_frequency_info[0], word_frequency_info[1]

  # sorted() is stable, so ranking the indices by frequency leaves ties in
  # index order.
  ranked_indices = sorted(range(len(frequencies)), key=lambda idx: frequencies[idx])
  return [words[idx] for idx in ranked_indices[-5:]]


def count_num_of_words(string):
  """Return the number of whitespace-separated words in `string`.

  Empty and whitespace-only strings yield 0, because str.split() with no
  arguments returns an empty list for them.
  """
  # The previous guard compared the method object `string.strip` (never
  # called) with "", so it was always False; split() already covers the
  # blank-string case, making a separate check unnecessary.
  return len(string.split())


def clean_data(df, column):
  """Normalise the text in `column`: lower-case it and drop noise tokens.

  Each entry is tokenized with nltk.word_tokenize. A token is kept only if it
  is purely alphabetic (str.isalpha) and its lower-cased form is not an
  English stopword; the surviving lower-cased tokens are re-joined with
  single spaces.

  Args:
    df: DataFrame containing `column`.
    column: name of the text column to clean.

  Returns:
    The same DataFrame object that was passed in; `column` is overwritten
    in place.
  """
  # A set gives constant-time membership tests; the lookup runs once per token.
  stopwords = set(nltk.corpus.stopwords.words('english'))

  cleaned_texts = []
  for text in df[column]:
    kept_tokens = []
    for token in nltk.word_tokenize(text):
      lowered = token.lower()
      if lowered not in stopwords and token.isalpha():
        kept_tokens.append(lowered)
    cleaned_texts.append(' '.join(kept_tokens))

  # replace original column with new filtered column
  df[column] = cleaned_texts
  return df


#receives an array of sentences and returns a matrix of tokens
def tokenize_using_nltk(sentence_arr):
  """Tokenize each sentence with nltk.word_tokenize and print the elapsed time.

  Args:
    sentence_arr: iterable of strings.

  Returns:
    List with one token list per input sentence, in input order.
  """
  started_at = time.time()
  token_matrix = [nltk.word_tokenize(sentence) for sentence in sentence_arr]
  elapsed = np.round((time.time() - started_at), decimals = 4)
  print('Time elapsed: {}[sec]'.format(elapsed))

  return token_matrix


#receives an array of sentences and returns a matrix of tokens
def tokenize_using_spacy(sentence_arr):
  """Tokenize each sentence with a spaCy Tokenizer and print the elapsed time.

  The tokenizer is constructed from the vocabulary of a blank English
  pipeline only; no other arguments are passed to Tokenizer.

  Args:
    sentence_arr: iterable of strings.

  Returns:
    List with one list of spaCy tokens per input sentence, in input order.
  """
  tokenizer = Tokenizer(English().vocab)

  # Only the tokenization itself is timed, not the tokenizer construction.
  started_at = time.time()
  token_matrix = [list(tokenizer(sentence)) for sentence in sentence_arr]
  elapsed = np.round((time.time() - started_at), decimals = 4)
  print('Time elapsed: {}[sec]'.format(elapsed))

  return token_matrix


def stem_using_nltk(token_mat):
  """Stem every token with NLTK's LancasterStemmer and print the elapsed time.

  Args:
    token_mat: iterable of token lists (one list per sentence).

  Returns:
    List of lists with the stemmed tokens, mirroring the input layout.
  """
  stemmer = LancasterStemmer()

  # Only the stemming itself is timed, not the stemmer construction.
  started_at = time.time()
  stemmed_matrix = [list(map(stemmer.stem, tokens)) for tokens in token_mat]
  elapsed = np.round((time.time() - started_at), decimals = 4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return stemmed_matrix


#turns a matrix of tokens into an array of strings
def token_matrix_to_string_array(mat):
  """Join each row of tokens into one space-separated string.

  Every token is passed through str() first, so rows may hold non-string
  token objects.

  Args:
    mat: iterable of token sequences.

  Returns:
    List of strings, one per row of `mat`.
  """
  return [' '.join(map(str, row)) for row in mat]


def lemmatize_using_nltk(token_mat):
  """Lemmatize every token with NLTK's WordNetLemmatizer and print the elapsed time.

  Each token is passed to lemmatize() on its own, with no part-of-speech
  argument.

  Args:
    token_mat: iterable of token lists (one list per sentence).

  Returns:
    List of lists with the lemmatized tokens, mirroring the input layout.
  """
  wordnet = WordNetLemmatizer()

  # Only the lemmatization itself is timed, not the lemmatizer construction.
  started_at = time.time()
  lemma_matrix = [[wordnet.lemmatize(token) for token in tokens] for tokens in token_mat]
  elapsed = np.round((time.time() - started_at), decimals = 4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return lemma_matrix


def lemmatize_using_spacy(string_arr):
  """Lemmatize raw strings with spaCy's "en_core_web_sm" pipeline and print the elapsed time.

  Unlike lemmatize_using_nltk, this takes the untokenized strings; the
  pipeline produces the tokens itself.

  Args:
    string_arr: iterable of strings.

  Returns:
    List with one list of lemma strings (token.lemma_) per input string.
  """
  pipeline = spacy.load("en_core_web_sm")

  # The timer starts after the model is loaded, so loading time is excluded.
  started_at = time.time()
  lemma_matrix = [[token.lemma_ for token in doc] for doc in pipeline.pipe(string_arr)]
  elapsed = np.round((time.time() - started_at), decimals = 4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return lemma_matrix

In [3]:
# Load the SMS spam dataset from an absolute path; adjust the path when
# running outside the environment this notebook was written in.
spam_df = pd.read_csv("/content/spam.csv", encoding = "ISO-8859-1")
# Drop the columns at positions 2, 3 and 4, leaving the label column 'v1' and
# the text column 'v2' (presumably the dropped ones are filler - verify against the CSV).
spam_df = spam_df.drop(spam_df.columns[[2, 3, 4]], axis = 'columns')
# Lower-case the message text and strip stopwords / non-alphabetic tokens.
spam_df = clean_data(spam_df, 'v2')
spam_df

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though
...,...,...
5567,spam,time tried contact u pound prize claim easy ca...
5568,ham,b going esplanade fr home
5569,ham,pity mood suggestions
5570,ham,guy bitching acted like interested buying some...


In [4]:
# Baseline statistics on the cleaned spam data, before any tokenizer/stemmer/lemmatizer is applied.
print_basic_statistics_from_df(spam_df)

----- Basic Statistics -----
Number of messages: 5572
	Ham messages: 4825
	Spam messages: 747
Average number of words per message: 8.375987078248384
Five most frequent words: ['gt', 'ur', 'get', 'call', 'u']
Number of words that appear once: 3588


In [5]:
### Performing Tokenization on the Spam File Using NLTK ###
nltk_tokens = tokenize_using_nltk(spam_df['v2'])
# Show the first five token lists and the Python type of a single token.
print('\n{}'.format(nltk_tokens[0:5]))
print('Type of token: {}\n'.format(type(nltk_tokens[0][0])))

# Re-join the tokens into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_tokens))

Time elapsed: 0.9811[sec]

[['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat'], ['ok', 'lar', 'joking', 'wif', 'u', 'oni'], ['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply'], ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'], ['nah', 'think', 'goes', 'usf', 'lives', 'around', 'though']]
Type of token: <class 'str'>

Average number of words per message: 8.375987078248384
Five most frequent words: ['gt', 'ur', 'get', 'call', 'u']
Number of words that appear once: 3588


In [6]:
### Performing Tokenization on the Spam File Using SpaCy ###
spacy_tokens = tokenize_using_spacy(spam_df['v2'])
# Show the first five token lists and the Python type of a single token.
print('\n{}'.format(spacy_tokens[0:5]))
print('Type of token: {}\n'.format(type(spacy_tokens[0][0])))

# token_matrix_to_string_array applies str() to each token, so the spaCy
# token objects are converted to plain text before the statistics are computed.
print_basic_statistics_from_sentences(token_matrix_to_string_array(spacy_tokens))

Time elapsed: 0.6626[sec]

[[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat], [ok, lar, joking, wif, u, oni], [free, entry, wkly, comp, win, fa, cup, final, tkts, may, text, fa, receive, entry, question, std, txt, rate, c, apply], [u, dun, say, early, hor, u, c, already, say], [nah, think, goes, usf, lives, around, though]]
Type of token: <class 'spacy.tokens.token.Token'>

Average number of words per message: 8.375987078248384
Five most frequent words: ['gt', 'ur', 'get', 'call', 'u']
Number of words that appear once: 3588


The two tokenization techniques have similar processing speeds.<br>
In both techniques, the processing speed depends on the size of the text.<br>
The NLTK library outputs tokens of type `<class 'str'>`, while the spaCy library outputs tokens of type `<class 'spacy.tokens.token.Token'>`, which is specific to the library.<br>
Another difference between NLTK and spaCy is their handling of punctuation. NLTK separates punctuation into individual tokens; for example, 'point,' is tokenized as 'point' and ','. In contrast, spaCy as used here (a bare `Tokenizer(nlp.vocab)` with no prefix/suffix rules, which splits on whitespace only) keeps punctuation attached to the word, so 'point,' remains a single token.

In [7]:
### Performing Stemming on the Spam File Using NLTK ###
# Stemming runs on the NLTK tokens produced earlier in the notebook.
nltk_stemmed = stem_using_nltk(nltk_tokens)
# Show the first five stemmed token lists and the Python type of a single stem.
print('\n{}'.format(nltk_stemmed[0:5]))
print('\nType of token: {}'.format(type(nltk_stemmed[0][0])))

# Re-join the stems into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_stemmed))

Time elapsed: 0.5581[sec]

[['go', 'jurong', 'point', 'crazy', 'avail', 'bug', 'n', 'gre', 'world', 'la', 'e', 'buffet', 'cin', 'got', 'am', 'wat'], ['ok', 'lar', 'jok', 'wif', 'u', 'on'], ['fre', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'fin', 'tkts', 'may', 'text', 'fa', 'receiv', 'entry', 'quest', 'std', 'txt', 'rat', 'c', 'apply'], ['u', 'dun', 'say', 'ear', 'hor', 'u', 'c', 'already', 'say'], ['nah', 'think', 'goe', 'usf', 'liv', 'around', 'though']]

Type of token: <class 'str'>
Average number of words per message: 8.375987078248384
Five most frequent words: ['gt', 'ur', 'get', 'cal', 'u']
Number of words that appear once: 2431


The running time of the stemming depends on the size of the tokenized text.<br> Stemming involves applying rules to reduce tokens to their root forms; adding more rules and handling irregular forms can add complexity and worsen processing time.<br>
Just as with its tokenization method, NLTK outputs stemmed tokens of type `<class 'str'>`.<br>
As far as I could find online, spaCy does not have a stemming method, so I couldn't apply it in this assignment.

In [8]:
### Performing Lemmatization on the Spam File Using NLTK ###
# Lemmatization runs on the NLTK tokens produced earlier in the notebook.
nltk_lemmatized = lemmatize_using_nltk(nltk_tokens)
# Show the first five lemmatized token lists and the Python type of a single lemma.
print('\n{}'.format(nltk_lemmatized[0:5]))
print('Type of token: {}\n'.format(type(nltk_lemmatized[0][0])))

# Re-join the lemmas into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_lemmatized))

Time elapsed: 1.7149[sec]

[['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat'], ['ok', 'lar', 'joking', 'wif', 'u', 'oni'], ['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply'], ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'], ['nah', 'think', 'go', 'usf', 'life', 'around', 'though']]
Type of token: <class 'str'>

Average number of words per message: 8.375987078248384
Five most frequent words: ['gt', 'ur', 'get', 'call', 'u']
Number of words that appear once: 3293


In [9]:
### Performing Lemmatization on the Spam File Using SpaCy ###
# The spaCy helper takes the cleaned message strings, not a token matrix.
spacy_lemmatized = lemmatize_using_spacy(spam_df['v2'])
# Show the first five lemma lists and the Python type of a single lemma.
print('\n{}'.format(spacy_lemmatized[0:5]))
print('Type of token: {}\n'.format(type(spacy_lemmatized[0][0])))

# Re-join the lemmas into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(spacy_lemmatized))

Time elapsed: 7.4103[sec]

[['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'get', 'amore', 'wat'], ['ok', 'lar', 'joke', 'wif', 'u', 'oni'], ['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply'], ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'], ['nah', 'think', 'go', 'usf', 'live', 'around', 'though']]
Type of token: <class 'str'>

Average number of words per message: 8.46859296482412
Five most frequent words: ['ur', 'go', 'call', 'get', 'u']
Number of words that appear once: 3081


The outputs of both techniques are of type `<class 'str'>`.<br>
The processing speed of NLTK is substantially faster than that of spaCy.<br> NLTK uses a simpler and more direct implementation of lemmatization, while spaCy's lemmatization process is more complex and includes additional features like POS tagging, NER, etc. to find the base forms of the text.<br>
NLTK lemmatizes the tokens, while spaCy works on the original text (when I tried giving it the tokens from the previous question, it didn't work).<br>
In the lemmatization process, we're trying to reduce the words in the text to their root forms based on a dictionary. The running time is affected by the length of the text to be lemmatized as well as the size of the vocabulary.

In [10]:
### Loading data from URL link ###
url = 'https://www.rottentomatoes.com/'
# Without a timeout, requests waits indefinitely on an unresponsive server
# and the cell would hang.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    web_content = response.text
    #parse the HTML Content
    soup = BeautifulSoup(web_content, 'html.parser')
    #extract All Text
    all_text = soup.get_text(separator='\n', strip=True)
    #split the text into lines
    text_lines = all_text.split('\n')
    #remove empty lines
    text_lines = [line for line in text_lines if line.strip() != ""]

    # One row per text line; the 'Text' column is then cleaned like the spam data.
    url_df = pd.DataFrame(text_lines, columns=['Text'])
    url_df = clean_data(url_df, 'Text')
    print(url_df)
else:
    # url_df is not defined on this path, so the cells that use it cannot run.
    print("Failed to retrieve the webpage.")

                                                  Text
0    rotten tomatoes movies tv shows movie trailers...
1                                               signed
2                                    skip main content
3                                               cancel
4                                            movies tv
..                                                 ...
640                                    cookie settings
641                                  california notice
642                                         ad choices
643                                      accessibility
644                 copyright fandango rights reserved

[645 rows x 1 columns]


In [11]:
# Baseline statistics on the cleaned web-page text, one "message" per text line.
print_basic_statistics_from_sentences(url_df['Text'])

Average number of words per message: 1.5891472868217054
Five most frequent words: ['movie', 'view', 'season', 'tv', 'movies']
Number of words that appear once: 148


In [12]:
### Performing Tokenization on the URL Data Using NLTK ###
url_tokens = tokenize_using_nltk(url_df['Text'])
# Show the first five token lists.
print('\n{}\n'.format(url_tokens[0:5]))

# Re-join the tokens into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_tokens))

Time elapsed: 0.0604[sec]

[['rotten', 'tomatoes', 'movies', 'tv', 'shows', 'movie', 'trailers', 'reviews', 'rotten', 'tomatoes'], ['signed'], ['skip', 'main', 'content'], ['cancel'], ['movies', 'tv']]

Average number of words per message: 1.5891472868217054
Five most frequent words: ['movie', 'view', 'season', 'tv', 'movies']
Number of words that appear once: 148


In [13]:
### Performing Stemming on the URL Data Using NLTK ###
# Stemming runs on the URL tokens produced by the tokenization cell.
url_stemmed = stem_using_nltk(url_tokens)
# Show the first five stemmed token lists.
print('\n{}\n'.format(url_stemmed[0:5]))

# Re-join the stems into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_stemmed))

Time elapsed: 0.0212[sec]

[['rot', 'tomato', 'movy', 'tv', 'show', 'movy', 'trail', 'review', 'rot', 'tomato'], ['sign'], ['skip', 'main', 'cont'], ['cancel'], ['movy', 'tv']]

Average number of words per message: 1.5891472868217054
Five most frequent words: ['new', 'view', 'season', 'tv', 'movy']
Number of words that appear once: 122


In [14]:
### Performing Lemmatization on the URL Data Using NLTK ###
# Lemmatization runs on the URL tokens produced by the tokenization cell.
url_lemmatized = lemmatize_using_nltk(url_tokens)
# Show the first five lemmatized token lists.
print('\n{}\n'.format(url_lemmatized[0:5]))

# Re-join the lemmas into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_lemmatized))

Time elapsed: 0.0089[sec]

[['rotten', 'tomato', 'movie', 'tv', 'show', 'movie', 'trailer', 'review', 'rotten', 'tomato'], ['signed'], ['skip', 'main', 'content'], ['cancel'], ['movie', 'tv']]

Average number of words per message: 1.5891472868217054
Five most frequent words: ['boy', 'view', 'season', 'tv', 'movie']
Number of words that appear once: 135


In [15]:
### Loading data from txt file ###
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('whatsapp_messages.txt', 'r', encoding='utf-8') as file:
    # One message per line; trailing newlines are discarded later by clean_data's tokenization.
    messages = file.readlines()

whatsapp_df = pd.DataFrame(messages, columns=['Messages'])
# Lower-case the messages and strip stopwords / non-alphabetic tokens.
whatsapp_df = clean_data(whatsapp_df, 'Messages')
whatsapp_df

Unnamed: 0,Messages
0,hey team quick reminder today meeting conferen...
1,anyone latest sales report need presentation
2,happy friday everyone plans weekend
3,heads printer floor toner
4,someone review draft proposal sent
...,...
72,update network upgrade scheduled weekend minim...
73,congrats accounting team closing books ahead s...
74,reminder town hall meeting next tuesday submit...
75,hr update wellness program launches next week ...


In [16]:
# Baseline statistics on the cleaned WhatsApp messages.
print_basic_statistics_from_sentences(whatsapp_df['Messages'])

Average number of words per message: 7.038961038961039
Five most frequent words: ['tomorrow', 'update', 'team', 'next', 'reminder']
Number of words that appear once: 226


In [17]:
### Performing Tokenization on the Whatsapp Data Using NLTK ###
whatsapp_tokens = tokenize_using_nltk(whatsapp_df['Messages'])
# Show the first five token lists.
print('\n{}\n'.format(whatsapp_tokens[0:5]))

# Re-join the tokens into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_tokens))

Time elapsed: 0.0147[sec]

[['hey', 'team', 'quick', 'reminder', 'today', 'meeting', 'conference', 'room'], ['anyone', 'latest', 'sales', 'report', 'need', 'presentation'], ['happy', 'friday', 'everyone', 'plans', 'weekend'], ['heads', 'printer', 'floor', 'toner'], ['someone', 'review', 'draft', 'proposal', 'sent']]

Average number of words per message: 7.038961038961039
Five most frequent words: ['tomorrow', 'update', 'team', 'next', 'reminder']
Number of words that appear once: 226


In [18]:
### Performing Stemming on the Whatsapp Data Using NLTK ###
# Stemming runs on the WhatsApp tokens produced by the tokenization cell.
whatsapp_stemmed = stem_using_nltk(whatsapp_tokens)
# Show the first five stemmed token lists.
print('\n{}\n'.format(whatsapp_stemmed[0:5]))

# Re-join the stems into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_stemmed))

Time elapsed: 0.0115[sec]

[['hey', 'team', 'quick', 'remind', 'today', 'meet', 'conf', 'room'], ['anyon', 'latest', 'sal', 'report', 'nee', 'pres'], ['happy', 'friday', 'everyon', 'plan', 'weekend'], ['head', 'print', 'flo', 'ton'], ['someon', 'review', 'draft', 'propos', 'sent']]

Average number of words per message: 7.038961038961039
Five most frequent words: ['tomorrow', 'upd', 'team', 'next', 'remind']
Number of words that appear once: 196


In [19]:
### Performing Lemmatization on the Whatsapp Data Using NLTK ###
# Lemmatization runs on the WhatsApp tokens produced by the tokenization cell.
whatsapp_lemmatized = lemmatize_using_nltk(whatsapp_tokens)
# Show the first five lemmatized token lists.
print('\n{}\n'.format(whatsapp_lemmatized[0:5]))

# Re-join the lemmas into strings so the same statistics helper can be reused.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_lemmatized))

Time elapsed: 0.0022[sec]

[['hey', 'team', 'quick', 'reminder', 'today', 'meeting', 'conference', 'room'], ['anyone', 'latest', 'sale', 'report', 'need', 'presentation'], ['happy', 'friday', 'everyone', 'plan', 'weekend'], ['head', 'printer', 'floor', 'toner'], ['someone', 'review', 'draft', 'proposal', 'sent']]

Average number of words per message: 7.038961038961039
Five most frequent words: ['tomorrow', 'update', 'team', 'next', 'reminder']
Number of words that appear once: 212


The solution for the whatsapp messages is done in English because I couldn't find a stemming/lemmatization implementation for Hebrew.