<a href="https://colab.research.google.com/github/tomfirer/NLP_Assignment1/blob/main/NLP_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#imports
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import time

import nltk
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('punkt')
nltk.download('wordnet')

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import requests
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
def print_basic_statistics_from_df(df):
  """Print corpus-level statistics for a labelled message DataFrame.

  Args:
    df: DataFrame whose 'v1' column holds the labels ('ham' / 'spam') and
      whose 'v2' column holds the message text.

  Prints the number of rows, the number of ham and spam rows, and then the
  word statistics of the 'v2' column via print_basic_statistics_from_sentences.
  """
  print('----- Basic Statistics -----')
  print('Number of messages: {}'.format(df.shape[0]))
  # Count the labels once. .get() reports 0 for a label that does not occur
  # instead of raising KeyError.
  label_counts = df['v1'].value_counts()
  print('\tHam messages: {}'.format(label_counts.get('ham', 0)))
  print('\tSpam messages: {}'.format(label_counts.get('spam', 0)))
  print_basic_statistics_from_sentences(df['v2'])


def print_basic_statistics_from_sentences(string_arr):
  """Print word-level statistics for a collection of sentences.

  Args:
    string_arr: iterable of strings, one sentence/message per element.

  Prints the average number of words per element, the five most frequent
  words, and the number of words that occur exactly once.
  """
  average_length = get_average_number_of_words(string_arr)
  print('Average number of words per message: {}'.format(average_length))

  # The (words, frequencies) pair is computed once and shared by both reports.
  frequency_info = get_word_frequency_info(string_arr)
  top_words = get_five_most_frequent_words(frequency_info)
  print('Five most frequent words: {}'.format(top_words))
  singleton_count = get_number_of_unique_words(frequency_info)
  print('Number of words that appear once: {}'.format(singleton_count))


def get_average_number_of_words(string_arr):
  """Return the mean number of whitespace-separated words per string.

  Args:
    string_arr: iterable of strings.

  Returns:
    The average word count, or 0.0 when string_arr is empty (an empty input
    would otherwise divide by zero).
  """
  word_counts = [count_num_of_words(s) for s in string_arr]
  if not word_counts:
    return 0.0
  return sum(word_counts) / len(word_counts)


def get_word_frequency_info(string_arr):
  """Count how often each whitespace-separated word occurs in the corpus.

  Args:
    string_arr: iterable of strings, one document per element.

  Returns:
    A (words, frequencies) tuple of aligned 1-D arrays: words is the
    vocabulary reported by CountVectorizer and frequencies[i] is the total
    number of occurrences of words[i] summed over all documents.
  """
  corpus = np.array(string_arr)
  # The custom tokenizer splits on whitespace only; token_pattern=None is
  # passed because the default regex is unused once a tokenizer is supplied.
  vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
  X = vectorizer.fit_transform(corpus)
  words = vectorizer.get_feature_names_out()
  # Sum the columns on the sparse matrix directly. Converting it to a dense
  # (documents x vocabulary) array first only costs memory.
  frequencies = np.asarray(X.sum(axis=0)).ravel()
  return (words, frequencies)


def get_number_of_unique_words(word_frequency_info):
  """Return how many words have a frequency of exactly 1.

  Args:
    word_frequency_info: (words, frequencies) pair; only the frequencies
      (index 1) are read.
  """
  frequencies = word_frequency_info[1]
  return sum(1 for count in frequencies if count == 1)


def get_five_most_frequent_words(word_frequency_info):
  """Return the (up to) five words with the highest frequencies.

  Args:
    word_frequency_info: (words, frequencies) pair of aligned sequences.

  Returns:
    A list of words ordered by ascending frequency, so the most frequent
    word comes last. Words with equal frequency keep their original order.
  """
  words, frequencies = word_frequency_info[0], word_frequency_info[1]

  # sorted() is stable, so ordering the positions by frequency alone keeps
  # ties in their original relative order.
  positions_by_frequency = sorted(range(len(frequencies)), key=lambda i: frequencies[i])
  return [words[i] for i in positions_by_frequency[-5:]]


def count_num_of_words(string):
  """Return the number of whitespace-separated words in string.

  Empty and whitespace-only strings yield 0, because str.split() with no
  arguments returns an empty list for them. No separate emptiness check is
  needed.
  """
  return len(string.split())


def clean_data(df, column):
  """Strip non-word characters from a text column of df.

  Each value of df[column] is tokenized with the word-character pattern
  given to RegexpTokenizer, and the resulting tokens are re-joined with
  single spaces.

  Args:
    df: DataFrame to clean. It is modified in place.
    column: name of the text column to rewrite.

  Returns:
    The same DataFrame object, with df[column] replaced by the cleaned text.
  """
  word_tokenizer = RegexpTokenizer(r'\w+')
  cleaned_values = []
  for text in df[column]:
    cleaned_values.append(" ".join(word_tokenizer.tokenize(text)))
  df[column] = cleaned_values
  return df


#receives an array of sentences and returns a matrix of tokens
def tokenize_using_nltk(sentence_arr):
  """Tokenize every sentence with nltk.word_tokenize and report the time taken.

  Args:
    sentence_arr: iterable of strings.

  Returns:
    A list with one token list per input sentence, in input order.
  """
  started_at = time.time()
  token_matrix = [nltk.word_tokenize(sentence) for sentence in sentence_arr]
  elapsed = np.round(time.time() - started_at, decimals=4)
  print('Time elapsed: {}[sec]'.format(elapsed))

  return token_matrix


#receives an array of sentences and returns a matrix of tokens
def tokenize_using_spacy(sentence_arr):
  """Tokenize every sentence with a spaCy Tokenizer and report the time taken.

  Args:
    sentence_arr: iterable of strings.

  Returns:
    A list with one list of spaCy tokens per input sentence, in input order.
  """
  nlp = English()
  # NOTE(review): the Tokenizer is constructed from the vocabulary only, with
  # no prefix/suffix/infix rules passed in. nlp.tokenizer is the pipeline's own
  # tokenizer; confirm which of the two is intended for the comparison.
  vocab_only_tokenizer = Tokenizer(nlp.vocab)
  # Only the tokenization itself is timed, not the setup above.
  started_at = time.time()
  token_matrix = [list(vocab_only_tokenizer(sentence)) for sentence in sentence_arr]
  elapsed = np.round(time.time() - started_at, decimals=4)
  print('Time elapsed: {}[sec]'.format(elapsed))

  return token_matrix


def stem_using_nltk(token_mat):
  """Stem every token with NLTK's LancasterStemmer and report the time taken.

  Args:
    token_mat: iterable of token lists (one list per sentence).

  Returns:
    A list of lists with the same shape as token_mat, holding the stems.
  """
  stemmer = LancasterStemmer()
  # Only the stemming loop is timed, not the stemmer construction.
  started_at = time.time()
  stemmed_matrix = [[stemmer.stem(token) for token in tokens] for tokens in token_mat]
  elapsed = np.round(time.time() - started_at, decimals=4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return stemmed_matrix


#turns a matrix of tokens into an array of strings
def token_matrix_to_string_array(mat):
  """Join each row of tokens into one space-separated string.

  Tokens are passed through str() first, so rows may contain non-string
  token objects.

  Args:
    mat: iterable of token sequences.

  Returns:
    A list with one joined string per row of mat.
  """
  sentences = []
  for token_arr in mat:
    sentences.append(' '.join(map(str, token_arr)))
  return sentences


def lemmatize_using_nltk(token_mat):
  """Lemmatize every token with NLTK's WordNetLemmatizer and report the time taken.

  lemmatize() is called with the token only, so the lemmatizer's default
  part of speech applies to every token.

  Args:
    token_mat: iterable of token lists (one list per sentence).

  Returns:
    A list of lists with the same shape as token_mat, holding the lemmas.
  """
  wordnet_lemmatizer = WordNetLemmatizer()
  # Only the lemmatization loop is timed, not the lemmatizer construction.
  started_at = time.time()
  lemma_matrix = [[wordnet_lemmatizer.lemmatize(token) for token in tokens] for tokens in token_mat]
  elapsed = np.round(time.time() - started_at, decimals=4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return lemma_matrix


def lemmatize_using_spacy(string_arr):
  """Lemmatize raw strings with spaCy's en_core_web_sm pipeline and report the time taken.

  Unlike lemmatize_using_nltk, this takes the original strings rather than
  token lists; the pipeline produces its own tokens.

  Args:
    string_arr: iterable of strings.

  Returns:
    A list with one list of lemma strings per input string.
  """
  nlp = spacy.load("en_core_web_sm")
  # Loading the model happens before the clock starts, so it is not timed.
  started_at = time.time()
  lemma_matrix = [[token.lemma_ for token in doc] for doc in nlp.pipe(string_arr)]
  elapsed = np.round(time.time() - started_at, decimals=4)

  print('Time elapsed: {}[sec]'.format(elapsed))
  return lemma_matrix

In [5]:
spam_df = pd.read_csv("/content/spam.csv", encoding = "ISO-8859-1")
# The columns at positions 2-4 are dropped, leaving the label (v1) and text (v2)
# columns; the text is then stripped of punctuation.
spam_df = clean_data(spam_df.drop(columns = spam_df.columns[[2, 3, 4]]), 'v2')
spam_df

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I don t think he goes to usf he lives arou...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home
5569,ham,Pity was in mood for that So any other suggest...
5570,ham,The guy did some bitching but I acted like i d...


In [6]:
# Baseline for the cleaned spam data: message/label counts plus word statistics.
print_basic_statistics_from_df(spam_df)

----- Basic Statistics -----
Number of messages: 5572
	Ham messages: 4825
	Spam messages: 747
Average number of words per message: 16.17121320890165
Five most frequent words: ['the', 'a', 'you', 'to', 'i']
Number of words that appear once: 4376


In [8]:
### Performing Tokenization on the Spam File Using NLTK ###
# Tokenize every cleaned message; the helper also prints the elapsed time.
nltk_tokens = tokenize_using_nltk(spam_df['v2'])
# Show the first five token lists and the Python type of a single token.
print('\n{}'.format(nltk_tokens[0:5]))
print('Type of token: {}\n'.format(type(nltk_tokens[0][0])))

# The statistics helpers work on strings, so each token list is re-joined first.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_tokens))

Time elapsed: 0.6491[sec]

[['Go', 'until', 'jurong', 'point', 'crazy', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'got', 'amore', 'wat'], ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'], ['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'T', 'C', 's', 'apply', '08452810075over18', 's'], ['U', 'dun', 'say', 'so', 'early', 'hor', 'U', 'c', 'already', 'then', 'say'], ['Nah', 'I', 'don', 't', 'think', 'he', 'goes', 'to', 'usf', 'he', 'lives', 'around', 'here', 'though']]
Type of token: <class 'str'>

Average number of words per message: 16.193467336683415
Five most frequent words: ['the', 'a', 'you', 'to', 'i']
Number of words that appear once: 4375


In [9]:
### Performing Tokenization on the Spam File Using SpaCy ###
# Tokenize the same cleaned messages with spaCy; the helper prints the elapsed time.
spacy_tokens = tokenize_using_spacy(spam_df['v2'])
# Show the first five token lists and the type of a single token.
print('\n{}'.format(spacy_tokens[0:5]))
print('Type of token: {}\n'.format(type(spacy_tokens[0][0])))

# token_matrix_to_string_array applies str() to each token before joining,
# so the spaCy token objects can feed the string-based statistics.
print_basic_statistics_from_sentences(token_matrix_to_string_array(spacy_tokens))

Time elapsed: 1.0077[sec]

[[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Cine, there, got, amore, wat], [Ok, lar, Joking, wif, u, oni], [Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to, 87121, to, receive, entry, question, std, txt, rate, T, C, s, apply, 08452810075over18, s], [U, dun, say, so, early, hor, U, c, already, then, say], [Nah, I, don, t, think, he, goes, to, usf, he, lives, around, here, though]]
Type of token: <class 'spacy.tokens.token.Token'>

Average number of words per message: 16.17121320890165
Five most frequent words: ['the', 'a', 'you', 'to', 'i']
Number of words that appear once: 4376


The two tokenization techniques have similar processing speeds.<br>
In both techniques, the processing speed depends on the size of the text.<br>
The NLTK library outputs tokens of type `str`, while the spaCy library outputs tokens of type `spacy.tokens.token.Token`, which is specific to the library.<br>
Another difference between NLTK and spaCy is their handling of punctuation. NLTK separates punctuation into individual tokens; for example, 'point,' is tokenized as 'point' and ','. The spaCy tokenizer used here is built from the vocabulary alone (`Tokenizer(nlp.vocab)`), so it splits only on whitespace and keeps punctuation attached to the word: 'point,' remains a single token.

In [10]:
### Performing Stemming on the Spam File Using NLTK ###
# Stem the NLTK tokens from the tokenization step; the helper prints the elapsed time.
nltk_stemmed = stem_using_nltk(nltk_tokens)
# Show the first five stemmed token lists and the type of a single stem.
print('\n{}'.format(nltk_stemmed[0:5]))
print('\nType of token: {}'.format(type(nltk_stemmed[0][0])))

# Recompute the word statistics on the stemmed text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_stemmed))

Time elapsed: 0.9103[sec]

[['go', 'until', 'jurong', 'point', 'crazy', 'avail', 'on', 'in', 'bug', 'n', 'gre', 'world', 'la', 'e', 'buffet', 'cin', 'ther', 'got', 'am', 'wat'], ['ok', 'lar', 'jok', 'wif', 'u', 'on']]

Type of token: <class 'str'>
Average number of words per message: 16.193467336683415
Five most frequent words: ['the', 'a', 'you', 'to', 'i']
Number of words that appear once: 3169


The running time of the stemming depends on the size of the tokenized text.<br> Stemming applies rules to reduce tokens to their root forms; adding more rules and handling irregular forms adds complexity and can worsen processing time.<br>
Just as with its tokenization method, NLTK outputs stemmed tokens of type `str`.<br>
From what I've searched online, spaCy does not have a stemming method, so I couldn't apply it in this assignment.

In [11]:
### Performing Lemmatization on the Spam File Using NLTK ###
# Lemmatize the NLTK tokens from the tokenization step; the helper prints the elapsed time.
nltk_lemmatized = lemmatize_using_nltk(nltk_tokens)
# Show the first five lemmatized token lists and the type of a single lemma.
print('\n{}'.format(nltk_lemmatized[0:5]))
print('Type of token: {}\n'.format(type(nltk_lemmatized[0][0])))

# Recompute the word statistics on the lemmatized text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(nltk_lemmatized))

Time elapsed: 2.1561[sec]

[['Go', 'until', 'jurong', 'point', 'crazy', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'got', 'amore', 'wat'], ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni']]
Type of token: <class 'str'>

Average number of words per message: 16.193467336683415
Five most frequent words: ['the', 'a', 'you', 'to', 'i']
Number of words that appear once: 4161


In [12]:
### Performing Lemmatization on the Spam File Using SpaCy ###
# The spaCy helper takes the cleaned message strings, not the earlier token lists.
spacy_lemmatized = lemmatize_using_spacy(spam_df['v2'])
# Show the first five lemma lists and the type of a single lemma.
print('\n{}'.format(spacy_lemmatized[0:5]))
print('Type of token: {}\n'.format(type(spacy_lemmatized[0][0])))

# Recompute the word statistics on the lemmatized text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(spacy_lemmatized))

Time elapsed: 14.0803[sec]

[['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'get', 'amore', 'wat'], ['ok', 'lar', 'Joking', 'wif', 'u', 'oni']]
Type of token: <class 'str'>

Average number of words per message: 16.310122038765254
Five most frequent words: ['a', 'you', 'to', 'be', 'i']
Number of words that appear once: 3903


The outputs of both techniques are of type `str`.<br>
The processing speed of NLTK is substantially quicker than that of spaCy.<br> NLTK uses a simpler and more direct implementation of lemmatization algorithms, while spaCy's lemmatization process is more complex and includes additional features like POS tagging, NER, etc. to find the base forms of the text.<br>
NLTK lemmatizes using the tokens, while spaCy works on the original text (when I tried giving it the tokens from the previous question it didn't work).<br>
In the lemmatization process, we're trying to reduce the words in the text to their root forms based on a dictionary. The running time is affected by the length of the text to be lemmatized as well as the size of the vocabulary.

In [14]:
### Loading data from URL link ###
url = 'https://www.rottentomatoes.com/'
# Without a timeout, requests waits indefinitely for an unresponsive server.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    web_content = response.text
    #parse the HTML Content
    soup = BeautifulSoup(web_content, 'html.parser')
    #extract All Text
    all_text = soup.get_text(separator='\n', strip=True)
    #split the text into lines
    text_lines = all_text.split('\n')
    #remove empty lines
    text_lines = [line for line in text_lines if line.strip() != ""]

    #one DataFrame row per remaining line, then strip punctuation
    url_df = pd.DataFrame(text_lines, columns=['Text'])
    url_df = clean_data(url_df, 'Text')
    print(url_df)
else:
    #url_df is not defined on this path, so the cells that use it cannot run
    print("Failed to retrieve the webpage.")

                                                  Text
0    Rotten Tomatoes Movies TV Shows Movie Trailers...
1                                            Signed in
2                                 Skip to Main Content
3                                               Cancel
4                                            Movies TV
..                                                 ...
650                                    Cookie Settings
651                                  California Notice
652                                         Ad Choices
653                                      Accessibility
654             Copyright Fandango All rights reserved

[655 rows x 1 columns]


In [15]:
# Baseline word statistics for the scraped page text, before tokenization.
# url_df exists only if the request in the loading cell returned status 200.
print_basic_statistics_from_sentences(url_df['Text'])

Average number of words per message: 2.5389312977099237
Five most frequent words: ['season', 'all', 'to', 'of', 'the']
Number of words that appear once: 172


In [16]:
### Performing Tokenization on the URL Data Using NLTK ###
# Tokenize every cleaned line of page text; the helper prints the elapsed time.
url_tokens = tokenize_using_nltk(url_df['Text'])
# Show the first five token lists.
print('\n{}\n'.format(url_tokens[0:5]))

# The statistics helpers work on strings, so each token list is re-joined first.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_tokens))

Time elapsed: 0.0512[sec]

[['Rotten', 'Tomatoes', 'Movies', 'TV', 'Shows', 'Movie', 'Trailers', 'Reviews', 'Rotten', 'Tomatoes'], ['Signed', 'in'], ['Skip', 'to', 'Main', 'Content'], ['Cancel'], ['Movies', 'TV']]

Average number of words per message: 2.5389312977099237
Five most frequent words: ['season', 'all', 'to', 'of', 'the']
Number of words that appear once: 172


In [17]:
### Performing Stemming on the URL Data Using NLTK ###
# Stem the tokens from the URL tokenization step; the helper prints the elapsed time.
url_stemmed = stem_using_nltk(url_tokens)
# Show the first five stemmed token lists.
print('\n{}\n'.format(url_stemmed[0:5]))

# Recompute the word statistics on the stemmed text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_stemmed))

Time elapsed: 0.0241[sec]

[['rot', 'tomato', 'movy', 'tv', 'show', 'movy', 'trail', 'review', 'rot', 'tomato'], ['sign', 'in'], ['skip', 'to', 'main', 'cont'], ['cancel'], ['movy', 'tv']]

Average number of words per message: 2.5389312977099237
Five most frequent words: ['al', 'to', 'movy', 'of', 'the']
Number of words that appear once: 145


In [18]:
### Performing Lemmatization on the URL Data Using NLTK ###
# Lemmatize the tokens from the URL tokenization step; the helper prints the elapsed time.
url_lemmatized = lemmatize_using_nltk(url_tokens)
# Show the first five lemmatized token lists.
print('\n{}\n'.format(url_lemmatized[0:5]))

# Recompute the word statistics on the lemmatized text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(url_lemmatized))

Time elapsed: 0.0109[sec]

[['Rotten', 'Tomatoes', 'Movies', 'TV', 'Shows', 'Movie', 'Trailers', 'Reviews', 'Rotten', 'Tomatoes'], ['Signed', 'in'], ['Skip', 'to', 'Main', 'Content'], ['Cancel'], ['Movies', 'TV']]

Average number of words per message: 2.5389312977099237
Five most frequent words: ['season', 'all', 'to', 'of', 'the']
Number of words that appear once: 171


In [21]:
### Loading data from txt file ###
# One message per line. The encoding is stated explicitly so the result does
# not depend on the platform's default text encoding.
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open('whatsapp_messages.txt', 'r', encoding='utf-8') as file:
    messages = file.readlines()

whatsapp_df = pd.DataFrame(messages, columns=['Messages'])
# clean_data keeps only runs of word characters, which also drops the
# trailing newline that readlines() leaves on each message.
whatsapp_df = clean_data(whatsapp_df, 'Messages')
whatsapp_df

Unnamed: 0,Messages
0,Hey team quick reminder today s meeting is at ...
1,Does anyone have the latest sales report Need ...
2,Happy Friday everyone Any plans for the weekend
3,Just a heads up the printer on the 3rd floor i...
4,Can someone review the draft proposal I just s...
...,...
72,IT update network upgrade scheduled for this w...
73,Congrats to the accounting team for closing th...
74,Reminder company wide town hall meeting next T...
75,HR update wellness program launches next week ...


In [22]:
# Baseline word statistics for the cleaned WhatsApp messages, before tokenization.
print_basic_statistics_from_sentences(whatsapp_df['Messages'])

Average number of words per message: 11.623376623376624
Five most frequent words: ['next', 'reminder', 'to', 'for', 'the']
Number of words that appear once: 257


In [23]:
### Performing Tokenization on the Whatsapp Data Using NLTK ###
# Tokenize every cleaned message; the helper prints the elapsed time.
whatsapp_tokens = tokenize_using_nltk(whatsapp_df['Messages'])
# Show the first five token lists.
print('\n{}\n'.format(whatsapp_tokens[0:5]))

# The statistics helpers work on strings, so each token list is re-joined first.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_tokens))

Time elapsed: 0.013[sec]

[['Hey', 'team', 'quick', 'reminder', 'today', 's', 'meeting', 'is', 'at', '10', 'AM', 'in', 'Conference', 'Room', 'A'], ['Does', 'anyone', 'have', 'the', 'latest', 'sales', 'report', 'Need', 'it', 'for', 'the', 'presentation'], ['Happy', 'Friday', 'everyone', 'Any', 'plans', 'for', 'the', 'weekend'], ['Just', 'a', 'heads', 'up', 'the', 'printer', 'on', 'the', '3rd', 'floor', 'is', 'out', 'of', 'toner', 'again'], ['Can', 'someone', 'review', 'the', 'draft', 'proposal', 'I', 'just', 'sent', 'over']]

Average number of words per message: 11.623376623376624
Five most frequent words: ['next', 'reminder', 'to', 'for', 'the']
Number of words that appear once: 257


In [24]:
### Performing Stemming on the Whatsapp Data Using NLTK ###
# Stem the tokens from the WhatsApp tokenization step; the helper prints the elapsed time.
whatsapp_stemmed = stem_using_nltk(whatsapp_tokens)
# Show the first five stemmed token lists.
print('\n{}\n'.format(whatsapp_stemmed[0:5]))

# Recompute the word statistics on the stemmed text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_stemmed))

Time elapsed: 0.0191[sec]

[['hey', 'team', 'quick', 'remind', 'today', 's', 'meet', 'is', 'at', '10', 'am', 'in', 'conf', 'room', 'a'], ['doe', 'anyon', 'hav', 'the', 'latest', 'sal', 'report', 'nee', 'it', 'for', 'the', 'pres'], ['happy', 'friday', 'everyon', 'any', 'plan', 'for', 'the', 'weekend'], ['just', 'a', 'head', 'up', 'the', 'print', 'on', 'the', '3rd', 'flo', 'is', 'out', 'of', 'ton', 'again'], ['can', 'someon', 'review', 'the', 'draft', 'propos', 'i', 'just', 'sent', 'ov']]

Average number of words per message: 11.623376623376624
Five most frequent words: ['next', 'remind', 'to', 'for', 'the']
Number of words that appear once: 220


In [26]:
### Performing Lemmatization on the Whatsapp Data Using NLTK ###
# Lemmatize the tokens from the WhatsApp tokenization step; the helper prints the elapsed time.
whatsapp_lemmatized = lemmatize_using_nltk(whatsapp_tokens)
# Show the first five lemmatized token lists.
print('\n{}\n'.format(whatsapp_lemmatized[0:5]))

# Recompute the word statistics on the lemmatized text.
print_basic_statistics_from_sentences(token_matrix_to_string_array(whatsapp_lemmatized))

Time elapsed: 0.0053[sec]

[['Hey', 'team', 'quick', 'reminder', 'today', 's', 'meeting', 'is', 'at', '10', 'AM', 'in', 'Conference', 'Room', 'A'], ['Does', 'anyone', 'have', 'the', 'latest', 'sale', 'report', 'Need', 'it', 'for', 'the', 'presentation'], ['Happy', 'Friday', 'everyone', 'Any', 'plan', 'for', 'the', 'weekend'], ['Just', 'a', 'head', 'up', 'the', 'printer', 'on', 'the', '3rd', 'floor', 'is', 'out', 'of', 'toner', 'again'], ['Can', 'someone', 'review', 'the', 'draft', 'proposal', 'I', 'just', 'sent', 'over']]

Average number of words per message: 11.623376623376624
Five most frequent words: ['next', 'reminder', 'to', 'for', 'the']
Number of words that appear once: 245


The solution for the whatsapp messages is done in English because I couldn't find a stemming/lemmatization implementation for Hebrew.