Bag of Words
- convert sentences to vectors
- extract all the unique words from the sentences to build the vocabulary
- binary = 0s and 1s (word absent or present); could also be the count of each word
- unigram = individual words, bigram = pairs of adjacent words
- Limitation: words that are not in the training set cannot be represented

In [52]:
class Category:
    """String labels for the two classes used in the books-vs-clothing examples."""
    BOOKS = 'BOOKS'
    CLOTHING = 'CLOTHING'


# Training sentences and their labels, kept in the same order
# (first two are BOOKS, last two are CLOTHING).
train_x = [
    'i love the book',
    'this is a great book',
    'the fit is great',
    'i love the shoes',
]
train_y = [Category.BOOKS] * 2 + [Category.CLOTHING] * 2

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# binary=True records presence/absence (1/0) of each word instead of its count.
# Passing ngram_range would switch between unigrams and bigrams.
vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

# Learned vocabulary: 'a' and 'i' are missing because CountVectorizer's default
# token pattern only keeps tokens of two or more characters.
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary)

# One row per training sentence, one column per vocabulary word.
dense_matrix = train_x_vectors.toarray()
print(dense_matrix)

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


In [54]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the bag-of-words vectors.
clf_svm = svm.SVC(kernel = 'linear')
# fit() is the last expression of the cell, so the fitted estimator is what the cell displays.
clf_svm.fit(train_x_vectors, train_y)

In [55]:
# Vectorize the unseen sentence with the vocabulary learned from train_x; only 'shoes'
# is in that vocabulary, so it is the only word that can contribute to the prediction.
# Note that test_x holds the vectorized matrix here, whereas later cells reuse the
# name for a list of raw strings.
test_x = vectorizer.transform(['shoes are alright'])
clf_svm.predict(test_x)


array(['CLOTHING'], dtype='<U8')

Word Vectors
- capture the semantic meaning of words
- map similar words to nearby points in the vector space
- use a window of text as the context: the surrounding tokens are used to learn the meaning of each individual token
- e.g. book and read, story and characters
- use a neural network architecture

In [56]:
# One-time setup: download spaCy's medium English model (uncomment to run)
# !python -m spacy download en_core_web_md

In [57]:
import spacy
# Load spaCy's medium English pipeline; the following cells call nlp(text) and read
# .vector from the result to get an embedding for each sentence.
nlp = spacy.load('en_core_web_md')

In [58]:
# Show the training sentences that the next cell turns into embeddings.
print(train_x)

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [59]:
# Run every training sentence through the spaCy pipeline (docs[0] is 'i love the book').
docs = list(map(nlp, train_x))

# One embedding per sentence; inspect a single one with docs[0].vector.
train_x_word_vectors = [parsed.vector for parsed in docs]

In [60]:
# Separate linear SVM for the word-vector features, so the bag-of-words classifier
# (clf_svm) is left untouched.
clf_svm_wv = svm.SVC(kernel = 'linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [61]:
test_x = ['these earings hurt']

# nlp() gives one vector per document (the average of its token embeddings),
# so words with multiple meanings may be problematic.
test_docs = list(map(nlp, test_x))
test_x_word_vectors = [parsed.vector for parsed in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['CLOTHING'], dtype='<U8')

In [40]:
# Display the parsed spaCy docs built from test_x.
test_docs

[these earings hurt]

Regexes

- pattern matching of strings
- e.g. password, emails

In [41]:
import re

# Whole string must start with "ab" and finish with "cd", with any run of
# non-whitespace characters (possibly empty) in between.
regexp = re.compile(r'^ab[^\s]*cd$')

phrases = ['abcd', 'xxx', 'abxxxcd', 'ab cd']

# match() only tries at the start of the string; use search() to look anywhere in it.
matches = [candidate for candidate in phrases if regexp.match(candidate)]
print(matches)

['abcd', 'abxxxcd']


In [42]:
# \b marks a word boundary: 'read' and 'story' must appear as stand-alone words,
# so 'treaded' and 'history' do not count. 'book' has no boundaries and would
# match anywhere, including inside a longer word.
regexp = re.compile(r'\bread\b|\bstory\b|book')

phrases = ['I like that history', 'the car treaded up the hill', 'this hat is nice']

# search() scans the whole phrase; filter() keeps the phrases that produced a match.
matches = list(filter(regexp.search, phrases))
print(matches)

[]


Stemming/Lemmatization

- normalize text
- stemming: stories -> stori (the result may not be a real word); lemmatization: stories -> story (always an actual English word)

In [43]:
import nltk

# Fetch the NLTK resources used by the cells below: WordNet (lemmatizer),
# the stop-word lists, and the punkt tokenizer data.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt'
# when word_tokenize is called - confirm against the installed version.
nltk.download('wordnet')
nltk.download('stopwords')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Real-world text should have its punctuation stripped before this step.
phrase = 'reading the books'
words = word_tokenize(phrase)

# Stem each token independently, then stitch the sentence back together.
stemmed_words = [stemmer.stem(token) for token in words]

' '.join(stemmed_words)

'read the book'

In [45]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = 'reading the books'
words = word_tokenize(phrase)

# pos='v' lemmatizes every token as a verb. In general a part-of-speech tagger
# may be needed to pick the right pos for each word.
lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in words]

' '.join(lemmatized_words)

'read the book'

Stopwords

- the most common English words
- strip them out because they add little value, e.g. the, this, that

In [46]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# NLTK's English stop words are all lower-case; a set gives O(1) membership tests.
stop_word_set = set(stop_words)

phrase = 'Here is an example sentence demonstrating the removal of stopwords'

words = word_tokenize(phrase)

# Compare case-insensitively so capitalised stop words such as 'Here' are removed too;
# the words that survive keep their original casing.
stripped_phrase = [word for word in words if word.lower() not in stop_word_set]

' '.join(stripped_phrase)


'example sentence demonstrating removal stopwords'

In [47]:
# Spell Correction
from textblob import TextBlob

phrase = 'the bok was horrible'

tb_phrase = TextBlob(phrase)

# correct() returns a new, spell-corrected TextBlob and leaves tb_phrase unchanged,
# so keep the result and run the rest of the analysis on the corrected text.
tb_corrected = tb_phrase.correct()

# Part-of-speech tagging: (word, tag) pairs identifying nouns, verbs, ...
pos_tags = tb_corrected.tags

# polarity = negative/positive sentiment.
# Last expression of the cell, so this is the value that gets displayed.
tb_corrected.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

Transformer architecture

In [48]:
# Transformer pipeline (BERT-family model)
import spacy
import torch
import spacy_transformers


# Rebinds the notebook-wide `nlp` (previously the en_core_web_md pipeline) to the
# transformer pipeline; every later nlp(...) call uses whichever was loaded last.
nlp = spacy.load('en_core_web_trf')
# Parse a sample sentence; `doc` is not referenced again in the visible cells.
doc = nlp('Here is some text to encode.')

In [62]:
class Category:
    """String labels for the books-vs-bank example.

    Shadows the Category class defined earlier in the notebook.
    """
    BOOKS = 'BOOKS'
    BANK = 'BANK'


# (sentence, label) pairs; train_x and train_y are unzipped from them below.
labelled_examples = [
    ("good characters and plot progression", Category.BOOKS),
    ('check out the book', Category.BOOKS),
    ('good story. would recommend', Category.BOOKS),
    ('novel recommendation', Category.BOOKS),
    ('need to make a deposit to the bank', Category.BANK),
    ('balance inquiry savings', Category.BANK),
    ('save money', Category.BANK),
]
train_x = [sentence for sentence, _label in labelled_examples]
train_y = [label for _sentence, label in labelled_examples]

In [65]:
# Embed the training sentences with whichever spaCy pipeline `nlp` is currently bound to.
# NOTE(review): the execution counts show the en_core_web_md cell (In [57]) was re-run
# after the en_core_web_trf load (In [48]) and before this cell (In [65]), so the
# recorded output was probably produced with the md pipeline - confirm with
# Restart & Run All.
# NOTE(review): with a transformer pipeline doc.vector may be empty (no static word
# vectors), which would make fit() fail - confirm against the installed spaCy version.
docs = list(map(nlp, train_x))
train_x_vectors = [parsed.vector for parsed in docs]

# Refits the classifier created in the bag-of-words section on the new features.
clf_svm.fit(train_x_vectors, train_y)

# 'check' only appears in a BOOKS training example ('check out the book'),
# yet this banking sentence is classified as BANK.
test_x = ['i need to write a check']
docs = list(map(nlp, test_x))
test_x_vectors = [parsed.vector for parsed in docs]

clf_svm.predict(test_x_vectors)

array(['BANK'], dtype='<U5')