In [27]:
class Category:
    Books = "Books"
    CLOTHING = "CLOTHING"

train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]
train_y = [Category.Books, Category.Books, Category.CLOTHING, Category.CLOTHING]

Fit vectorizer to transfrom text to bag-of-words vectors

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

print(vectorizer.get_feature_names_out())
print(train_x_vectors.toarray())

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


Train SVM Model

In [29]:
from sklearn import svm 

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

Test new utterances on trained model

In [30]:
test_x = vectorizer.transform(['i love the books'])

clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

## Word Vectors

In [31]:
import spacy
nlp = spacy.load('en_core_web_md')

In [32]:
print(train_x)

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [33]:
docs = [nlp(text) for text in train_x]
print(docs)

[i love the book, this is a great book, the fit is great, i love the shoes]


In [34]:
train_x_word_vectors = [x.vector for x in docs]
print(train_x_word_vectors)

[array([-7.33089983e-01, -5.24749886e-03, -2.35488251e-01,  1.59274936e-02,
        9.66347754e-02,  1.56278491e-01,  1.38615012e-01, -1.82292491e-01,
        8.84527490e-02,  1.54077005e+00, -2.41762251e-01, -8.96672532e-02,
        1.74057245e-01,  3.10127772e-02,  4.62116897e-02, -5.05267493e-02,
       -1.48660004e-01,  1.03792381e+00, -1.71565011e-01, -6.28000051e-02,
        1.03982493e-01,  1.28997505e-01,  1.35554761e-01, -2.06535250e-01,
       -2.21828252e-01, -1.54980987e-01, -2.25717485e-01, -2.63060927e-01,
        2.91349851e-02,  9.59425047e-02, -2.11517513e-02,  3.45300019e-01,
       -1.88805014e-01,  1.19102523e-02,  1.82815492e-01,  1.35538995e-01,
       -1.14783749e-01,  2.49261260e-01, -1.00740008e-01,  6.52624816e-02,
       -1.29889250e-01,  1.79949999e-02, -1.20909005e-01, -2.06174999e-02,
        1.49652511e-01,  1.26080498e-01,  4.98107485e-02,  1.36212513e-01,
       -6.19465038e-02,  1.98888257e-01, -1.23281501e-01,  9.30762440e-02,
       -8.31630006e-02, 

In [35]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_word_vectors, train_y)

In [39]:
test_x = ["I love the books"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors = [x.vector for x in test_docs]

In [40]:
clf_svm.predict(test_x_word_vectors)


array(['Books'], dtype='<U8')

## Regexes

In [41]:
import re

regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["I liked that story.", "the car treaded up the hill", "this hat is nice"]

matches = []

for phrase in phrases:
    if re.search(regexp, phrase):
        matches.append(phrase)

print(matches)

['I liked that story.']


## Stemming/Lemmatization

In [42]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Stemming

In [45]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


phrase = "reading the books"
words = word_tokenize(phrase)
print(words)

stemmed_words = []
for word in words:
    stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

['reading', 'the', 'books']


'read the book'

Lemmatizing

In [46]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

" ".join(lemmatized_words)

'read the book'

### StopWords

In [47]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []

for word in words:
    if word not in stop_words:
        stripped_phrase.append(word)

" ".join(stripped_phrase)

'Here example sentence demonstrating removal stopwords'

### Various other techniques (Spell correction, sentiment and pos tagging)

In [48]:
from textblob import TextBlob

phrase = "The book was horrible"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.tags

tb_phrase.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

### Transformer Architecture

Using Spacy to utilize BERT model

In [65]:
import spacy
nlp = spacy.load("en_core_web_trf")

doc = nlp("Here is some text to encode.")

ValueError: [E002] Can't find factory for 'curated_transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer