<a href="https://colab.research.google.com/github/v-zeng/python_projects/blob/main/NLP_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Review of Major NLP Concepts

In [None]:
# download medium sized embeddings from spaCy
!pip install spacy
!python -m spacy download en_core_web_md

In [3]:
# bag of words
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True) # unigram model
train_x_vectors = vectorizer.fit_transform(train_x)

print(vectorizer.get_feature_names_out())

print(vectors.toarray())

['book' 'fit' 'fit is' 'great' 'great book' 'is' 'is great' 'love'
 'love the' 'shoes' 'the' 'the book' 'the fit' 'the shoes' 'this'
 'this is']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


In [16]:
# text classification model using bag-of-words (SVM)
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

In [19]:
# predicting new utterances classes with model (transform)
test_x = vectorizer.transform(['shoes are alright''])

clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

## Word Vectors

In [2]:
import spacy

# load installed spacy model
nlp = spacy.load("en_core_web_md")

In [4]:
print(train_x)

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [7]:
# text classification model with word vectors
from sklearn import svm

# items in docs list as word vector representations of sentences defined above
docs = [nlp(text) for text in train_x]
train_x_word_vectors = [x.vector for x in docs]

# build word vector model
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [13]:
# predict new utterances with model
test_x = ["the fit is a bit snug"]
# nlp representation of test_x
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors = [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['CLOTHING'], dtype='<U8')

## Regex (pattern matching)

In [21]:
import re

regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["I liked that story", "the car treaded up the hill", "this hat is nice"]

matches = []
for phrase in phrases:
  if re.search(regexp, phrase):
    matches.append(phrase)

print(matches)

['I liked that story']


## Stemming/Lemmatization

In [3]:
import nltk


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

phrase = "reading the books"
words = word_tokenize(phrase)
print(words)

stemmed_words = []
for word in words:
  stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

['reading', 'the', 'books']


'read the book'

In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

" ".join(lemmatized_words)

'read the book'

## Stopwords Removal

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
  if word not in stop_words:
    stripped_phrase.append(word)

" ".join(stripped_phrase)

'Here example sentence demonstrating removal stopwords'

## Other Techniques - Spell Correction, Sentiment, Part-Of-Speech (POS) tagging

In [21]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [19]:
from textblob import TextBlob

TextBlob("this is an example phrase")

In [22]:
# spell correction
phrase = "thiis is an example prase"

tb_phrase = TextBlob(phrase)
tb_phrase.correct()

TextBlob("this is an example phrase")

In [25]:
# tags (word, POS tag)
phrase = "i read the book"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.tags

[('i', 'NN'), ('read', 'VBD'), ('the', 'DT'), ('book', 'NN')]

In [28]:
# positive sentiment
phrase = "the book was great"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [29]:
# negative sentiment
phrase = "the book was horrible"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

## Transformer Architecture (BERT | spaCy)

In [34]:
!pip install spacy-transformers
!python -m spacy download en_core_web_lg

  _torch_pytree._register_pytree_node(
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m859.5 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy
import torch

nlp = spacy.load("en_core_web_lg")
doc = nlp("Here is some text to encode.")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
class Category:
  BOOKS = "BOOKS"
  BANK = "BANK"

train_x = ["good characters and plot progression", "check out the book", "good story. would recommend", "novel recommendation", "need to make a deposit to the bank", "balance inquiry savings", "save money"]
train_y = [Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BANK, Category.BANK, Category.BANK]

In [4]:
from sklearn import svm

docs = [nlp(text) for text in train_x]
train_x_vectors = [doc.vector for doc in docs]
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x = ["i need to write a check"]
docs = [nlp(text) for text in test_x]
test_x_vectors = [doc.vector for doc in docs]

clf_svm.predict(test_x_vectors)

array(['BANK'], dtype='<U5')