# Bag of Words

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [None]:
# Run TextBlob's corpus downloader; the TextBlob cells further down (spell
# correction, tagging, sentiment) presumably rely on this data.
# NOTE(review): no install step for textblob is visible in this notebook --
# assumes the package is already present in the environment; confirm.
!python -m textblob.download_corpora

In [None]:
class Category:
  """String constants naming the two product categories used as labels."""
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"


# Toy corpus kept as (phrase, label) pairs so each text stays next to its label.
_labelled_phrases = [
  ("i love the book", Category.BOOKS),
  ("this is a great book", Category.BOOKS),
  ("i love the shoes", Category.CLOTHING),
  ("the fit is great", Category.CLOTHING),
]

# Parallel lists consumed by the vectorizers and classifiers below.
train_x = [phrase for phrase, _label in _labelled_phrases]
train_y = [label for _phrase, label in _labelled_phrases]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True marks each vocabulary word as present (1) or absent (0) per
# phrase instead of counting occurrences.
vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary from the training phrases, then encode those phrases.
vectorizer.fit(train_x)
train_x_vectors = vectorizer.transform(train_x)

In [None]:
# Show the learned vocabulary, then the dense phrase-by-word matrix whose
# columns line up with that vocabulary.
vocabulary = vectorizer.get_feature_names_out()
dense_matrix = train_x_vectors.toarray()

print(vocabulary)
print(dense_matrix)

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 0 0 0 1 1 1 0]
 [0 1 1 1 0 0 1 0]]


In [None]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the bag-of-words vectors
# and their category labels.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X=train_x_vectors, y=train_y)

In [None]:
# Encode an unseen phrase with the already-fitted vocabulary (transform only,
# no refit) and ask the bag-of-words classifier for its category.
unseen_phrases = ['I want to buy a book']
test_x = vectorizer.transform(unseen_phrases)

clf_svm.predict(test_x)

array(['BOOKS'], dtype='<U8')

# Word Vectors

In [None]:
import spacy

In [None]:
# Load the medium English spaCy pipeline; the cells below read `.vector` from
# the documents it produces.
nlp = spacy.load("en_core_web_md")

In [None]:
# Run every training phrase through the spaCy pipeline, then keep each
# document's `.vector` as the feature representation for that phrase.
docs = list(map(nlp, train_x))
train_x_word_vectors = [parsed.vector for parsed in docs]

In [None]:
# Inspect the document-level vector spaCy produced for the first training phrase.
print(docs[0].vector)

In [None]:
# Second linear SVM, this time trained on the spaCy document vectors rather
# than the bag-of-words matrix.
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(X=train_x_word_vectors, y=train_y)

In [None]:
# "story" does not occur in the training phrases; the classifier only sees the
# spaCy vector computed for the whole test phrase.
test_x = ["i love the story"]
test_docs = list(map(nlp, test_x))
test_x_word_vectors = [parsed.vector for parsed in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['BOOKS'], dtype='<U8')

# Regexes

In [None]:
import re

# Whole-string pattern: starts with "ab", ends with "cd", and allows any run of
# non-whitespace characters in between.
regexp = re.compile(r"^ab[^\s]*cd$")
phrases = ["abcd", "xxx", "abxxxcd", "ab cd"]

# Keep only the phrases the compiled pattern finds a match in.
matches = [candidate for candidate in phrases if regexp.search(candidate)]

print(matches)

['abcd', 'abxxxcd']


In [None]:
import re

# Keyword pattern: "read" and "story" must appear as whole words (\b word
# boundaries); "book" matches anywhere, including inside longer words.
regexp = re.compile(r"\bread\b|\bstory\b|book")
phrases = ["i love the book", "this is a great book", "i love the shoes", "the fit is great"]

# filter() keeps the phrases for which the pattern's search() finds a match.
matches = list(filter(regexp.search, phrases))

print(matches)

['i love the book', 'this is a great book']


# Stemming/Lemmatization

In [None]:
import nltk

# Resources used by the cells below: WordNet for the lemmatizer, the stopword
# lists for stopword removal, and the Punkt models behind word_tokenize.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of 'punkt'; fetching both keeps word_tokenize working on old and new
# NLTK versions.
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books"
words = word_tokenize(phrase)

# Stem each token, then rebuild the phrase for display.
stemmed_words = [stemmer.stem(token) for token in words]

" ".join(stemmed_words)

'read the book'

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

# pos='v' asks WordNet to lemmatize every token as a verb -- a simplification
# for this demo, since no per-token part-of-speech tagging is done here.
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]

# Display the lemmatizer's own output. This previously joined `stemmed_words`
# from the stemming cell, so the lemmatization result was never shown.
" ".join(lemmatized_words)

'read the book'

# Stopword Removal

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# NLTK's English stopword list is all lowercase.
stop_words = stopwords.words('english')

phrase = "Here is an example sentence demostrating the removal stopwords"

words = word_tokenize(phrase)

# Compare case-insensitively so capitalised stopwords such as the leading
# "Here" are dropped as well; the kept words retain their original casing.
stripped_phrase = [word for word in words if word.lower() not in stop_words]

# Expected: 'example sentence demostrating removal stopwords'
" ".join(stripped_phrase)

'Here example sentence demostrating removal stopwords'

# Spell Correction, Sentiment, and POS Tagging

In [None]:
from textblob import TextBlob

# Deliberately misspelled input ("examplee") for the spell-correction demo.
phrase = "this is a good examplee"

tb_phrase = TextBlob(phrase)

# The corrected blob is only displayed as this cell's result; it is not
# assigned, so `tb_phrase` still holds the misspelled text in later cells.
tb_phrase.correct()

TextBlob("this is a good example")

In [None]:
# Part-of-speech tag for each token of the (still uncorrected) phrase.
tb_phrase.tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('examplee', 'NN')]

In [None]:
# Polarity and subjectivity scores TextBlob assigns to the phrase.
tb_phrase.sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

# Transformer Architecture

In [None]:
!pip install spacy transformers
!python -m spacy download en_trf_bertbaseuncased_lg


[38;5;1m✘ No compatible package found for 'en_trf_bertbaseuncased_lg' (spaCy
v3.7.2)[0m



In [None]:
# Upgrade spaCy to the latest release.
# NOTE(review): this cell's recorded output shows pip reporting that
# en-core-web-md 3.6.0 requires spacy<3.7.0, so the upgrade can leave the
# en_core_web_md pipeline used earlier incompatible -- consider pinning the
# spaCy version or re-downloading the model afterwards.
!pip install --upgrade spacy

Collecting spacy
  Using cached spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy)
  Downloading thinc-8.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (922 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.3/922.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.0.17
    Uninstalling thinc-8.0.17:
      Successfully uninstalled thinc-8.0.17
  Attempting uninstall: spacy
    Found existing installation: spacy 3.0.8
    Uninstalling spacy-3.0.8:
      Successfully uninstalled spacy-3.0.8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-md 3.6.0 requires spacy<3.7.0,>=3.6.0, but you have spacy 3.7.2 which is incompatible.
en-cor

In [None]:
# Upgrade spaCy and Hugging Face transformers together.
# NOTE(review): versions are unpinned, so results depend on whatever is current
# at run time; a kernel restart is presumably needed before already-imported
# packages pick up the upgrade -- confirm.
!pip install --upgrade spacy transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [None]:
!python -m spacy download en_trf_bertbaseuncased_lg

2024-01-17 15:55:24.939182: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 15:55:24.939243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 15:55:24.941150: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

[38;5;1m✘ No compatible package found for 'en_trf_bertbaseuncased_lg' (spaCy
v3.0.8)[0m



In [None]:
import spacy
import torch

In [None]:
# 'en_trf_bertbaseuncased_lg' cannot be loaded under spaCy v3 (spacy.load
# raises OSError E050 for it); the spaCy v3 English transformer pipeline is
# 'en_core_web_trf'.
TRANSFORMER_MODEL = "en_core_web_trf"

try:
  nlp = spacy.load(TRANSFORMER_MODEL)
except OSError:
  # spacy.load raises OSError when the pipeline package is not installed:
  # download it once, then retry the load.
  spacy.cli.download(TRANSFORMER_MODEL)
  nlp = spacy.load(TRANSFORMER_MODEL)

# Run the transformer pipeline over a sample sentence.
doc = nlp("Here is some text to encode.")

OSError: [E050] Can't find model 'en_trf_bertbaseuncased_lg'. It doesn't seem to be a Python package or a valid path to a data directory.