In [None]:
# Install spaCy and download the medium English pipeline that the
# word-vector cells load via spacy.load("en_core_web_md").
# NOTE(review): neither command pins a version; the recorded output shows
# en-core-web-md 3.5.0 being installed.
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
class Category:
  """String labels for the two classes of the first toy dataset."""
  BOOKS, CLOTHING = "BOOKS", "CLOTHING"

# Every training sentence is written next to its label; zip() then splits the
# pairs into the two parallel lists used by the vectorizers and classifiers.
train_x, train_y = (list(column) for column in zip(
  ("i love my book", Category.BOOKS),
  ("this is a great book", Category.BOOKS),
  ("the fit is great", Category.CLOTHING),
  ("i love my shoes", Category.CLOTHING),
))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# unigram Bag of Words approach
# fit_transform() learns the vocabulary from train_x and returns the
# document-term count matrix X in a single step; the last line displays the
# learned vocabulary terms.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_x)
vectorizer.get_feature_names_out()

array(['book', 'fit', 'great', 'is', 'love', 'my', 'shoes', 'the', 'this'],
      dtype=object)

In [None]:
# Show the count matrix as a dense array: one row per training sentence,
# one column per vocabulary term.
X.toarray()

array([[1, 0, 0, 0, 1, 1, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0]])

In [None]:
from sklearn import svm

# using SVC classifier
# Linear-kernel support vector classifier trained on the unigram counts in X
# with the labels in train_y.
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(X, train_y)

In [None]:
# Encode an unseen sentence with the already-fitted unigram vocabulary
# (transform, not fit_transform) and classify it.
text_X = vectorizer.transform(["i understood the book"])
clf_svm.predict(text_X)

array(['BOOKS'], dtype='<U8')

In [None]:
# using bigram BoW

# ngram_range = (2, 2) restricts the features to pairs of adjacent words;
# the last line displays the learned bigram vocabulary.
bi_vec = CountVectorizer(ngram_range = (2, 2))
train_x_bi = bi_vec.fit_transform(train_x)
bi_vec.get_feature_names_out()

array(['fit is', 'great book', 'is great', 'love my', 'my book',
       'my shoes', 'the fit', 'this is'], dtype=object)

In [None]:
# Show the bigram count matrix as a dense array: one row per training
# sentence, one column per bigram.
train_x_bi.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0]])

In [None]:
# Refit the same clf_svm object on the bigram counts.
# NOTE(review): after this cell clf_svm no longer corresponds to the unigram
# `vectorizer` features, so re-running the earlier unigram prediction cell
# presumably requires re-running the unigram fit first.
clf_svm.fit(train_x_bi, train_y)

In [None]:
# Encode an unseen sentence with the fitted bigram vocabulary and classify it.
# NOTE(review): neither "love the" nor "the books" is in the bigram vocabulary
# displayed above, so this feature row is presumably all zeros and the
# predicted label carries little signal.
text_X = bi_vec.transform(["i love the books"])
clf_svm.predict(text_X)

array(['CLOTHING'], dtype='<U8')

## Word vectors using spaCy

In [None]:
import spacy

In [None]:
# Load the medium English pipeline downloaded in the first cell.
nlp = spacy.load("en_core_web_md")

In [None]:
# Show the training sentences that are about to be run through spaCy.
print(train_x)

['i love my book', 'this is a great book', 'the fit is great', 'i love my shoes']


In [None]:
# Run every training sentence through the pipeline. nlp.pipe() handles the
# texts as a batch instead of one nlp() call per sentence and yields the Doc
# objects in input order; list() materialises them so they can be indexed.
docreps = list(nlp.pipe(train_x))
docreps

[i love my book, this is a great book, the fit is great, i love my shoes]

In [None]:
# Inspect spaCy's document-level vector for the first training sentence.
docreps[0].vector

array([  2.6315024 ,  -1.353375  ,  -4.0092    ,  -7.30155   ,
        -4.097495  ,  -2.80835   ,   2.3547776 ,   5.1974254 ,
        -5.19495   ,   5.23498   ,   5.914975  ,   3.0949    ,
        -7.4033003 ,   0.61856997,   2.29324   ,  -4.178875  ,
         3.3873649 ,  -4.42395   ,  -1.8173249 ,  -1.3811501 ,
         1.05144   ,   2.5754874 ,  -2.5551248 ,  -6.6485577 ,
        -2.0954475 ,  -3.0525975 ,  -2.83079   ,   0.72137105,
        -3.7929251 ,   4.63505   ,  -1.7388445 ,  -0.54357505,
        -2.321425  ,   4.868975  ,   3.4078276 ,   1.4085475 ,
        -1.56035   ,   0.88356245,   5.2574253 ,  -0.5571306 ,
        -0.40328997,   2.7360873 ,  -0.06505001,  -3.661525  ,
         8.177125  ,   3.0302749 ,  -4.398325  ,  -1.2461226 ,
        -0.9817501 ,   2.3884249 ,   3.0547748 ,  -3.2250998 ,
        -1.02268   ,  -1.3915601 ,  -4.2412624 ,  -2.1919224 ,
        -0.6086174 ,   2.7178333 ,   1.9102001 ,   3.2952876 ,
         9.967074  ,   3.0036526 ,  -3.833775  ,  -0.44

In [None]:
# using SVC classifier
# A separate linear SVC for the spaCy-vector features, so clf_svm from the
# bag-of-words section is left untouched.
clf_svm_spacy = svm.SVC(kernel = 'linear')

In [None]:
# One document vector per training sentence, in the same order as train_y.
train_spacy = [x.vector for x in docreps]
clf_svm_spacy.fit(train_spacy, train_y)

In [None]:
# Embed an unseen sentence the same way and classify it. "shakespeare" never
# occurs in train_x, so the result cannot come from word overlap with the
# book examples.
clf_svm_spacy.predict([nlp("i love shakespeare").vector])

array(['BOOKS'], dtype='<U8')

## Stemming and Lemmatization

In [None]:
import nltk

# Fetch the NLTK data packages used by later cells: WordNet (lemmatizer),
# the stopword lists, and the punkt tokenizer models (presumably what
# word_tokenize needs -- confirm for the installed NLTK version).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
# Porter stemmer instance used by the stemming cell that follows.
stemmer = PorterStemmer()

phrase = "reading books is the best entertainment personally"

# Split the phrase into word tokens and show them before stemming.
words = word_tokenize(phrase)
print(words)

['reading', 'books', 'is', 'the', 'best', 'entertainment', 'personally']


In [None]:
# Stem every token with the Porter stemmer, then display the re-joined phrase.
stemmedwords = [stemmer.stem(token) for token in words]

" ".join(stemmedwords)

'read book is the best entertain person'

In [None]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

# Same phrase as in the stemming example, tokenized again so this cell can
# be read on its own.
phrase = "reading books is the best entertainment personally"
words = word_tokenize(phrase)

# Every token is lemmatized as if it were a verb (pos = 'v').
lemwords = [lem.lemmatize(token, pos = 'v') for token in words]

" ".join(lemwords)


'read book be the best entertainment personally'

## Stopword Removal

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# NLTK's English stopword list, displayed for inspection.
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
eg = "i love walking in the mornings because the weather isn't too dreadful"
words_eg = word_tokenize(eg)

# Keep only the tokens that have no exact match in stop_words.
# NOTE(review): the recorded output still contains "n't", so that tokenizer
# fragment is evidently not matched by the list.
stripped = [token for token in words_eg if token not in stop_words]

" ".join(stripped)

"love walking mornings weather n't dreadful"

## TextBlob

In [None]:
from textblob import TextBlob
# Download the corpora TextBlob relies on; the log output lists the packages
# that were fetched.
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


### Spell Correct

In [None]:
# Input with two deliberate misspellings ("exampple", "sentance").
tbsent = "this is an exampple sentance"
blob = TextBlob(tbsent)
# correct() gives back a TextBlob holding the corrected text (see output).
blob.correct()

TextBlob("this is an example sentence")

### Parts of speech tagging

In [None]:
# Part-of-speech tagging: .tags yields (word, tag) pairs.
# NOTE(review): in the recorded output the lowercase pronoun "i" is tagged
# JJ (adjective), so the tags for this all-lowercase input should be read
# with care.
tbsent1 = "roses are red but i always run in the morning"
blob1 = TextBlob(tbsent1)
blob1.tags

[('roses', 'NNS'),
 ('are', 'VBP'),
 ('red', 'JJ'),
 ('but', 'CC'),
 ('i', 'JJ'),
 ('always', 'RB'),
 ('run', 'VBP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('morning', 'NN')]

### Sentiment Analysis

In [None]:
sent_anal = ["i love movies", "that book was disgusting"]

# One Sentiment(polarity, subjectivity) result per sentence, in input order.
sentvals = [TextBlob(sentence).sentiment for sentence in sent_anal]

sentvals

[Sentiment(polarity=0.5, subjectivity=0.6),
 Sentiment(polarity=-1.0, subjectivity=1.0)]

## Transformer Architecture

In [None]:
# Install the spaCy transformer add-on and download the transformer-based
# English pipeline loaded later via spacy.load('en_core_web_trf').
# NOTE(review): neither command pins a version; the recorded output shows
# en-core-web-trf 3.5.0 (a 460.3 MB download).
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [None]:
import spacy
import torch

# First pass of this section uses the medium English pipeline
# (en_core_web_md), the same package loaded earlier as `nlp`.
mod1 = spacy.load('en_core_web_md')
# NOTE(review): `doc` is not read by any later cell in view.
doc = mod1("this is what i want to say")

In [None]:
class Category:
  """String labels for the books-vs-banking toy dataset.

  NOTE: this redefines the Category class from the start of the notebook;
  after this cell runs, Category.CLOTHING no longer exists.
  """
  BOOKS = "BOOKS"
  BANK = "BANK"

train_x = [
  # book-related sentences
  "i love my book",
  "this is a great book",
  "the plot was intriguing",
  "the character development arc was very satisfying",
  # banking-related sentences
  "deposit this check",
  "my bank balance is low",
  "i need to pay my loan",
  "the interest rate is low",
]
# Labels follow the same order: four BOOKS sentences, then four BANK ones.
train_y = [Category.BOOKS] * 4 + [Category.BANK] * 4

In [None]:
from sklearn import svm

In [None]:
# Despite its name, `docs` holds one document vector per training sentence
# (not Doc objects). mod1.pipe() processes the sentences as a batch and
# yields the Docs in input order, so the vectors line up with train_y.
docs = [parsed.vector for parsed in mod1.pipe(train_x)]

In [None]:
# Linear SVC trained on the document vectors in `docs` with labels train_y.
clf_svm_core = svm.SVC(kernel = 'linear')
clf_svm_core.fit(docs, train_y)

In [None]:
# Classify an unseen sentence. The word "check" also occurs in a BANK
# training sentence ("deposit this check"); the recorded prediction is BOOKS.
clf_svm_core.predict([mod1("check out the new manga series").vector])

array(['BOOKS'], dtype='<U5')

In [None]:
import spacy
import torch

# Rebind mod1 to the transformer pipeline: every mod1(...) call from here on
# runs en_core_web_trf rather than en_core_web_md.
mod1 = spacy.load('en_core_web_trf')
# NOTE(review): `doc` is not read by any later cell in view.
doc = mod1("this is what i want to say")

In [None]:
class Category:
  """String labels for the books-vs-banking toy dataset.

  Same labels and data as the definition used with the en_core_web_md model
  earlier in this section.
  """
  BOOKS, BANK = "BOOKS", "BANK"

# Sentence -> label mapping, split into the two parallel training lists
# (dict order is insertion order, so both lists keep the order written here).
train_x, train_y = map(list, zip(*{
  "i love my book": Category.BOOKS,
  "this is a great book": Category.BOOKS,
  "the plot was intriguing": Category.BOOKS,
  "the character development arc was very satisfying": Category.BOOKS,
  "deposit this check": Category.BANK,
  "my bank balance is low": Category.BANK,
  "i need to pay my loan": Category.BANK,
  "the interest rate is low": Category.BANK,
}.items()))

In [None]:

from sklearn import svm

In [None]:
docs = [mod1(t).vector for t in train_x]

In [None]:
clf_svm_core = svm.SVC(kernel = 'linear')
clf_svm_core.fit(docs, train_y)

ValueError: ignored

In [None]:
clf_svm_core.predict([mod1("check out the new manga series").vector])

array(['BOOKS'], dtype='<U5')