# BoW Implementation

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import numpy as np
import pandas as pd
toy_corpus= ["the fat cat sat on the mat", "the big cat slept",
             "the dog chased a cat"]
vectorizer=TfidfVectorizer() 
corpus_tfidf=vectorizer.fit_transform(toy_corpus)
print(f"The vocabulary size is:  {len(vectorizer.vocabulary_.keys())} ")
print(f"The document-term matrix shape is: {corpus_tfidf.shape}")
df=pd.DataFrame(np.round(corpus_tfidf.toarray(),2))
df.columns=vectorizer.get_feature_names()

The vocabulary size is:  10 
The document-term matrix shape is: (3, 10)


# SVM Pipeline

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
labels= [0,1,0]
clf = SVC()
clf.fit(df.to_numpy(), labels)

SVC()

In [3]:
clf.predict(df.to_numpy())

array([0, 1, 0])

# Maximum Likelihood Estimator (MLE) with  Natural Language Toolkit (NLTK)

In [4]:
import nltk
from nltk.corpus import gutenberg
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
nltk.download('gutenberg')
nltk.download('punkt')
macbeth = gutenberg.sents('shakespeare-macbeth.txt')
model, vocab = padded_everygram_pipeline(2, macbeth)
lm=MLE(2)
lm.fit(model,vocab)
print(list(lm.vocab)[:10])
print(f"The number of words is {len(lm.vocab)}")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Vasilis\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vasilis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['<s>', '[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603']
The number of words is 4020


In [5]:
print(f"The frequency of the term 'Macbeth' is {lm.counts['Macbeth']}")
print(f"The language model probability score of 'Macbeth' is {lm.score('Macbeth')}")
print(f"The number of times 'Macbeth' follows 'Enter' is {lm.counts[['Enter']]['Macbeth']} ")
print(f"P(Macbeth | Enter) is {lm.score('Macbeth',['Enter'])}")
print(f"P(shaking | for) is {lm.score('shaking', ['for'])}")

The frequency of the term 'Macbeth' is 61
The language model probability score of 'Macbeth' is 0.0022631149365585812
The number of times 'Macbeth' follows 'Enter' is 15 
P(Macbeth | Enter) is 0.1875
P(shaking | for) is 0.012195121951219513


# Word2vec model

In [6]:
from gensim.test.utils import common_texts

In [7]:
from gensim.models import Word2vec
model = Word2vec(sentences=macbeth, size=100, window= 4,min_count=10, workers=4, iter=10)

ImportError: cannot import name 'Word2vec' from 'gensim.models' (C:\Users\Vasilis\AppData\Roaming\Python\Python38\site-packages\gensim\models\__init__.py)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import random
np.random.seed(42)
words=list([e for e in model.wv.vocab if len(e)>4]) 
random.shuffle(words)
words3d = PCA(n_components=3,random_state=42).fit_transform(model.wv[words[:100]])
def plotWords3D(vecs, words, title):
 ...
plotWords3D(words3d, words, "Visualizing Word2vec Word Embeddings using PCA")

In [9]:
wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip

SyntaxError: invalid syntax (<ipython-input-9-0ef3f1eee100>, line 1)