# Demonstration: POS Tagging, BoW, TF-IDF and Word Embeddings

## Load Dataset

In [1]:
import pandas as pd
# Read the demo corpus into `df` (later cells use its "Sentence" column)
# and preview the first rows.
df = pd.read_csv("nlp_dataset.csv")
print("Original Dataset:", df.head(), sep="\n")

Original Dataset:
                                            Sentence
0       The quick brown fox jumps over the lazy dog.
1  Artificial intelligence is transforming the wo...
2  Machine learning models require large datasets...
3  Deep learning architectures such as CNNs and R...
4  Natural language processing enables machines t...


# POS Tagging

In [2]:
# Using NLTK
import nltk

# Fetch the NLTK resources one after another, in a fixed order.
# NOTE(review): presumably these are the data packages word_tokenize and
# pos_tag need at call time — confirm against the installed NLTK version.
for resource in (
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'universal_tagset',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

from nltk import pos_tag, word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Using NLTK

In [3]:
def pos_tag_nltk(text):
    """Tokenize ``text`` with NLTK and tag the tokens with the universal tagset.

    Returns whatever ``pos_tag`` yields for the token list; the notebook
    output shows a list of ``(token, tag)`` pairs.
    """
    return pos_tag(word_tokenize(text), tagset='universal')

# Tag every sentence with NLTK and store the result beside the original text,
# then preview the first rows of both columns.
df["POS_NLTK"] = df["Sentence"].apply(pos_tag_nltk)
print("- Using NLTK:", df.head()[["Sentence", "POS_NLTK"]], sep="\n")

- Using NLTK:
                                            Sentence  \
0       The quick brown fox jumps over the lazy dog.   
1  Artificial intelligence is transforming the wo...   
2  Machine learning models require large datasets...   
3  Deep learning architectures such as CNNs and R...   
4  Natural language processing enables machines t...   

                                            POS_NLTK  
0  [(The, DET), (quick, ADJ), (brown, NOUN), (fox...  
1  [(Artificial, ADJ), (intelligence, NOUN), (is,...  
2  [(Machine, NOUN), (learning, NOUN), (models, N...  
3  [(Deep, NOUN), (learning, VERB), (architecture...  
4  [(Natural, ADJ), (language, NOUN), (processing...  


## Using spaCy

In [4]:
import spacy
# Load the spaCy pipeline "en_core_web_sm" once at cell level; pos_tag_spacy
# reuses this `nlp` object for every sentence instead of reloading it.
# NOTE(review): presumably the model package must already be installed in the
# kernel's environment — confirm the setup instructions.
nlp = spacy.load("en_core_web_sm")

def pos_tag_spacy(text):
    """Run ``text`` through the module-level ``nlp`` pipeline.

    Returns a list of ``(token.text, token.pos_)`` tuples, one per token of
    the processed document, in document order.
    """
    tagged = []
    for tok in nlp(text):
        tagged.append((tok.text, tok.pos_))
    return tagged

# Tag every sentence with spaCy and store the result beside the original text,
# then preview the first rows of both columns.
df["POS_spaCy"] = df["Sentence"].apply(pos_tag_spacy)
print("- Using spaCy:", df.head()[["Sentence", "POS_spaCy"]], sep="\n")

- Using spaCy:
                                            Sentence  \
0       The quick brown fox jumps over the lazy dog.   
1  Artificial intelligence is transforming the wo...   
2  Machine learning models require large datasets...   
3  Deep learning architectures such as CNNs and R...   
4  Natural language processing enables machines t...   

                                           POS_spaCy  
0  [(The, DET), (quick, ADJ), (brown, ADJ), (fox,...  
1  [(Artificial, ADJ), (intelligence, NOUN), (is,...  
2  [(Machine, NOUN), (learning, NOUN), (models, N...  
3  [(Deep, PROPN), (learning, NOUN), (architectur...  
4  [(Natural, ADJ), (language, NOUN), (processing...  


# Bag of Words (BoW) and TF-IDF

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# One vectorizer per representation, each built with no arguments
# (i.e. the library's default settings).
vectorizer_bow, vectorizer_tfidf = CountVectorizer(), TfidfVectorizer()

In [6]:
# BoW Representation
# Fit the count vectorizer on the sentences, densify the resulting matrix and
# label its columns with the fitted vocabulary terms.
df_bow = pd.DataFrame(
    data=vectorizer_bow.fit_transform(df["Sentence"]).toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
)
print("- BoW Representation:", df_bow.head(), sep="\n")

- BoW Representation:
   action  and  architectures  are  artificial  as  brown  buy  by  cars  ...  \
0       0    0              0    0           0   0      1    0   0     0  ...   
1       0    0              0    0           1   0      0    0   0     0  ...   
2       0    0              0    0           0   0      0    0   0     0  ...   
3       0    1              1    1           0   1      0    0   0     0  ...   
4       0    0              0    0           0   0      0    0   0     0  ...   

   that  the  to  training  transforming  understand  urgent  use  went  world  
0     0    2   0         0             0           0       0    0     0      0  
1     0    1   0         0             1           0       0    0     0      1  
2     0    0   0         1             0           0       0    0     0      0  
3     0    0   0         0             0           0       0    0     0      0  
4     0    0   1         0             0           1       0    0     0      0  

[5 r

In [7]:
# TF-IDF Representation
# Fit the TF-IDF vectorizer on the sentences, densify the resulting matrix and
# label its columns with the fitted vocabulary terms.
df_tfidf = pd.DataFrame(
    data=vectorizer_tfidf.fit_transform(df["Sentence"]).toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)
print("- TF-IDF Representation:", df_tfidf.head(), sep="\n")

- TF-IDF Representation:
   action     and  architectures       are  artificial        as     brown  \
0     0.0  0.0000       0.000000  0.000000    0.000000  0.000000  0.344817   
1     0.0  0.0000       0.000000  0.000000    0.443885  0.000000  0.000000   
2     0.0  0.0000       0.000000  0.000000    0.000000  0.000000  0.000000   
3     0.0  0.2503       0.336547  0.336547    0.000000  0.336547  0.000000   
4     0.0  0.0000       0.000000  0.000000    0.000000  0.000000  0.000000   

   buy   by  cars  ...  that       the        to  training  transforming  \
0  0.0  0.0   0.0  ...   0.0  0.409520  0.000000  0.000000      0.000000   
1  0.0  0.0   0.0  ...   0.0  0.263588  0.000000  0.000000      0.443885   
2  0.0  0.0   0.0  ...   0.0  0.000000  0.000000  0.370732      0.000000   
3  0.0  0.0   0.0  ...   0.0  0.000000  0.000000  0.000000      0.000000   
4  0.0  0.0   0.0  ...   0.0  0.000000  0.305902  0.000000      0.000000   

   understand  urgent  use  went     world  
0   

# Word Embeddings (Word2Vec, FastText)

In [8]:
!pip3 install gensim



In [9]:
print("\nWord Embeddings:")
import gensim
from gensim.models import Word2Vec, FastText

# Request the tokenizer resources, then split every sentence into tokens.
# The Word2Vec and FastText cells train on this list of token lists.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
sentences = list(map(word_tokenize, df["Sentence"]))


Word Embeddings:


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/varuniexpress/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Word2Vec

In [10]:
# Train a Word2Vec model on the tokenized sentences and show one vector.
#
# Reproducibility: with several worker threads the order in which training
# examples are consumed depends on OS thread scheduling, so the printed vector
# could differ between runs. A single worker plus an explicit seed makes a
# re-run repeatable.
# NOTE: gensim's documentation states that fully deterministic runs may also
# require a fixed PYTHONHASHSEED for the interpreter.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,   # keep tokens that occur only once
    workers=1,     # single thread -> deterministic example ordering
    seed=1,        # gensim's documented default, spelled out for the reader
)
print("- Word2Vec Example (vector for 'learning'):")
print(word2vec_model.wv['learning'])

- Word2Vec Example (vector for 'learning'):
[-0.00714262  0.00123934 -0.00719911 -0.00224833  0.0037139   0.00582085
  0.00121568  0.00212919 -0.00413245  0.00721701 -0.00629886  0.0046435
 -0.00823621  0.00202604 -0.0049813  -0.00426119 -0.0030935   0.00564472
  0.00579024 -0.0049958   0.00077382 -0.00849237  0.00781627  0.00925238
 -0.00273513  0.00078918  0.00074849  0.0054746  -0.00861361  0.00058809
  0.00687229  0.00222359  0.00113784 -0.0093557   0.00848499 -0.00625204
 -0.00298735  0.00348573 -0.00076114  0.00139133  0.00178694 -0.00684534
 -0.00972519  0.00905068  0.00621701 -0.00690978  0.00338968  0.0002201
  0.00475308 -0.00712166  0.00403597  0.00433994  0.00995938 -0.00447299
 -0.00139421 -0.00732321 -0.00968426 -0.00909021 -0.00103136 -0.00650065
  0.0048594  -0.0061687   0.00253097  0.00073111 -0.00339446 -0.00096088
  0.0099858   0.00917453 -0.00449193  0.00908251 -0.00565186  0.00594476
 -0.00308296  0.00343732  0.00304221  0.00689994 -0.00238398  0.00877762
  0.00757

## FastText

In [11]:
# Train a FastText model on the tokenized sentences and show one vector.
#
# Reproducibility: with several worker threads the order in which training
# examples are consumed depends on OS thread scheduling, so the printed vector
# could differ between runs. A single worker plus an explicit seed makes a
# re-run repeatable.
# NOTE: gensim's documentation states that fully deterministic runs may also
# require a fixed PYTHONHASHSEED for the interpreter.
fasttext_model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,   # keep tokens that occur only once
    workers=1,     # single thread -> deterministic example ordering
    seed=1,        # gensim's documented default, spelled out for the reader
)
print("- FastText Example (vector for 'learning'):")
print(fasttext_model.wv['learning'])

- FastText Example (vector for 'learning'):
[ 8.41071014e-04 -1.44253936e-05 -1.40550779e-03 -1.95405955e-05
 -1.49504491e-03 -1.56484530e-05 -7.58774171e-04 -9.25881439e-04
  1.37625961e-03  7.78312678e-04 -4.12646274e-04  5.67153620e-04
 -1.24123076e-03  1.12083333e-03  4.30703018e-04 -1.62910367e-03
 -1.81642117e-03 -1.38210831e-03  5.55048347e-04 -2.03332305e-03
 -1.89027167e-03 -2.52464460e-03  1.56069861e-03  1.87282465e-04
 -9.09589464e-04 -8.62546614e-04  2.41440794e-04 -9.19878366e-04
 -5.59224281e-04 -1.27525767e-03 -1.46470428e-03  1.73418969e-03
  6.40327053e-04 -2.77742220e-04  4.63260541e-04  8.35091574e-04
 -5.92246652e-04  5.62027795e-04 -1.42553693e-03 -1.01097615e-03
 -1.19380211e-03 -1.53008569e-03 -1.01618865e-03  1.19788852e-03
 -8.55557097e-04 -4.37811468e-05  1.49167347e-04  2.23333671e-04
  2.30090140e-04  1.78240545e-04  1.63390697e-03 -6.22406893e-04
  2.85973547e-05  1.47500460e-03  8.02899071e-04  8.26830394e-04
 -4.73300810e-04  4.35493675e-05 -2.82393332e-

# Transformers (BERT-Based Embeddings)

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

# Build the tokenizer first and the model second, both from the same
# pretrained checkpoint name so the two always match.
tokenizer, model = (
    loader.from_pretrained("bert-base-uncased")
    for loader in (AutoTokenizer, AutoModel)
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def get_bert_embedding(text):
    """Return a pooled embedding for ``text`` from the module-level BERT model.

    The text is tokenized with the module-level ``tokenizer`` (with padding
    and truncation enabled), run through ``model`` without gradient tracking,
    and the last-hidden-state token vectors are averaged into one vector.

    The average is weighted by the attention mask so that padding positions,
    which appear when ``text`` is a batch of strings of different lengths, do
    not dilute the result. For a single string nothing is padded, so this is
    the plain mean over all of its tokens.

    Args:
        text: A string (or batch of strings) accepted by ``tokenizer``.

    Returns:
        The pooled embedding converted to a (possibly nested) Python list of
        floats, with size-1 dimensions squeezed away.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)
    hidden = output.last_hidden_state
    # Mask broadcast over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).squeeze().tolist()

# Embed every sentence with BERT, store the vectors beside the original text,
# and preview the first rows of both columns.
df["BERT_Embeddings"] = df["Sentence"].apply(get_bert_embedding)
print("- BERT Embeddings Example:", df.head()[["Sentence", "BERT_Embeddings"]], sep="\n")

print("Processing complete.")

- BERT Embeddings Example:
                                            Sentence  \
0       The quick brown fox jumps over the lazy dog.   
1  Artificial intelligence is transforming the wo...   
2  Machine learning models require large datasets...   
3  Deep learning architectures such as CNNs and R...   
4  Natural language processing enables machines t...   

                                     BERT_Embeddings  
0  [-0.01446607243269682, -0.07488731294870377, 0...  
1  [0.21177445352077484, -0.053160350769758224, -...  
2  [0.0969255343079567, -0.007682323455810547, -0...  
3  [-0.19142761826515198, -0.26351794600486755, 0...  
4  [-0.07791925966739655, 0.14749549329280853, -0...  
Processing complete.
