# **Feature Extraction in NLP**

In [1]:
# Install libraries if needed
!pip install nltk scikit-learn gensim transformers torch

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━

In [1]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from gensim.downloader import load as gensim_load
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Download the NLTK 'punkt' tokenizer package into the local nltk_data directory.
# The call's return value is the cell's last expression, so it is displayed as output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Sample Corpus**

In [3]:
# Three short English documents; every feature-extraction section below reuses this list.
corpus = [
    "Natural Language Processing with Python is fun",
    "Python is great for text mining and machine learning",
    "I enjoy learning NLP and feature extraction"
]

In [4]:
# Echo each document with a 1-based "DocN" label.
print("Sample Corpus:")
for position, text in enumerate(corpus, start=1):
    print(f"Doc{position}: {text}")

Sample Corpus:
Doc1: Natural Language Processing with Python is fun
Doc2: Python is great for text mining and machine learning
Doc3: I enjoy learning NLP and feature extraction


## **Bag of Words (BoW)**

**Math:**

$$
V_d = [f_1, f_2, \dots, f_n], \quad f_i = \text{frequency of the } i\text{-th vocabulary word in document } d
$$

In [5]:
# Learn the vocabulary from the corpus, then encode every document as term counts.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

In [6]:
# Densify the count matrix and label each column with its vocabulary term.
bow_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
bow_df

Unnamed: 0,and,enjoy,extraction,feature,for,fun,great,is,language,learning,machine,mining,natural,nlp,processing,python,text,with
0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,1,0,1
1,1,0,0,0,1,0,1,1,0,1,1,1,0,0,0,1,1,0
2,1,1,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0


## **TF-IDF**

**Math:**

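$$
TF(t,d) = \frac{f_{t,d}}{\sum_k f_{k,d}}, \quad
IDF(t) = \log \frac{N}{1+n_t}, \quad
w_{t,d} = TF \times IDF
$$

**Note:** the formula above is the textbook definition. scikit-learn's `TfidfVectorizer` (default settings, used below) computes a variant: raw term counts for $TF$, a smoothed $IDF(t) = \ln \frac{1+N}{1+n_t} + 1$, and L2-normalisation of each document row. The values in the table therefore will not match a hand calculation with the textbook formula.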

In [7]:
# Fit the TF-IDF vocabulary and weights on the corpus, then encode the same documents.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)
X_tfidf = tfidf_vectorizer.transform(corpus)

In [8]:
# Densify the TF-IDF matrix and label each column with its vocabulary term.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,and,enjoy,extraction,feature,for,fun,great,is,language,learning,machine,mining,natural,nlp,processing,python,text,with
0,0.0,0.0,0.0,0.0,0.0,0.403016,0.0,0.306504,0.403016,0.0,0.0,0.0,0.403016,0.0,0.403016,0.306504,0.0,0.403016
1,0.281221,0.0,0.0,0.0,0.369772,0.0,0.369772,0.281221,0.0,0.281221,0.369772,0.369772,0.0,0.0,0.0,0.281221,0.369772,0.0
2,0.334907,0.440362,0.440362,0.440362,0.0,0.0,0.0,0.0,0.0,0.334907,0.0,0.0,0.0,0.440362,0.0,0.0,0.0,0.0


## **Word Embeddings – Word2Vec**

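**Idea:** Words that appear in similar contexts get similar vectors.

Skip-gram objective (note: gensim's `Word2Vec` trains CBOW by default, `sg=0`; pass `sg=1` to train with this skip-gram objective):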

$$
L = - \sum_{t=1}^{T} \sum_{-c \leq j \leq c, j \neq 0} \log P(w_{t+j} | w_t)
$$

In [10]:
# word_tokenize in recent NLTK releases loads its Punkt data from the 'punkt_tab'
# package rather than from 'punkt', so fetch it before tokenizing.
# nltk is already imported in the imports cell at the top of the notebook, so it is
# not re-imported here.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
# Lower-case each document, then split it into word tokens with NLTK.
tokenized_corpus = [nltk.word_tokenize(lowered) for lowered in map(str.lower, corpus)]

In [12]:
# Train Word2Vec model.
# Reproducibility: gensim only yields repeatable vectors with a fixed seed AND a single
# worker thread, because multi-threaded training applies updates in a scheduler-dependent
# order. seed=1 is gensim's default, spelled out here so the choice is visible.
# For bit-identical results across interpreter restarts, PYTHONHASHSEED must be fixed too.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,   # dimensionality of each word vector
    window=3,         # context words considered on each side of the target
    min_count=1,      # keep every word; the corpus is tiny
    workers=1,
    seed=1,
)

In [13]:
# Show the first 10 components of the vector learned for "python".
print(
    "Word2Vec embedding for 'python' (first 10 dims):",
    w2v_model.wv['python'][:10],
    sep="\n",
)

Word2Vec embedding for 'python' (first 10 dims):
[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702]


In [14]:
# Similarity score the trained model reports for two vocabulary words.
python_nlp_similarity = w2v_model.wv.similarity('python', 'nlp')
print("\nSimilarity between 'python' and 'nlp':", python_nlp_similarity)


Similarity between 'python' and 'nlp': -0.011612347


## **Pre-trained Embeddings – GloVe**

**Idea:** Word vectors learned from co-occurrence statistics.

In [15]:
# Load pre-trained GloVe embeddings (50-dim) through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and reads a local
# cache afterwards — confirm before running offline.
glove = gensim_load("glove-wiki-gigaword-50")



In [16]:
# Show the first 10 components of the pre-trained GloVe vector for "python".
python_glove_head = glove['python'][:10]
print("GloVe embedding for 'python' (first 10 dims):")
print(python_glove_head)

GloVe embedding for 'python' (first 10 dims):
[ 0.5897  -0.55043 -1.0106   0.41226  0.57348  0.23464 -0.35773 -1.78
  0.10745  0.74913]


In [17]:
# Analogy example: king - man + woman ≈ queen
# 'positive' words are added to the query and 'negative' words subtracted from it.
result = glove.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
)
top_three = result[:3]
print("\nAnalogy test (king - man + woman):", top_three)


Analogy test (king - man + woman): [('queen', 0.8523604273796082), ('throne', 0.7664334177970886), ('prince', 0.7592144012451172)]


## **Contextual Embeddings – BERT**

**Math:**

$$
\text{Attention}(Q,K,V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

In [18]:
# Load pre-trained BERT: tokenizer and encoder come from the same checkpoint name.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [19]:
# Encode one example sentence; return_tensors="pt" asks for PyTorch tensors.
sentence = "I love learning NLP with Python"
inputs = tokenizer(
    text=sentence,
    return_tensors="pt",
)

In [20]:
# Get embeddings: run a single forward pass over the encoded sentence.
# torch.no_grad() disables gradient tracking, which is not needed because nothing is
# being trained here.
with torch.no_grad():
    outputs = model(**inputs)

In [21]:
# Keep the final layer's hidden state for every token and report its dimensions.
last_hidden_states = outputs.last_hidden_state
embedding_shape = last_hidden_states.shape  # (batch, seq_len, hidden_dim=768)
print("BERT Embeddings Shape:", embedding_shape)

BERT Embeddings Shape: torch.Size([1, 9, 768])


In [22]:
# Example: vector for token "Python"
# The 'bert-base-uncased' tokenizer lower-cases its input, so the word piece to look
# for is "python".
target_token = "python"

# Search the actual word pieces of the encoded sentence. Converting the word to a
# vocabulary id first would map any out-of-vocabulary (or multi-piece) word to [UNK],
# which could match the wrong position or fail with an opaque "x is not in list".
sentence_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
if target_token not in sentence_tokens:
    raise ValueError(
        f"{target_token!r} is not a single word piece of the sentence; got {sentence_tokens}"
    )
token_index = sentence_tokens.index(target_token)

print("\nBERT embedding for 'Python' (first 10 dims):")
print(last_hidden_states[0, token_index, :10])


BERT embedding for 'Python' (first 10 dims):
tensor([-0.1367,  0.4536, -0.4612, -0.6565, -0.2077, -0.6414,  0.4445,  1.2419,
        -0.8451,  0.2756])
