# NLP Unit 2 — Complete Notebook (Ready-to-run)


In [1]:
# INSTALL required packages (runs inside the notebook)
# This cell will install packages into the environment Jupyter is using.
import sys
!{sys.executable} -m pip install --quiet nltk scikit-learn gensim matplotlib pandas requests tqdm
print("Install commands executed. If any package failed, re-run this cell.")

Install commands executed. If any package failed, re-run this cell.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# NLTK setup: fetch every tokenizer/corpus resource the notebook needs in one place.
import nltk

# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases. Downloading both here keeps
# a fresh Restart & Run All from failing later in the cleaning cell.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords')

# nltk.download returns False when a package could not be fetched, so report
# failures instead of unconditionally claiming success.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    print("Failed to download NLTK resources:", failed_resources)
else:
    print("NLTK resources downloaded.")

NLTK resources downloaded.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rcvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rcvik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Fetch the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): this cell carries execution count 9 but sits above the cells
# numbered 6 and 7, so it was run out of order. Presumably word_tokenize in the
# cleaning cell needs this resource — confirm, and keep this download ahead of
# that cell so a top-to-bottom run works.
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rcvik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [6]:
# NOTE(review): in the visible notebook `df` is first created in the
# "Sample dataset" cell further down, so on a fresh kernel (Restart & Run All)
# this cell raises NameError. Move this preview below that cell or delete it.
df.head()


Unnamed: 0,text
0,Government announces new education policy to i...
1,Sports: local team wins the national champions...
2,Economy grows steadily as exports increase thi...
3,Technology companies release latest smartphone...
4,Health officials warn about seasonal flu and u...


## 1) Sample dataset (small)

In [7]:
# Toy corpus: ten one-line news-style headlines (replace with your own documents).
import pandas as pd

docs = [
    "Government announces new education policy to improve schools",
    "Sports: local team wins the national championship in a thrilling final",
    "Economy grows steadily as exports increase this quarter",
    "Technology companies release latest smartphones with better cameras",
    "Health officials warn about seasonal flu and urge vaccinations",
    "Environment: city plants 10,000 trees to fight air pollution",
    "Entertainment: blockbuster movie breaks box office records",
    "Education experts discuss reforms in higher education curriculum",
    "Researchers discover a new method to reduce carbon emissions",
    "Local startup raises funds to expand renewable energy business",
]

# One row per document, held in a single 'text' column.
df = pd.DataFrame(docs, columns=['text'])
df.head()

Unnamed: 0,text
0,Government announces new education policy to i...
1,Sports: local team wins the national champions...
2,Economy grows steadily as exports increase thi...
3,Technology companies release latest smartphone...
4,Health officials warn about seasonal flu and u...


## 2) Cleaning & Tokenization

In [10]:
import re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words, stored as a set for the membership tests in clean_text.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw document and tokenize it.

    Steps, in order: lowercase the text, collapse each whitespace run into a
    single space, delete every character that is not a-z, 0-9 or whitespace,
    tokenize with NLTK's ``word_tokenize``, and drop tokens found in
    ``stop_words``.

    Returns a ``(cleaned_string, token_list)`` tuple, where ``cleaned_string``
    is the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    alnum_only = re.sub(r'[^a-z0-9\s]', '', collapsed)

    kept_tokens = []
    for token in word_tokenize(alnum_only):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens), kept_tokens

# Run clean_text once per document, then split the (clean, tokens) pairs into
# two columns. Building each column from a list comprehension (instead of
# unpacking zip(*...)) also works when the frame has no rows, where
# `a, b = zip(*[])` fails with "not enough values to unpack".
cleaned_pairs = df['text'].apply(clean_text).tolist()
df['clean'] = [clean for clean, _ in cleaned_pairs]
df['tokens'] = [tokens for _, tokens in cleaned_pairs]
df.head()

Unnamed: 0,text,clean,tokens
0,Government announces new education policy to i...,government announces new education policy impr...,"[government, announces, new, education, policy..."
1,Sports: local team wins the national champions...,sports local team wins national championship t...,"[sports, local, team, wins, national, champion..."
2,Economy grows steadily as exports increase thi...,economy grows steadily exports increase quarter,"[economy, grows, steadily, exports, increase, ..."
3,Technology companies release latest smartphone...,technology companies release latest smartphone...,"[technology, companies, release, latest, smart..."
4,Health officials warn about seasonal flu and u...,health officials warn seasonal flu urge vaccin...,"[health, officials, warn, seasonal, flu, urge,..."


## 3) Count Vectorizer, TF-IDF, N-grams

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Three text vectorizers over the cleaned documents:
#   cv     - raw term counts with default settings
#   tfidf  - TF-IDF weights with default settings
#   cv_ng  - term counts over unigrams and bigrams (min_df=1 keeps every term)
cv = CountVectorizer()
tfidf = TfidfVectorizer()
cv_ng = CountVectorizer(min_df=1, ngram_range=(1, 2))

# Fit each vectorizer on the same cleaned text column.
X_cv = cv.fit_transform(df['clean'])
X_tf = tfidf.fit_transform(df['clean'])
X_ng = cv_ng.fit_transform(df['clean'])

# Report (n_documents, n_features) for each representation.
for label, matrix in (
    ("CountVectorizer shape:", X_cv),
    ("TF-IDF shape:", X_tf),
    ("With bi-grams shape:", X_ng),
):
    print(label, matrix.shape)

# Peek at the first 20 feature names learned by the plain count vectorizer.
print("\nSample features (first 20):", cv.get_feature_names_out()[:20])

CountVectorizer shape: (10, 68)
TF-IDF shape: (10, 68)
With bi-grams shape: (10, 130)

Sample features (first 20): ['10000' 'air' 'announces' 'better' 'blockbuster' 'box' 'breaks'
 'business' 'cameras' 'carbon' 'championship' 'city' 'companies'
 'curriculum' 'discover' 'discuss' 'economy' 'education' 'emissions'
 'energy']


## 4) Train small Word2Vec (demo)

In [12]:
from gensim.models import Word2Vec

# Each document's token list is one training "sentence" for Word2Vec.
sentences = df['tokens'].tolist()

# workers=1: with a fixed seed, gensim only produces run-to-run reproducible
# vectors when training is single-threaded; multiple worker threads introduce
# ordering jitter from OS thread scheduling. (Across interpreter launches,
# PYTHONHASHSEED may also need to be fixed for full determinism.)
# min_count=1 keeps every word, since the demo corpus is tiny.
w2v = Word2Vec(sentences, vector_size=50, window=3, min_count=1, workers=1, epochs=100, seed=42)
print("Trained Word2Vec on small corpus. Vocab size:", len(w2v.wv.index_to_key))

# example similarity
# The guard avoids a KeyError for probe words that never appear in the corpus.
for w in ['education','technology','energy','health','team']:
    if w in w2v.wv:
        print(f"Top similar to {w}:", w2v.wv.most_similar(w, topn=5))
    else:
        print(f"{w} not in vocab (trained).")

Trained Word2Vec on small corpus. Vocab size: 68
Top similar to education: [('government', 0.4143822193145752), ('urge', 0.34158065915107727), ('national', 0.2501566708087921), ('city', 0.2418750524520874), ('companies', 0.23779667913913727)]
Top similar to technology: [('health', 0.2729586660861969), ('grows', 0.26984816789627075), ('box', 0.23557914793491364), ('seasonal', 0.2289569228887558), ('researchers', 0.22848105430603027)]
Top similar to energy: [('movie', 0.2762676477432251), ('curriculum', 0.2741406261920929), ('box', 0.21177679300308228), ('thrilling', 0.20857055485248566), ('increase', 0.20815494656562805)]
Top similar to health: [('increase', 0.3330708146095276), ('warn', 0.28281116485595703), ('seasonal', 0.2807413339614868), ('technology', 0.2729586660861969), ('local', 0.2531234920024872)]
Top similar to team: [('new', 0.3971807658672333), ('higher', 0.3221035897731781), ('vaccinations', 0.2398131936788559), ('announces', 0.22932805120944977), ('renewable', 0.22028471

## 5) Pre-trained GloVe (download, convert, load)

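This cell downloads the pre-trained GloVe 6B archive (`glove.6B.zip`, roughly 822 MB), extracts it, converts the 100-dimensional vectors to word2vec text format, and loads them as gensim `KeyedVectors`. The download can take several minutes and requires internet access when you run the notebook locally.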

In [None]:
# Download (once), extract, convert and load the pre-trained GloVe 6B 100-d vectors.
# NOTE: glove.6B.zip is a single ~822MB archive. Switching to another dimension
# changes which extracted file is loaded, not the size of the download.
import os
import requests, zipfile, io
from tqdm import tqdm

glove_dir = "glove"
os.makedirs(glove_dir, exist_ok=True)
glove_zip_path = os.path.join(glove_dir, "glove.6B.zip")
glove_txt_100d = os.path.join(glove_dir, "glove.6B.100d.txt")
word2vec_output = os.path.join(glove_dir, "glove.6B.100d.word2vec.txt")

# If glove text not already present, download (when needed) and extract
if not os.path.isfile(glove_txt_100d):
    # is_zipfile is False for a missing file and for a truncated archive left
    # behind by an interrupted download, so both cases trigger a fresh download
    # while a complete archive from an earlier run is reused.
    if not zipfile.is_zipfile(glove_zip_path):
        print("Downloading GloVe (this may be large, ~822MB). If you want smaller, edit the notebook to use 50d.")
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        # Stream into a temporary ".part" file and rename only after the last
        # byte is written, so the final path never holds a partial archive.
        partial_zip_path = glove_zip_path + ".part"
        # timeout stops the cell hanging forever on a stalled connection;
        # the context manager releases the connection when done.
        with requests.get(url, stream=True, timeout=60) as r:
            # Fail fast on HTTP errors instead of saving an error page as the zip.
            r.raise_for_status()
            # A missing content-length becomes None so tqdm shows an open-ended bar.
            total = int(r.headers.get('content-length', 0)) or None
            with open(partial_zip_path, 'wb') as f, tqdm(total=total, unit='B', unit_scale=True, desc='Downloading GloVe') as pbar:
                # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        os.replace(partial_zip_path, glove_zip_path)
    print("Extracting...")
    with zipfile.ZipFile(glove_zip_path, 'r') as z:
        # Only the file this notebook loads is unpacked; the other dimensions
        # stay inside the archive to save disk space.
        z.extract(os.path.basename(glove_txt_100d), glove_dir)
    print("GloVe extracted.")

# Convert to word2vec format (if needed) and load using gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

if not os.path.isfile(word2vec_output):
    print("Converting GloVe to word2vec format...")
    glove2word2vec(glove_txt_100d, word2vec_output)
    print("Conversion done.")

print("Loading embeddings (this may take time)...")
glove_kv = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)
print("Loaded GloVe vectors. Vocab size:", len(glove_kv.index_to_key))

Downloading GloVe (this may be large, ~822MB). If you want smaller, edit the notebook to use 50d.


Downloading GloVe:   2%|█                                                          | 15.3M/862M [00:10<06:45, 2.09MB/s]

## 6) Use pre-trained embeddings: similarity and examples

In [None]:
# Query the pre-trained GloVe vectors for the nearest neighbours of a few probe words.
# The whole lookup sits inside the try block so that a missing `glove_kv`
# (download/load cell not run) produces a hint rather than a traceback.
try:
    model = glove_kv
    words = ['king','queen','man','woman','apple','banana','education','technology','energy','health']
    for w in words:
        if w not in model:
            print("\nWord not in GloVe vocab:", w)
            continue
        print("\nTop similar to", w, "->", model.most_similar(w, topn=5))
except NameError:
    print("GloVe model not loaded. Run the previous cell (download+load).")

## 7) Visualize embeddings (selected words)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Choose a set of words to visualize (only those present in model)
def visualize_words(model, words_to_plot, method='pca', perplexity=30):
    """Project selected word vectors to 2-D and draw a labelled scatter plot.

    Parameters
    ----------
    model
        Embedding lookup supporting ``word in model`` and ``model[word]``.
    words_to_plot
        Iterable of words; words missing from ``model`` are skipped.
    method
        ``'pca'`` uses PCA; any other value uses t-SNE.
    perplexity
        Requested t-SNE perplexity (unused for PCA). scikit-learn requires
        perplexity < number of samples, so the value is lowered to
        ``n_words - 1`` when the request is too large for the word list.

    Returns
    -------
    None. The figure is shown with ``plt.show()``. If fewer than two of the
    requested words are in the vocabulary, a message is printed and nothing
    is plotted.
    """
    words = [w for w in words_to_plot if w in model]
    # A 2-component projection needs at least two samples.
    if len(words) < 2:
        print(f"Need at least 2 in-vocabulary words to plot; found {len(words)}.")
        return
    vecs = np.array([model[w] for w in words])
    if method=='pca':
        reducer = PCA(n_components=2, random_state=42)
    else:
        # The default of 30 exceeds a short word list and would raise ValueError.
        effective_perplexity = min(perplexity, len(words) - 1)
        reducer = TSNE(n_components=2, random_state=42, init='pca', perplexity=effective_perplexity)
    coords = reducer.fit_transform(vecs)
    plt.figure(figsize=(8,6))
    plt.scatter(coords[:,0], coords[:,1])
    for i,w in enumerate(words):
        # Small offset so the label does not sit on top of its marker.
        plt.text(coords[i,0]+0.02, coords[i,1]+0.02, w)
    plt.title(f"{method.upper()} projection of selected embeddings")
    plt.show()

# Probe words to project, loosely grouped by theme.
words_to_plot = [
    'king', 'queen', 'man', 'woman', 'prince', 'princess',
    'apple', 'banana', 'orange',
    'education', 'school', 'student',
    'technology', 'computer', 'phone',
    'energy', 'renewable', 'solar', 'wind',
    'health', 'hospital',
]

# Draw the PCA projection first, then t-SNE. A NameError here means `glove_kv`
# was never created by the download/load cell.
try:
    for projection in ('pca', 'tsne'):
        visualize_words(glove_kv, words_to_plot, method=projection)
except NameError:
    print("Pre-trained embeddings not found. Run download cell.")

## 8) Save cleaned dataset and vectors (optional)

In [None]:
# Persist the cleaned DataFrame (without its index) and the small Word2Vec
# model trained above into the current working directory.
csv_path = "cleaned_texts.csv"
model_path = "small_word2vec.model"

df.to_csv(csv_path, index=False)
w2v.save(model_path)
print("Saved cleaned_texts.csv and small_word2vec.model in current folder.")