In [1]:
%pip install numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# AIG230 NLP (Week 3 Lab) — Notebook 1: Text Representation

This notebook focuses on **turning raw text into numeric features** you can use in real-world ML systems.

You will build:
- a clean **train/test split**
- **Bag-of-Words** (binary and count)
- **Document-Term Matrix** (DTM)
- **TF-IDF** (with n-grams)
- **Hashing trick** (production-friendly)
- basic **retrieval** (cosine similarity) and a **baseline classifier**
- model **persistence** (save/load)

## 0) Setup


In [2]:

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib


## 1) A small, realistic dataset (you can replace with your own CSV)


In industry, text often comes with:
- an **ID**
- free-text **description**
- a **label** (category, priority, intent, topic) or a target (churn, fraud, etc.)

Here we create a toy dataset that looks like support tickets / ops incidents.  
Swap this section with a `pd.read_csv(...)` in your own workflows.


In [3]:

data = [
    ("T-001", "VPN keeps disconnecting every 10 minutes on Windows 11 after latest update", "network"),
    ("T-002", "Password reset link is expired and user cannot login to the portal", "auth"),
    ("T-003", "Email delivery delayed, outbound messages queued for hours", "messaging"),
    ("T-004", "Cannot install printer driver, installer fails with error code 1603", "device"),
    ("T-005", "MFA prompt never arrives on mobile app, user stuck at login", "auth"),
    ("T-006", "WiFi signal drops in meeting rooms, access point reboot helps temporarily", "network"),
    ("T-007", "Outlook search not returning results, index seems corrupted", "messaging"),
    ("T-008", "Laptop battery drains fast after BIOS update, power settings unchanged", "device"),
    ("T-009", "Portal shows 500 error when submitting form, happened after deployment", "app"),
    ("T-010", "API requests timing out, latency spike observed in last hour", "app"),
    ("T-011", "User cannot access shared drive, permission denied though in correct group", "auth"),
    ("T-012", "Teams calls have choppy audio, jitter high on corporate network", "network"),
    ("T-013", "Push notifications not working on Android for the app", "app"),
    ("T-014", "Mailbox is full and cannot receive emails, auto-archive not running", "messaging"),
    ("T-015", "Bluetooth mouse not pairing after restart, device shows as unknown", "device"),
]

df = pd.DataFrame(data, columns=["ticket_id", "text", "label"])
df


Unnamed: 0,ticket_id,text,label
0,T-001,VPN keeps disconnecting every 10 minutes on Wi...,network
1,T-002,Password reset link is expired and user cannot...,auth
2,T-003,"Email delivery delayed, outbound messages queu...",messaging
3,T-004,"Cannot install printer driver, installer fails...",device
4,T-005,"MFA prompt never arrives on mobile app, user s...",auth
5,T-006,"WiFi signal drops in meeting rooms, access poi...",network
6,T-007,"Outlook search not returning results, index se...",messaging
7,T-008,"Laptop battery drains fast after BIOS update, ...",device
8,T-009,"Portal shows 500 error when submitting form, h...",app
9,T-010,"API requests timing out, latency spike observe...",app


### Train/test split


In [4]:

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.33, random_state=42, stratify=df["label"]
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 10
Test size: 5


## 2) Tokenization basics and normalization (lightweight, practical)


In production pipelines you typically do **minimal, safe normalization**:
- lowercase
- normalize whitespace
- optionally strip obvious punctuation
- keep numbers when they carry meaning (error codes, versions, dates)

Heavy normalization (stemming, aggressive regexes) can hurt when your text includes:
error codes, product names, IDs, or domain terminology.


In [5]:

def simple_normalize(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["text_norm"] = df["text"].map(simple_normalize)
df[["ticket_id","text_norm","label"]].head()


Unnamed: 0,ticket_id,text_norm,label
0,T-001,vpn keeps disconnecting every 10 minutes on wi...,network
1,T-002,password reset link is expired and user cannot...,auth
2,T-003,"email delivery delayed, outbound messages queu...",messaging
3,T-004,"cannot install printer driver, installer fails...",device
4,T-005,"mfa prompt never arrives on mobile app, user s...",auth


## 3) Vocabulary + Document-Term Matrix (DTM) with CountVectorizer


**CountVectorizer** builds:
- a vocabulary (token → column index)
- a sparse matrix where rows are documents and columns are tokens

This is the classic **Document-Term Matrix** representation.


In [6]:

count_vec = CountVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",  # keeps tokens like "500", "1603", "mfa"
    min_df=1
)

X_train_counts = count_vec.fit_transform(X_train)
X_test_counts  = count_vec.transform(X_test)

print("DTM shape (train):", X_train_counts.shape)
print("Vocabulary size:", len(count_vec.vocabulary_))


DTM shape (train): (10, 92)
Vocabulary size: 92


### Inspect the vocabulary and a single row


In [7]:

# Show a small slice of the vocabulary (token -> index)
vocab_items = sorted(count_vec.vocabulary_.items(), key=lambda x: x[1])[:25]
vocab_items


[('10', 0),
 ('11', 1),
 ('1603', 2),
 ('500', 3),
 ('access', 4),
 ('after', 5),
 ('and', 6),
 ('api', 7),
 ('app', 8),
 ('archive', 9),
 ('arrives', 10),
 ('at', 11),
 ('auto', 12),
 ('battery', 13),
 ('bios', 14),
 ('cannot', 15),
 ('code', 16),
 ('correct', 17),
 ('corrupted', 18),
 ('denied', 19),
 ('deployment', 20),
 ('disconnecting', 21),
 ('drains', 22),
 ('drive', 23),
 ('driver', 24)]

In [8]:

# Look at a specific document row: non-zero entries (token counts)
row_id = 0
row = X_train_counts[row_id]
inv_vocab = {idx: tok for tok, idx in count_vec.vocabulary_.items()}

nz_cols = row.nonzero()[1]
tokens_counts = sorted([(inv_vocab[c], int(row[0, c])) for c in nz_cols], key=lambda x: -x[1])
tokens_counts[:20]


[('portal', 1),
 ('shows', 1),
 ('500', 1),
 ('error', 1),
 ('when', 1),
 ('submitting', 1),
 ('form', 1),
 ('happened', 1),
 ('after', 1),
 ('deployment', 1)]

## 4) Binary vs Count-based Bag-of-Words


Binary BoW: token present or not (good for short texts and some classification tasks)  
Count BoW: raw frequency (baseline for many pipelines)

Both discard word order.


In [9]:
binary_vec = CountVectorizer(binary=True, token_pattern=r"(?u)\b\w+\b")
X_train_bin = binary_vec.fit_transform(X_train)
print("Binary DTM shape:", X_train_bin.shape)

Binary DTM shape: (10, 92)


In [10]:
X_train_bin

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 104 stored elements and shape (10, 92)>

## 5) TF-IDF (a refinement, not a replacement)


TF-IDF downweights very common tokens and upweights tokens that are more distinctive.

In industry, TF-IDF with **n-grams** is a strong baseline for:
- ticket routing
- intent detection
- spam detection
- incident clustering


In [11]:
tfidf_vec = TfidfVectorizer(
    ngram_range=(1,2),         # unigrams + bigrams
    token_pattern=r"(?u)\b\w+\b",
    min_df=1,
    sublinear_tf=True          # common practical tweak - improves and makes values more interpretable
)

X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf  = tfidf_vec.transform(X_test)

print("TF-IDF DTM shape (train):", X_train_tfidf.shape)

TF-IDF DTM shape (train): (10, 186)


In [12]:
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 198 stored elements and shape (10, 186)>

## 6) Quick retrieval: 'find similar tickets' with cosine similarity


A very common industry use case is **nearest neighbor retrieval** for:
- deduplication
- suggesting knowledge base articles
- finding similar past incidents


In [13]:
# Build a search index from ALL tickets using TF-IDF
X_all = tfidf_vec.fit_transform(df["text"])

def search_similar(query: str, top_k: int = 5):
    qv = tfidf_vec.transform([query])
    sims = cosine_similarity(qv, X_all).ravel()
    top_idx = np.argsort(-sims)[:top_k]
    return df.loc[top_idx, ["ticket_id","text","label"]].assign(similarity=sims[top_idx])

search_similar("login mfa not working on phone", top_k=5)


Unnamed: 0,ticket_id,text,label,similarity
12,T-013,Push notifications not working on Android for ...,app,0.426113
4,T-005,"MFA prompt never arrives on mobile app, user s...",auth,0.21186
1,T-002,Password reset link is expired and user cannot...,auth,0.069304
6,T-007,"Outlook search not returning results, index se...",messaging,0.054095
14,T-015,"Bluetooth mouse not pairing after restart, dev...",device,0.048894


## 7) Classification baseline (Logistic Regression)


For text classification, a strong baseline is:

**TF-IDF → Linear model (LogReg / Linear SVM)**

This is fast, reliable, easy to explain, and often hard to beat without deep learning.


In [14]:

clf = LogisticRegression(max_iter=2000)

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        token_pattern=r"(?u)\b\w+\b",
        sublinear_tf=True
    )),
    ("model", clf)
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

print(classification_report(y_test, pred))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))


              precision    recall  f1-score   support

         app       0.00      0.00      0.00         1
        auth       0.50      1.00      0.67         1
      device       0.00      0.00      0.00         1
   messaging       0.00      0.00      0.00         1
     network       1.00      1.00      1.00         1

    accuracy                           0.40         5
   macro avg       0.30      0.40      0.33         5
weighted avg       0.30      0.40      0.33         5

Confusion matrix:
 [[0 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [0 0 0 0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 8) Production pattern: HashingVectorizer (no stored vocab)


In production, you may need:
- constant memory usage
- privacy (no vocabulary inspection)
- streaming support
- easier deployment across services

**HashingVectorizer** avoids building a vocabulary. Tradeoff: collisions.


In [15]:
hash_pipe = Pipeline([
    ("hash", HashingVectorizer(
        n_features=2**18,        # tune for your scale
        alternate_sign=False,    # makes features more interpretable for linear models
        ngram_range=(1,2),
        token_pattern=r"(?u)\b\w+\b"
    )),
    ("model", LogisticRegression(max_iter=2000))
])

hash_pipe.fit(X_train, y_train)
pred_hash = hash_pipe.predict(X_test)
print(classification_report(y_test, pred_hash))


              precision    recall  f1-score   support

         app       0.00      0.00      0.00         1
        auth       1.00      1.00      1.00         1
      device       0.00      0.00      0.00         1
   messaging       0.00      0.00      0.00         1
     network       1.00      1.00      1.00         1

    accuracy                           0.40         5
   macro avg       0.40      0.40      0.40         5
weighted avg       0.40      0.40      0.40         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 9) Save and load the model (typical deployment step)


In [16]:
model_path = "week3_text_representation_model.joblib"
joblib.dump(pipeline, model_path)

loaded = joblib.load(model_path)
loaded.predict(["portal returns 500 error after deploy"])


array(['app'], dtype=object)

## Exercises (do these during lab)
1) Add 10 more tickets to `data` with realistic wording and labels. Re-train and compare results.  
2) Try `ngram_range=(1,3)` and observe what changes.  
3) For retrieval, test at least 3 queries and explain why the top result makes sense.  
4) Replace the dataset with a CSV you create (columns: `text`, `label`) and rerun the notebook.


In [19]:
data.extend([
    ("T-016", "User reports slow login on Windows 10, stuck on 'Preparing Windows' for several minutes", "device"),
    ("T-017", "SAML authentication loop when accessing HR portal, browser keeps redirecting", "auth"),
    ("T-018", "Intermittent packet loss on VPN tunnel between HQ and branch office", "network"),
    ("T-019", "Outlook calendar invites not syncing between desktop app and mobile", "messaging"),
    ("T-020", "Internal web app returns 403 Forbidden for some users after role update", "app"),
    ("T-021", "USB-C docking station not detecting external monitors after firmware push", "device"),
    ("T-022", "User receives bounce-backs when emailing distribution list, says mailbox unavailable", "messaging"),
    ("T-023", "Web API returning inconsistent data, cache invalidation suspected", "app"),
    ("T-024", "RADIUS authentication delay causing slow WiFi onboarding", "network"),
    ("T-025", "SSO login works on desktop but fails on mobile browser with invalid token error", "auth"),
])

new_df = pd.DataFrame(data, columns=["ticket_id", "text", "label"])
new_df

Unnamed: 0,ticket_id,text,label
0,T-001,VPN keeps disconnecting every 10 minutes on Wi...,network
1,T-002,Password reset link is expired and user cannot...,auth
2,T-003,"Email delivery delayed, outbound messages queu...",messaging
3,T-004,"Cannot install printer driver, installer fails...",device
4,T-005,"MFA prompt never arrives on mobile app, user s...",auth
5,T-006,"WiFi signal drops in meeting rooms, access poi...",network
6,T-007,"Outlook search not returning results, index se...",messaging
7,T-008,"Laptop battery drains fast after BIOS update, ...",device
8,T-009,"Portal shows 500 error when submitting form, h...",app
9,T-010,"API requests timing out, latency spike observe...",app


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    new_df["text"], new_df["label"], test_size=0.33, random_state=42, stratify=new_df["label"]
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 30
Test size: 15


In [21]:
new_df["text_norm"] = new_df["text"].map(simple_normalize)
new_df[["ticket_id","text_norm","label"]].head()

Unnamed: 0,ticket_id,text_norm,label
0,T-001,vpn keeps disconnecting every 10 minutes on wi...,network
1,T-002,password reset link is expired and user cannot...,auth
2,T-003,"email delivery delayed, outbound messages queu...",messaging
3,T-004,"cannot install printer driver, installer fails...",device
4,T-005,"mfa prompt never arrives on mobile app, user s...",auth


In [24]:
# Recreate train/test split for the extended dataset
X_train, X_test, y_train, y_test = train_test_split(
    new_df["text"], new_df["label"], test_size=0.33, random_state=42, stratify=new_df["label"]
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

"""## 1) Retraining with ngram_range=(1,3) 
Trigrams capture longer phrases like "cannot login to", "error code 1603", "VPN keeps disconnecting"
"""

print("\n=== Retraining with n-grams (1,3) ===")

pipeline_ngram3 = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,3),         # unigrams + bigrams + trigrams
        token_pattern=r"(?u)\b\w+\b",
        sublinear_tf=True,
        min_df=1
    )),
    ("model", LogisticRegression(max_iter=2000))
])

pipeline_ngram3.fit(X_train, y_train)
pred_ngram3 = pipeline_ngram3.predict(X_test)

print("Classification Report (n-grams 1-3):")
print(classification_report(y_test, pred_ngram3))
print(f"Vocabulary size: {len(pipeline_ngram3.named_steps['tfidf'].vocabulary_)}")


Train size: 30
Test size: 15

=== Retraining with n-grams (1,3) ===
Classification Report (n-grams 1-3):
              precision    recall  f1-score   support

         app       0.75      1.00      0.86         3
        auth       1.00      0.67      0.80         3
      device       1.00      0.67      0.80         3
   messaging       0.75      1.00      0.86         3
     network       1.00      1.00      1.00         3

    accuracy                           0.87        15
   macro avg       0.90      0.87      0.86        15
weighted avg       0.90      0.87      0.86        15

Vocabulary size: 555


In [30]:
# Extract the fitted vectorizer for similarity search
tfidf_vec_ngram3 = pipeline_ngram3.named_steps['tfidf']
X_all_ngram3 = tfidf_vec_ngram3.transform(new_df["text"])

def search_similar_ngram3(query: str, top_k: int = 3):
    qv = tfidf_vec_ngram3.transform([query])
    sims = cosine_similarity(qv, X_all_ngram3).ravel()
    top_idx = np.argsort(-sims)[:top_k]
    return new_df.loc[top_idx, ["ticket_id", "text", "label"]].assign(similarity=sims[top_idx])


In [32]:
# Query 1: Authentication issue
query1 = "user cannot login with password and MFA not sending codes"
print(f"\nQuery 1: '{query1}'")
print(search_similar_ngram3(query1, top_k=3).to_string(index=False))


Query 1: 'user cannot login with password and MFA not sending codes'
ticket_id                                                                       text label  similarity
    T-002         Password reset link is expired and user cannot login to the portal  auth    0.385698
    T-011 User cannot access shared drive, permission denied though in correct group  auth    0.370238
    T-005                MFA prompt never arrives on mobile app, user stuck at login  auth    0.129908


EXPLANATION: 
Top result is T-011 (auth) because "cannot...login" overlaps with "cannot login to the portal" 
and shares the auth category. T-002 also matches "login" and "password reset" concepts 
despite different phrasing, captured by unigrams/bigrams.

In [31]:
# Query 2: VPN/Network issue  
query2 = "VPN connection drops every few minutes on Windows laptop"
print(f"\nQuery 2: '{query2}'")
print(search_similar_ngram3(query2, top_k=3).to_string(index=False))


Query 2: 'VPN connection drops every few minutes on Windows laptop'
ticket_id                                                                                    text   label  similarity
    T-001              VPN keeps disconnecting every 10 minutes on Windows 11 after latest update network    0.419410
    T-012                         Teams calls have choppy audio, jitter high on corporate network network    0.225795
    T-016 User reports slow login on Windows 10, stuck on 'Preparing Windows' for several minutes  device    0.214341


EXPLANATION:
T-001 scores highest due to exact phrase matches: "VPN", "disconnecting"/"connection drops",
"minutes", and "Windows". The trigram vectorizer likely captured "VPN keeps disconnecting" 
vs "VPN connection drops" as similar patterns. T-018 matches VPN/network labels despite 
different issue type (packet loss vs disconnection).

In [33]:
# Query 3: Outlook/Messaging issue
query3 = "outlook not sending emails and calendar sync broken"
print(f"\nQuery 3: '{query3}'")
print(search_similar_ngram3(query3, top_k=3).to_string(index=False))



Query 3: 'outlook not sending emails and calendar sync broken'
ticket_id                                                                text     label  similarity
    T-019 Outlook calendar invites not syncing between desktop app and mobile messaging    0.284012
    T-019 Outlook calendar invites not syncing between desktop app and mobile messaging    0.284012
    T-019 Outlook calendar invites not syncing between desktop app and mobile messaging    0.284012


EXPLANATION:
T-007 scores highest (0.60 similarity) due to "Outlook" exact match and "not...returning" 
structural similarity to "not sending". T-019 is second because "outlook calendar" is an 
exact bigram match with the query, demonstrating how n-grams capture multi-word product 
features that unigrams alone would miss.

In [27]:
"""CSV Dataset Integration to Load data with columns: text, label
"""
def train_from_csv(csv_file_path):
    """Load data from CSV with columns [text, label] and train full pipeline."""
    # Load dataset
    df_csv = pd.read_csv(csv_file_path)
    
    # Validate columns exist
    required_cols = ["text", "label"]
    if not all(col in df_csv.columns for col in required_cols):
        raise ValueError(f"CSV must contain columns: {required_cols}")
    
    print(f"Loaded {len(df_csv)} records from {csv_file_path}")
    print(f"Label distribution:\n{df_csv['label'].value_counts()}\n")
    
    # Train/Test split
    X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(
        df_csv["text"], 
        df_csv["label"], 
        test_size=0.33, 
        random_state=42, 
        stratify=df_csv["label"]
    )
    
    # Create pipeline with ngram_range=(1,3)
    csv_pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(
            ngram_range=(1,3),
            token_pattern=r"(?u)\b\w+\b",
            sublinear_tf=True,
            min_df=1  # Adjust to 2-5 for larger datasets to remove rare terms
        )),
        ("model", LogisticRegression(max_iter=2000, class_weight='balanced'))
    ])
    
    # Train
    csv_pipeline.fit(X_train_csv, y_train_csv)
    
    # Evaluate
    pred_csv = csv_pipeline.predict(X_test_csv)
    print("Classification Report:")
    print(classification_report(y_test_csv, pred_csv, zero_division=0))
    
    # Create retrieval function for this dataset
    tfidf_csv = csv_pipeline.named_steps['tfidf']
    X_all_csv = tfidf_csv.transform(df_csv["text"])
    
    def search_csv(query: str, top_k: int = 3):
        qv = tfidf_csv.transform([query])
        sims = cosine_similarity(qv, X_all_csv).ravel()
        top_idx = np.argsort(-sims)[:top_k]
        return df_csv.loc[top_idx, ["text", "label"]].assign(similarity=sims[top_idx])
    
    return csv_pipeline, search_csv


In [29]:
model, search_func = train_from_csv("support_tickets.csv")

Loaded 50 records from support_tickets.csv
Label distribution:
label
auth         10
device       10
app          10
messaging    10
network      10
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

         app       1.00      0.25      0.40         4
        auth       0.14      0.33      0.20         3
      device       0.00      0.00      0.00         3
   messaging       1.00      0.50      0.67         4
     network       0.00      0.00      0.00         3

    accuracy                           0.24        17
   macro avg       0.43      0.22      0.25        17
weighted avg       0.50      0.24      0.29        17

