# Document Clustering Using Word2Vec and K-means

This notebook demonstrates how to cluster text documents using Word2Vec to generate word embeddings and K-means for clustering.

## Objectives
1. Clean and tokenize text data.
2. Train a Word2Vec model for word embeddings.
3. Generate document vectors by aggregating word vectors.
4. Apply Mini-batch K-means clustering on document vectors.
5. Evaluate and interpret clustering results.

## 1.0 Setup and Imports
Install the required libraries and import the necessary modules.

In [None]:
!pip install numpy pandas gensim scikit-learn nltk matplotlib

In [None]:
import os
import random
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources used below: the English stopword list and the
# Punkt tokenizer data behind word_tokenize. NLTK 3.9+ loads "punkt_tab"
# instead of the older "punkt" package, so request both to stay compatible
# with whichever NLTK version is installed.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# does not change hashing in the already-running kernel; it is only inherited
# by processes started afterwards. Set it before launching the kernel if
# hash-order-dependent results must be reproducible.
os.environ["PYTHONHASHSEED"] = str(SEED)

## 2.0 Load and Preprocess Data
Define functions to clean and tokenize text data, and load the dataset.

In [None]:
def clean_text(text, tokenizer, stopwords):
    """Normalise raw text and split it into filtered tokens.

    The input is converted to a lower-cased string, then a fixed sequence of
    regex substitutions is applied in order: square-bracketed spans are
    removed, whitespace runs collapse to one space, words ending in an
    ellipsis character (and stray ellipsis characters) are removed, hyphens
    between word characters become spaces, and ASCII punctuation is stripped.
    The cleaned string is passed to ``tokenizer``; tokens contained in
    ``stopwords`` or shorter than two characters are discarded.

    Args:
        text: Value to clean; coerced with ``str()``.
        tokenizer: Callable mapping a string to an iterable of tokens.
        stopwords: Container of tokens to drop.

    Returns:
        List of the remaining tokens, in tokenizer order.
    """
    # (pattern, replacement) pairs; the order matters and mirrors the pipeline
    # described in the docstring.
    substitutions = (
        (r"\[(.*?)\]", ""),
        (r"\s+", " "),
        (r"\w+…|…", ""),
        (r"(?<=\w)-(?=\w)", " "),
        (f"[{re.escape(string.punctuation)}]", ""),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and len(token) > 1
    ]

# Custom stopwords: NLTK's English list plus a few extra terms
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["title", "description", "content"]

# Load dataset
df_raw = pd.read_csv("data/news_data.csv")
df = df_raw.copy()

# Preprocess text.
# Replace missing values with "" in every text column *before* casting to str:
# astype(str) would otherwise turn NaN into the literal string "nan", which
# would then be tokenized as if it were a real word.
df[text_columns] = df[text_columns].fillna("").astype(str)

# One combined text field per row, then its cleaned token list.
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicates and nulls.
# np.unique returns the index of the first occurrence of each distinct token
# list, ordered by the sorted token lists, so the surviving rows are no longer
# in file order.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]
# Drop rows whose token list is empty and keep only the columns used downstream.
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

## 3.0 Train Word2Vec Model
Train a Word2Vec model using the preprocessed tokens.

In [None]:
# Train 100-dimensional embeddings on the tokenized corpus, using a single
# worker thread and a fixed seed.
model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    workers=1,
    seed=SEED,
)

# Test word similarity: nearest neighbours of a probe term in embedding space
similar_terms = model.wv.most_similar("trump")
print(similar_terms)

## 4.0 Generate Document Vectors
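Create one vector per document by averaging the embeddings of its tokens. Tokens that are missing from the Word2Vec vocabulary are skipped, and a document with no known tokens is represented by a zero vector.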

In [None]:
def vectorize(list_of_docs, model):
    """Generate one averaged embedding vector per document.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``token in model.wv`` and ``model.wv[token]``.

    Returns:
        A list with one 1-D array per document: the element-wise mean of the
        vectors of the tokens found in ``model.wv``, or a zero vector of
        length ``model.vector_size`` when none of the tokens is known.
    """
    word_vectors = model.wv
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # extra KeyError handling is required.
        vectors = [word_vectors[token] for token in tokens if token in word_vectors]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Build the fallback only when needed; a fresh array per document
            # keeps the fallbacks from sharing storage.
            features.append(np.zeros(model.vector_size))
    return features

# Embed every tokenized document with the trained model.
vectorized_docs = vectorize(list_of_docs=tokenized_docs, model=model)
# Cell output: (number of documents, length of each document vector)
(len(vectorized_docs), len(vectorized_docs[0]))

## 5.0 Apply K-means Clustering
Cluster the document vectors using Mini-batch K-means.

In [None]:
def mbkmeans_clusters(X, k, mb, print_silhouette_values, random_state=None):
    """Fit Mini-batch K-means and print Silhouette metrics.

    Args:
        X: Feature vectors to cluster, one per document.
        k: Number of clusters.
        mb: Mini-batch size.
        print_silhouette_values: When true, also print the mean silhouette
            value of every cluster.
        random_state: Forwarded to ``MiniBatchKMeans``. Pass an int to get the
            same clustering every time the function is called; the default
            ``None`` keeps the previous behaviour of drawing from NumPy's
            global random state.

    Returns:
        Tuple ``(km, labels)``: the fitted estimator and its ``labels_`` array.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia: {km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        for i in range(k):
            # Mean silhouette over the samples assigned to cluster i.
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            print(f"Cluster {i}: Avg:{cluster_silhouette_values.mean():.2f}")
    return km, labels

# Apply clustering.
# A fixed random_state keeps cluster ids stable across re-runs of this cell,
# which later cells rely on when they refer to a cluster by number.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
    random_state=SEED,
)

# One row per document: raw text, space-joined tokens, and assigned cluster id.
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

## 6.0 Evaluate Clusters
Qualitatively analyze clusters by reviewing representative tokens and documents.

In [None]:
# Representative tokens
# For every fitted centroid, list the vocabulary terms whose embeddings are
# most similar to it. Iterating over cluster_centers_ keeps this loop in sync
# with whatever number of clusters was actually fitted.
print("Most representative terms per cluster:")
for i, centroid in enumerate(clustering.cluster_centers_):
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    print(f"Cluster {i}: {', '.join([t[0] for t in most_representative])}")

In [None]:
# Representative documents for a cluster
# Rank all documents by Euclidean distance to the chosen centroid and show the
# three closest ones.
test_cluster = 29
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)
for doc_idx in most_representative_docs[:3]:
    print(docs[doc_idx])
    print("-------------")