In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import logging

import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dcyw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dcyw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dcyw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load Data
df = pd.read_csv(r"C:\Users\dcyw\combined_dfv4.csv", dtype=str)  # Update with your file path
text_data = df['Combined Text'].dropna()

# Clustering via k-means

In [4]:
# Logging configuration
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Preprocessing
mystopwords = stopwords.words("english") + ['one', 'become', 'get', 'make', 'take']
WNlemma = WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens = [WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens = [t for t in tokens if t not in mystopwords]
    tokens = [t for t in tokens if len(t) >= 3]  # Keep words with 3+ characters
    return " ".join(tokens)

# Apply preprocessing
toks = text_data.apply(pre_process)

# Create tf-idf matrix
vectorizer = TfidfVectorizer(max_df=0.7, max_features=2500, min_df=3, stop_words=mystopwords, use_idf=True)
X = vectorizer.fit_transform(toks)

# K-Means Clustering
km3 = KMeans(n_clusters=3, init='k-means++', max_iter=2000, random_state=5)
km3.fit(X)

# Evaluate the 3 clusters using silhouette score
print("Silhouette Coefficient for 3 clusters: %0.3f" % metrics.silhouette_score(X, km3.labels_))

# Cluster sizes
labels, counts = np.unique(km3.labels_[km3.labels_ >= 0], return_counts=True)
print("Cluster Labels:", labels)
print("Cluster Counts:", counts)

# Print top terms in each cluster
def print_terms(cm, num):
    original_space_centroids = cm.cluster_centers_
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for i in range(num):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

print_terms(km3, 3)

# SVD dimensionality reduction
svd = TruncatedSVD(300)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(X)

# Apply KMeans again after SVD
km3_svd = KMeans(n_clusters=3, init='k-means++', max_iter=1000, n_init=1)
km3_svd.fit(X_lsa)

print("Silhouette Coefficient for 3 clusters after SVD: %0.3f" % metrics.silhouette_score(X_lsa, km3_svd.labels_))

# Cluster sizes after SVD
labels_svd, counts_svd = np.unique(km3_svd.labels_[km3_svd.labels_ >= 0], return_counts=True)
print("Cluster Labels after SVD:", labels_svd)
print("Cluster Counts after SVD:", counts_svd)

# Print top terms in each cluster after SVD
def print_SVD_terms(cm, num):
    original_space_centroids = svd.inverse_transform(cm.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for i in range(num):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

print_SVD_terms(km3_svd, 3)


  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Coefficient for 3 clusters: 0.007
Cluster Labels: [0 1 2]
Cluster Counts: [ 6863 53263 36654]
Cluster 0: good quality ink price printer cartridge product easy print expensive
Cluster 1: ink cartridge easy great print use product expensive color time
Cluster 2: printer ink cartridge work print use new time would buy
Silhouette Coefficient for 3 clusters after SVD: 0.010
Cluster Labels after SVD: [0 1 2]
Cluster Counts after SVD: [46894 17112 32774]
Cluster 0: ink cartridge good easy product great price use expensive work
Cluster 1: print color ink cartridge printer black quality printing page good
Cluster 2: printer ink cartridge use work new time would buy like
