In [9]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter 

# Load the dataset
df = pd.read_csv("customer_complaints_1.csv")   

# Step 2: Use the Text column for clustering
dataset = df['text'].tolist()

# Step 3: Vectorize the dataset using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Step 4: Perform clustering
k = 5  # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])

# Print the table of predicted clusters
table_output = tabulate(table_data, headers="firstrow", tablefmt="grid")

# Get top terms per cluster
top_terms_per_cluster = []
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    cluster_terms = []
    for ind in order_centroids[i, :10]:
        cluster_terms.append(terms[ind])
    top_terms_per_cluster.append(f"Cluster {i}: {', '.join(cluster_terms)}")

# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

table_output, top_terms_per_cluster, purity

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:  # Print the top 10 terms for each cluster
        print(f' {terms[ind]}')
    print()

# Step 5: Evaluate results - Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]

purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)




Top terms per cluster:
Cluster 0:
 that
 is
 malfunction
 protocol
 investigating
 from
 since
 customer
 the
 their

Cluster 1:
 the
 to
 and
 it
 for
 mbps
 speed
 is
 was
 service

Cluster 2:
 rude
 was
 rep
 my
 me
 resolve
 cutting
 helpful
 tom
 charges

Cluster 3:
 the
 to
 and
 for
 you
 my
 it
 on
 internet
 contract

Cluster 4:
 to
 me
 although
 super
 actually
 excited
 promises
 promised
 pricing
 gave

Purity: 0.5263157894736842
