# TEXT CLUSTERING USING TF-IDF VECTORIZER

### Without Text Preprocessing

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [2]:
data = pd.read_csv('customer_complaints_1.csv')
data = data['text']
data

0     I used to love Comcast. Until all these consta...
1     I'm so over Comcast! The worst internet provid...
2     If I could give them a negative star or no sta...
3     I've had the worst experiences so far since in...
4     Check your contract when you sign up for Comca...
5     Thank God. I am changing to Dish. They gave me...
6     I Have been a long time customer and only have...
7     There is a malfunction on the DVR manager whic...
8     Charges overwhelming. Comcast service rep was ...
9     I have had cable, DISH, and U-verse, etc. in t...
10    Had them from 2014 to now. I'd tell new custom...
11    Disappointed. I have been a Comcast/Xfinity cu...
12    These people are unethical and disturbing obli...
13    Unplanned, unexpected, all day outages, rude s...
14    BE WARNED. You will have 10$ hidden fees when ...
15    Had Comcast. Overall the terrible experience e...
16    When I called the infinity customer service ce...
17    Outraged, that it should not take a month 

In [3]:
# Vectorize the dataset to see the coordinat
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

In [4]:
# Perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [5]:
# Predict the clusters for each document
y_pred = km.predict(X)
y_pred

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [6]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(data, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
 print("Cluster %d:" % i)
 for ind in order_centroids[i, :10]:
     print(' %s' % terms[ind])
 print()


Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [7]:
# evaluate result

# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.7894736842105263


### With Text Preprocessing

In [8]:
data

0     I used to love Comcast. Until all these consta...
1     I'm so over Comcast! The worst internet provid...
2     If I could give them a negative star or no sta...
3     I've had the worst experiences so far since in...
4     Check your contract when you sign up for Comca...
5     Thank God. I am changing to Dish. They gave me...
6     I Have been a long time customer and only have...
7     There is a malfunction on the DVR manager whic...
8     Charges overwhelming. Comcast service rep was ...
9     I have had cable, DISH, and U-verse, etc. in t...
10    Had them from 2014 to now. I'd tell new custom...
11    Disappointed. I have been a Comcast/Xfinity cu...
12    These people are unethical and disturbing obli...
13    Unplanned, unexpected, all day outages, rude s...
14    BE WARNED. You will have 10$ hidden fees when ...
15    Had Comcast. Overall the terrible experience e...
16    When I called the infinity customer service ce...
17    Outraged, that it should not take a month 

In [9]:
# check if the text has duplicate or not
data.is_unique

True

In [10]:
# import library
import re
import string

In [11]:
# remove any number, tag and lowercases as well
def get_cleaned_textdata(sentence):
    modified_sentence = re.sub(r'<.*?>',' ', sentence)
    modified_sentence = ''.join([i if i not in string.punctuation else ' ' for i in modified_sentence])
    modified_sentence = re.sub(r'\d+', ' ', modified_sentence)
    modified_sentence = re.sub(r'\s+', ' ', modified_sentence)
    modified_sentence = modified_sentence.lower()
    return modified_sentence

In [12]:
data_clean = data.apply(get_cleaned_textdata)

In [13]:
print(data_clean)

0     i used to love comcast until all these constan...
1     i m so over comcast the worst internet provide...
2     if i could give them a negative star or no sta...
3     i ve had the worst experiences so far since in...
4     check your contract when you sign up for comca...
5     thank god i am changing to dish they gave me a...
6     i have been a long time customer and only have...
7     there is a malfunction on the dvr manager whic...
8     charges overwhelming comcast service rep was s...
9     i have had cable dish and u verse etc in the p...
10    had them from to now i d tell new customers to...
11    disappointed i have been a comcast xfinity cus...
12    these people are unethical and disturbing obli...
13    unplanned unexpected all day outages rude serv...
14    be warned you will have hidden fees when you s...
15    had comcast overall the terrible experience ev...
16    when i called the infinity customer service ce...
17    outraged that it should not take a month t

In [14]:
# import nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SW01081339\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SW01081339\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SW01081339\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
#tokenization
data_clean = data_clean.apply(word_tokenize)

In [16]:
data_clean

0     [i, used, to, love, comcast, until, all, these...
1     [i, m, so, over, comcast, the, worst, internet...
2     [if, i, could, give, them, a, negative, star, ...
3     [i, ve, had, the, worst, experiences, so, far,...
4     [check, your, contract, when, you, sign, up, f...
5     [thank, god, i, am, changing, to, dish, they, ...
6     [i, have, been, a, long, time, customer, and, ...
7     [there, is, a, malfunction, on, the, dvr, mana...
8     [charges, overwhelming, comcast, service, rep,...
9     [i, have, had, cable, dish, and, u, verse, etc...
10    [had, them, from, to, now, i, d, tell, new, cu...
11    [disappointed, i, have, been, a, comcast, xfin...
12    [these, people, are, unethical, and, disturbin...
13    [unplanned, unexpected, all, day, outages, rud...
14    [be, warned, you, will, have, hidden, fees, wh...
15    [had, comcast, overall, the, terrible, experie...
16    [when, i, called, the, infinity, customer, ser...
17    [outraged, that, it, should, not, take, a,

In [17]:
stopwords = nltk.corpus.stopwords.words('english')

In [18]:
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

In [19]:
#remove stopwords
data_clean = data_clean.apply(remove_stopwords)

In [20]:
data_clean

0     [used, love, comcast, constant, updates, inter...
1     [comcast, worst, internet, provider, taking, o...
2     [could, give, negative, star, stars, review, w...
3     [worst, experiences, far, since, install, noth...
4     [check, contract, sign, comcast, advertised, o...
5     [thank, god, changing, dish, gave, awesome, pr...
6     [long, time, customer, xfinity, isp, local, wa...
7     [malfunction, dvr, manager, preventing, us, ad...
8     [charges, overwhelming, comcast, service, rep,...
9     [cable, dish, u, verse, etc, past, eh, know, c...
10    [tell, new, customers, run, nowhere, run, trie...
11    [disappointed, comcast, xfinity, customer, alm...
12    [people, unethical, disturbing, oblivious, cus...
13    [unplanned, unexpected, day, outages, rude, se...
14    [warned, hidden, fees, sign, service, charge, ...
15    [comcast, overall, terrible, experience, every...
16    [called, infinity, customer, service, center, ...
17    [outraged, take, month, get, internet, ser

In [21]:
# porter_stemmer = PorterStemmer()

In [22]:
# def porter_stemming(text):
#     stem_text = []
#     for word in text:
#         stemmed_word = porter_stemmer.stem(word)
#         stem_text.append(stemmed_word)
#     return stem_text

In [23]:
# data_clean = data_clean.apply(porter_stemming)

In [24]:
# data_clean

In [25]:
# wordnet_lemmatizer = WordNetLemmatizer()

In [26]:
# #defining the function for lemmatization
# def lemmatizer(text):
#     lemm_text = []
#     for word in text:
#         lemmatized_word = wordnet_lemmatizer.lemmatize(word)
#         lemm_text.append(lemmatized_word)
#     return lemm_text

In [27]:
# data_clean = data_clean.apply(lemmatizer)

In [28]:
# data_clean

In [29]:
def combine_tokens(tokens):
    return ' '.join(tokens)

# Apply the function to each row of the 'text' column
data_clean = data_clean.apply(combine_tokens)

In [30]:
data_clean

0     used love comcast constant updates internet ca...
1     comcast worst internet provider taking online ...
2     could give negative star stars review would ne...
3     worst experiences far since install nothing pr...
4     check contract sign comcast advertised offers ...
5     thank god changing dish gave awesome pricing s...
6     long time customer xfinity isp local walmart n...
7     malfunction dvr manager preventing us adding r...
8     charges overwhelming comcast service rep ignor...
9     cable dish u verse etc past eh know comcast ta...
10    tell new customers run nowhere run tried turne...
11    disappointed comcast xfinity customer almost t...
12    people unethical disturbing oblivious customer...
13    unplanned unexpected day outages rude service ...
14    warned hidden fees sign service charge extra l...
15    comcast overall terrible experience everyone e...
16    called infinity customer service center compla...
17    outraged take month get internet service c

In [31]:
# Vectorize the dataset to see the coordinat
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_clean)

In [32]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [33]:
# Predict the clusters for each document
y_pred = km.predict(X)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1])

In [34]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(data_clean, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
 print("Cluster %d:" % i)
 for ind in order_centroids[i, :10]:
     print(' %s' % terms[ind])
 print()

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [35]:
# evaluate result

# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8421052631578947


# TEXT CLUSTERING USING WORD2VEC VECTORIZER

### Without text preprocessing

In [36]:
#import libraries
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [37]:
# train word2vec model
tokenized_dataset = [doc.split() for doc in data]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4)

In [38]:
#create document embeddings
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in data])

In [39]:
#perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [40]:
# Predict the clusters for each document
y_pred = km.predict(X)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])

In [41]:
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(data, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [42]:
#evaluate
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.5263157894736842


### With text preprocessing

In [43]:
# train word2vec model
tokenized_dataset = [doc.split() for doc in data_clean]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4)

In [44]:
#create document embeddings
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in data_clean])

In [45]:
#perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [46]:
# Predict the clusters for each document
y_pred = km.predict(X)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [47]:
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(data_clean, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [48]:
#evaluate
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.9473684210526315
