In [1]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [2]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

In [4]:
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])

# Step 2: Convert to lowercase
def convert_to_lowercase(text):
    return text.lower()

# Step 3: Tokenization
def tokenize_text(text):
    return word_tokenize(text)

# Step 4: Remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# Step 5: Stemming (Optional)
def perform_stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [5]:
preprocessed_dataset = []
for document in dataset:
    document = remove_punctuation(document)
    document = convert_to_lowercase(document)
    tokens = tokenize_text(document)
    tokens = remove_stopwords(tokens)
    # You can choose to perform stemming here if needed
    # tokens = perform_stemming(tokens)
    preprocessed_dataset.append(tokens)

In [6]:
for i, document in enumerate(preprocessed_dataset, 1):
    print(f"Document {i}: {document}")

Document 1: ['love', 'playing', 'football', 'weekends']
Document 2: ['enjoy', 'hiking', 'camping', 'mountains']
Document 3: ['like', 'read', 'books', 'watch', 'movies']
Document 4: ['prefer', 'playing', 'video', 'games', 'sports']
Document 5: ['love', 'listening', 'music', 'going', 'concerts']


In [7]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)

In [8]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in dataset])

In [9]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

  super()._check_params_vs_input(X, default_n_init=10)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1


In [10]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6
