In [3]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [5]:
# Toy corpus: five short English sentences about hobbies, one document each.
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

In [7]:
def convert_to_lowercase(text):
    """Return ``text`` lower-cased.

    Non-string values are returned unchanged, matching the other cleaning
    helpers in this notebook (they all pass non-strings through untouched).

    Args:
        text: The value to normalise; expected to be a ``str``.

    Returns:
        The lower-cased string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    return text.lower()
# Lower-case every document, then list the result with 1-based numbering.
dataset = list(map(convert_to_lowercase, dataset))
for position, sentence in enumerate(dataset, 1):
    print(f"{position}: {sentence}")

1: i love playing football on the weekends
2: i enjoy hiking and camping in the mountains
3: i like to read books and watch movies
4: i prefer playing video games over sports
5: i love listening to music and going to concerts


In [9]:
import re
import string

def remove_punctuation(text):
    """Replace punctuation and line breaks with spaces and tidy the whitespace.

    Every character of ``string.punctuation`` except the apostrophe is turned
    into a space; runs of whitespace are then collapsed to one space and the
    ends are trimmed. Non-string input is returned unchanged.

    Args:
        text: The value to clean; expected to be a ``str``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if isinstance(text, str):
        # Apostrophes are kept so contractions such as "it's" stay intact.
        strippable = string.punctuation.replace("'", "")
        punctuation_class = re.compile(r"[{}]".format(re.escape(strippable)))

        without_breaks = re.compile(r'[\n\r]+').sub(' ', text)
        without_marks = punctuation_class.sub(" ", without_breaks)
        return re.compile(r'\s+').sub(' ', without_marks).strip()
    return text

# Strip punctuation from every document and show the numbered result.
dataset = list(map(remove_punctuation, dataset))

for position, sentence in enumerate(dataset, 1):
    print(f"{position}: {sentence}")

1: i love playing football on the weekends
2: i enjoy hiking and camping in the mountains
3: i like to read books and watch movies
4: i prefer playing video games over sports
5: i love listening to music and going to concerts


In [11]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """Drop stop words from a whitespace-separated string.

    Membership is tested case-insensitively against the module-level
    ``stop_words`` collection, which must exist by the time this is called.
    Non-string input is returned unchanged.

    Args:
        text: The value to filter; expected to be a ``str``.

    Returns:
        The remaining words joined by single spaces, or ``text`` itself when
        it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    kept = [token for token in text.split() if token.lower() not in stop_words]
    return ' '.join(kept)
# Load the English stop-word list. On an environment where the NLTK
# "stopwords" corpus has not been downloaded yet, the lookup raises
# LookupError; fetch the corpus once and retry so the notebook still runs
# from a fresh kernel.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk

    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Remove stop words from every document
dataset = [remove_stopwords(text) for text in dataset]

# Print the cleaned dataset
for i, text in enumerate(dataset, start=1):
    print(f"{i}: {text}")

1: love playing football weekends
2: enjoy hiking camping mountains
3: like read books watch movies
4: prefer playing video games sports
5: love listening music going concerts


In [13]:
def remove_suffixes(text):
    """Strip a trailing "ing" or "es" from each word of ``text``.

    This is a deliberately crude stemmer: "ing" is removed from words longer
    than four characters, otherwise "es" is removed from words longer than
    three characters. No dictionary check is made, so results such as
    "movies" -> "movi" are expected. Non-string input is returned unchanged.

    Args:
        text: The value to process; expected to be a ``str``.

    Returns:
        The processed words joined by single spaces, or ``text`` itself when
        it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    def strip_suffix(token):
        # The length floors keep short words such as "king" or "yes" intact.
        if len(token) > 4 and token.endswith("ing"):
            return token[:-3]
        if len(token) > 3 and token.endswith("es"):
            return token[:-2]
        return token

    return ' '.join(map(strip_suffix, text.split()))

# Suffix stripping runs on the output of the earlier cleaning steps.
dataset = list(map(remove_suffixes, dataset))

# Show the numbered documents after suffix stripping.
for position, sentence in enumerate(dataset, 1):
    print(f"{position}: {sentence}")

1: love play football weekends
2: enjoy hik camp mountains
3: like read books watch movi
4: prefer play video gam sports
5: love listen music go concerts


In [15]:
!pip install autocorrect

from autocorrect import Speller

# One English spell-checker instance, reused for every word.
spell = Speller(lang='en')


def autocorrect_text(text):
    """Run each whitespace-separated word of ``text`` through ``spell``.

    Non-string input is returned unchanged.

    Args:
        text: The value to correct; expected to be a ``str``.

    Returns:
        The corrected words joined by single spaces, or ``text`` itself when
        it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    corrected = [spell(token) for token in text.split()]
    return ' '.join(corrected)

# Spell-correct every document.
dataset = list(map(autocorrect_text, dataset))

# Show the numbered documents under a heading.
print("\nAutocorrected dataset:")
for position, sentence in enumerate(dataset, 1):
    print(f"{position}: {sentence}")


Autocorrected dataset:
1: love play football weekends
2: enjoy his camp mountains
3: like read books watch move
4: prefer play video gam sports
5: love listen music go concerts


In [17]:
# Split each cleaned document into tokens; Word2Vec expects a list of token lists.
tokenized_dataset = [doc.split() for doc in dataset]

# Train word embeddings on the corpus itself. min_count=1 keeps every word.
# A fixed seed plus a single worker thread makes training repeatable;
# multi-threaded training is not deterministic even with a seed.
# NOTE(review): gensim also documents PYTHONHASHSEED as affecting full
# determinism - confirm whether that matters for this environment.
word2vec_model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

In [19]:
def _document_vector(tokens, keyed_vectors):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Args:
        tokens: Iterable of word strings for one document.
        keyed_vectors: The trained model's ``wv`` lookup.

    Returns:
        A 1-D array of length ``keyed_vectors.vector_size``. A document with
        no in-vocabulary token maps to the zero vector rather than to NaN, so
        every row of the feature matrix has the same shape.
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(known, axis=0)


# One row per document: the mean of its word vectors.
X = np.array([_document_vector(doc.split(), word2vec_model.wv) for doc in dataset])

In [21]:
k = 2  # Define the number of clusters

# random_state pins the centroid initialisation so cluster ids are the same on
# every run; n_init is set explicitly because its default differs between
# scikit-learn versions.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)

# Predict the cluster for each document
y_pred = km.predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([doc, cluster] for doc, cluster in zip(dataset, y_pred))
print(tabulate(table_data, headers="firstrow"))

Document                         Predicted Cluster
-----------------------------  -------------------
love play football weekends                      0
enjoy his camp mountains                         1
like read books watch move                       0
prefer play video gam sports                     0
love listen music go concerts                    1




In [23]:
def purity_score(labels_true, labels_pred):
    """Return the clustering purity of ``labels_pred`` against ``labels_true``.

    Purity is the number of documents carrying the majority true label of
    their cluster, summed over clusters and divided by the document count.

    Args:
        labels_true: Sequence of ground-truth class labels, one per document.
        labels_pred: Sequence of cluster ids, aligned with ``labels_true``.

    Returns:
        A float in (0, 1].

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    if len(labels_true) != len(labels_pred):
        raise ValueError("labels_true and labels_pred must have the same length")
    if len(labels_pred) == 0:
        raise ValueError("purity is undefined for an empty clustering")
    true_counts_per_cluster = {}
    for truth, cluster in zip(labels_true, labels_pred):
        true_counts_per_cluster.setdefault(cluster, Counter())[truth] += 1
    majority_total = sum(max(counts.values()) for counts in true_counts_per_cluster.values())
    return majority_total / len(labels_pred)


# This notebook defines no ground-truth labels, so purity cannot be computed
# here. Counting y_pred alone only yields cluster sizes, so the value below is
# the share of documents in the largest cluster. Once labels exist, use
# purity_score(true_labels, y_pred) instead.
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]  # cluster sizes, not true-label counts
largest_cluster_share = max(cluster_label_counts[0].values()) / total_samples
# Legacy name kept so any later cell that reads it keeps running; it does NOT
# hold a purity value.
purity = largest_cluster_share
print("Largest cluster share (not purity; no ground-truth labels):", largest_cluster_share)


Purity: 0.6
