In [4]:
from google.colab import files
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Upload dataset
uploaded = files.upload()
df = pd.read_csv("news_Feb_14.csv")
df.columns = [col.strip() for col in df.columns]
df = df.dropna(subset=['title'])  # Drop missing titles


Saving news_Feb_14.csv to news_Feb_14 (1).csv


In [6]:
# Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    return tokens

df["Processed_tokens"] = df["title"].apply(preprocess_text)

In [10]:
# Word2Vec Model (CBOW)
w2v_model = Word2Vec(sentences=df["Processed_tokens"], vector_size=500, window=50, sg=0, epochs=100)

# Function to get Word2Vec vector for each headline
def get_w2v_vector(tokens, model, vector_size=500):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)  # Average of word vectors
    else:
        return np.zeros(vector_size)  # Return zero vector if no words found

# Apply function to each row
df["W2V_Vector"] = df["Processed_tokens"].apply(lambda x: get_w2v_vector(x, w2v_model))

In [11]:
# Doc2Vec Model (DBOW)
tagged_data = [TaggedDocument(words=row, tags=[str(i)]) for i, row in enumerate(df["Processed_tokens"])]
d2v_model = Doc2Vec(vector_size=500, window=50, dm=0, epochs=100)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
df["D2V_Vector"] = df.index.to_series().apply(lambda i: d2v_model.dv[str(i)])


In [12]:
# Convert vectors into matrix for clustering
X_w2v = np.vstack(df["W2V_Vector"].values)
X_d2v = np.vstack(df["D2V_Vector"].values)


In [13]:
# K-Means Clustering (k=9)
k = 9
kmeans_w2v = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_d2v = KMeans(n_clusters=k, random_state=42, n_init=10)
df['Cluster_W2V'] = kmeans_w2v.fit_predict(X_w2v)
df['Cluster_D2V'] = kmeans_d2v.fit_predict(X_d2v)


In [14]:
# Evaluation
sil_w2v = silhouette_score(X_w2v, df['Cluster_W2V'])
sil_d2v = silhouette_score(X_d2v, df['Cluster_D2V'])
print(f"Silhouette Score (Word2Vec): {sil_w2v}")
print(f"Silhouette Score (Doc2Vec): {sil_d2v}")


Silhouette Score (Word2Vec): 0.6016495822287984
Silhouette Score (Doc2Vec): 0.3732086718082428


In [15]:
# Save clustered headlines
def save_clusters(file_name, column):
    with open(file_name, "w", encoding="utf-8") as f:
        for i in range(k):
            f.write(f"Cluster {i}:\n")
            f.write("-" * 50 + "\n")
            headlines = df[df[column] == i]["title"].tolist()
            for headline in headlines:
                f.write(headline + "\n")
            f.write("\n")

save_clusters("clustered_titles_w2v.txt", "Cluster_W2V")
save_clusters("clustered_titles_d2v.txt", "Cluster_D2V")
print("Clusters saved successfully!")

Clusters saved successfully!
