In [15]:
from google.colab import files


# Opens the Colab upload widget; the result is indexed by file name in the next cell.
# NOTE(review): the log below shows this upload was stored as "news_Feb_14 (2).csv",
# presumably because a same-named file already existed — so the stored name (and
# therefore the lookup key) can differ from run to run.
uploaded = files.upload()

Saving news_Feb_14.csv to news_Feb_14 (2).csv


In [16]:
import pandas as pd
import io

# Pick the uploaded news CSV by prefix/suffix instead of hard-coding the key
# 'news_Feb_14 (2).csv'. The " (2)" suffix only appears because this particular
# upload collided with an earlier one, so the literal key raises KeyError on a
# fresh runtime (no suffix) or after yet another re-upload (different suffix).
news_csv_names = [
    name
    for name in uploaded
    if name.startswith("news_Feb_14") and name.endswith(".csv")
]
if not news_csv_names:
    # Same exception type the original literal lookup produced, with a clearer message.
    raise KeyError(f"No 'news_Feb_14*.csv' file among the uploads: {list(uploaded)}")

# The upload maps file name -> raw bytes; wrap the bytes so pandas can read them.
df = pd.read_csv(io.BytesIO(uploaded[news_csv_names[0]]))
print(df)


                                                 title        date
0    ECC endorses purchase of $582mn capital shares...  14/02/2025
1    Netanyahu’s statement to establish Palestinian...  14/02/2025
2    India, US agree to resolve trade and tariff ro...  14/02/2025
3    Aurangzeb discusses Pakistan’s structural refo...  14/02/2025
4    HBL, S&P Global launch Pakistan’s first manufa...  14/02/2025
..                                                 ...         ...
448  SBP grants FPT clearance to Zia Ijaz as Askari...  14/02/2025
449  India’s Modi brings a tariff ‘gift’ to Trump t...  13/02/2025
450  Oil prices decline on optimism over potential ...  13/02/2025
451  Google partners with Poland to expand AI adopt...  13/02/2025
452  US regulator opens probe into 129,092 Honda ve...  13/02/2025

[453 rows x 2 columns]


In [24]:
import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [28]:
import nltk
# Download the 'punkt_tab' package (the log below shows it unpacked under tokenizers/).
# NOTE(review): presumably the data nltk.word_tokenize needs in the tokenization cell — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [32]:
# Load Dataset
# Read the headlines CSV from the working directory and discard every row that
# contains a missing value, in one expression (no in-place mutation).
df = pd.read_csv("news_Feb_14.csv").dropna()

In [33]:
# Tokenize text
# Each title is split into a list of tokens; the lists are stored in a new
# "tokenized" column that the embedding cells below consume.
df["tokenized"] = df["title"].map(nltk.word_tokenize)

In [55]:
# Word2Vec Parameters
# The same three values are also passed to the Doc2Vec models and are recorded
# in every row of the clustering results table.
vector_size = 1000  # Embedding dimension (length of every word/document vector)
window = 100  # Context window size passed to the models as `window`
epochs = 200  # Training epochs passed to the models as `epochs`

In [56]:
# Train Word2Vec (CBoW and Skip-gram)
# Both models share the corpus and hyper-parameters; only the `sg` flag differs
# (sg=0 for the CBoW model, sg=1 for the Skip-gram model).
# NOTE(review): no `seed` is passed, so embeddings presumably vary between runs —
# confirm whether reproducibility is required.
w2v_shared_params = dict(
    sentences=df["tokenized"],
    vector_size=vector_size,
    window=window,
    epochs=epochs,
)
w2v_cbow = Word2Vec(sg=0, **w2v_shared_params)
w2v_skipgram = Word2Vec(sg=1, **w2v_shared_params)


In [57]:
# Compute Word2Vec Averaged Embeddings
def average_word2vec(model, tokens, vector_size):
    """Return the element-wise mean of the word vectors for ``tokens``.

    Tokens that are not present in ``model.wv`` are ignored. If no token is
    known to the model (including an empty ``tokens``), a zero vector of
    length ``vector_size`` is returned instead.
    """
    known_words = []
    for word in tokens:
        if word in model.wv:
            known_words.append(word)

    # Guard clause: nothing to average, fall back to the all-zero vector.
    if not known_words:
        return np.zeros(vector_size)

    word_vectors = model.wv[known_words]
    return np.mean(word_vectors, axis=0)

# Store one averaged title vector per row for each Word2Vec model, under the
# column name that the clustering cell reads later.
for embedding_column, w2v_model in (("w2v_cbow", w2v_cbow), ("w2v_skipgram", w2v_skipgram)):
    # Bind the current model as a default argument so the lambda does not depend
    # on the loop variable after the iteration moves on.
    df[embedding_column] = df["tokenized"].apply(
        lambda tokens, model=w2v_model: average_word2vec(model, tokens, vector_size)
    )


In [58]:
# Train Doc2Vec (DM and DBOW)
# Every tokenized title is tagged with its 0-based position in the frame, so the
# trained document vector can be fetched back with that same position below.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(df["tokenized"])
]

# Both models share the hyper-parameters; only the `dm` flag differs
# (dm=1 for the DM model, dm=0 for the DBOW model).
d2v_shared_params = dict(vector_size=vector_size, window=window, epochs=epochs)
d2v_dm = Doc2Vec(documents, dm=1, **d2v_shared_params)
d2v_dbow = Doc2Vec(documents, dm=0, **d2v_shared_params)

# Look the vectors up by tag (= row position) and store them row by row.
row_positions = range(len(df))
df["d2v_dm"] = [d2v_dm.dv[position] for position in row_positions]
df["d2v_dbow"] = [d2v_dbow.dv[position] for position in row_positions]


In [59]:
# Clustering Parameters
# Cluster counts tried by perform_kmeans: one K-Means fit per value.
k_values = [5, 9, 13]
# Seed handed to KMeans(random_state=...) inside perform_kmeans.
random_state = 26409  # Replace with your ERP ID

# Function for Clustering and Evaluation
def perform_kmeans(X, method):
    """Fit K-Means on ``X`` once per value in ``k_values`` and score each fit.

    Parameters
    ----------
    X : array-like
        Embedding matrix with one row per document. A NumPy array or a nested
        list is accepted.
    method : str
        Label copied into the first field of every result row.

    Returns
    -------
    list of list
        One row per k, laid out as
        ``[method, vector_size, window, epochs, k, wss, silhouette]`` where
        ``wss`` is the fitted model's ``inertia_`` and ``silhouette`` is the
        value returned by ``silhouette_score`` for the predicted labels.

    Notes
    -----
    Reads the notebook-level names ``k_values``, ``random_state``,
    ``vector_size``, ``window`` and ``epochs``; they must be defined before
    this function is called.
    """
    # Convert once instead of calling X.tolist() twice per k. float64 matches
    # what the Python-float lists produced, so the numbers fed to scikit-learn
    # are unchanged, and plain lists (which have no .tolist()) now work too.
    features = np.asarray(X, dtype=np.float64)

    results = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=10)
        labels = kmeans.fit_predict(features)
        wss = kmeans.inertia_
        silhouette = silhouette_score(features, labels)
        results.append([method, vector_size, window, epochs, k, wss, silhouette])
    return results

In [60]:
# Run K-Means Clustering and Collect Results
# Maps each embedding column in `df` to the method label reported for it; the
# insertion order below is the row order of the results table.
embedding_methods = {
    "w2v_cbow": "Word2Vec-CBoW",
    "w2v_skipgram": "Word2Vec-Skipgram",
    "d2v_dm": "Doc2Vec-DM",
    "d2v_dbow": "Doc2Vec-DBOW",
}

results = []
for embedding_column, method_label in embedding_methods.items():
    # Stack the per-row vectors into one 2-D matrix before clustering.
    embedding_matrix = np.vstack(df[embedding_column].values)
    results.extend(perform_kmeans(embedding_matrix, method_label))


In [61]:
# Convert results to DataFrame
# Column names follow the field order of the rows produced by perform_kmeans.
result_columns = [
    "Method",
    "Vector Size",
    "Window",
    "Epochs",
    "Clusters",
    "WSS",
    "Silhouette",
]
results_df = pd.DataFrame(data=results, columns=result_columns)
print(results_df)

               Method  Vector Size  Window  Epochs  Clusters         WSS  \
0       Word2Vec-CBoW         1000     100     200         5  586.990976   
1       Word2Vec-CBoW         1000     100     200         9  462.023051   
2       Word2Vec-CBoW         1000     100     200        13  383.292558   
3   Word2Vec-Skipgram         1000     100     200         5  641.514059   
4   Word2Vec-Skipgram         1000     100     200         9  542.495808   
5   Word2Vec-Skipgram         1000     100     200        13  472.750942   
6          Doc2Vec-DM         1000     100     200         5  186.417377   
7          Doc2Vec-DM         1000     100     200         9  144.025177   
8          Doc2Vec-DM         1000     100     200        13  119.499450   
9        Doc2Vec-DBOW         1000     100     200         5  148.087497   
10       Doc2Vec-DBOW         1000     100     200         9  114.769947   
11       Doc2Vec-DBOW         1000     100     200        13   93.549924   

    Silhoue