In [1]:
# NOTE(review): google.colab is presumably only importable inside a Colab runtime -- confirm
# before running this notebook elsewhere.
from google.colab import files


# Upload a local file into the runtime. The result is indexed by filename in the next cell
# (uploaded['news_Feb_14.csv']) and that value is read there as raw bytes via io.BytesIO.
uploaded = files.upload()

Saving news_Feb_14.csv to news_Feb_14.csv


In [40]:
import io
import pandas as pd

# Expose the uploaded CSV payload as an in-memory binary stream so pandas can parse it
# directly from the upload result.
csv_stream = io.BytesIO(uploaded['news_Feb_14.csv'])
df = pd.read_csv(csv_stream)
print(df)


                                                 title        date
0    ECC endorses purchase of $582mn capital shares...  14/02/2025
1    Netanyahu’s statement to establish Palestinian...  14/02/2025
2    India, US agree to resolve trade and tariff ro...  14/02/2025
3    Aurangzeb discusses Pakistan’s structural refo...  14/02/2025
4    HBL, S&P Global launch Pakistan’s first manufa...  14/02/2025
..                                                 ...         ...
448  SBP grants FPT clearance to Zia Ijaz as Askari...  14/02/2025
449  India’s Modi brings a tariff ‘gift’ to Trump t...  13/02/2025
450  Oil prices decline on optimism over potential ...  13/02/2025
451  Google partners with Poland to expand AI adopt...  13/02/2025
452  US regulator opens probe into 129,092 Honda ve...  13/02/2025

[453 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #Converts text into a bag-of-words representation.
                                                                             #Converts text into TF-IDF (Term Frequency-Inverse Document Frequency) representation.
from sklearn.decomposition import TruncatedSVD                               #TruncatedSVD: Performs Latent Semantic Analysis (LSA) by reducing dimensionality using Singular Value Decomposition (SVD).
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline                                   #Allows multiple transformations (e.g., TF-IDF followed by SVD) in a single step
from sklearn.preprocessing import Normalizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources this notebook relies on. The last call's return value is what
# the cell displays as its output.
nltk.download('stopwords')  # backs stopwords.words("english") used in preprocess_text
nltk.download('wordnet')  # backs WordNetLemmatizer used in preprocess_text
nltk.download('omw-1.4')  # presumably a WordNet companion resource needed by the lemmatizer -- TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Read the headlines CSV and discard every row that has a missing value.
df = pd.read_csv("news_Feb_14.csv").dropna()

# Lazily-built NLTK resources shared by every preprocess_text call, keyed by name.
# Re-running this cell resets the cache.
_PREPROCESS_CACHE = {}


def _cached_resource(key, factory):
    """Return the resource stored under `key`, building it with `factory()` on first use."""
    if key not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE[key] = factory()
    return _PREPROCESS_CACHE[key]


def preprocess_text(text, remove_stopwords=True, use_stemming=False, use_lemmatization=False):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased and split on whitespace; the optional steps are then
    applied in this order: stopword removal, stemming, lemmatization.

    Parameters
    ----------
    text : str
        Raw text (e.g. a news headline).
    remove_stopwords : bool, default True
        Drop tokens found in NLTK's English stopword list.
    use_stemming : bool, default False
        Reduce each token with the Porter stemmer (e.g. "running" -> "run").
    use_lemmatization : bool, default False
        Lemmatize each token with WordNet. ``lemmatize`` is called without a
        part-of-speech tag, so NLTK's default (noun) applies: plurals are
        normalised ("shares" -> "share") but adjectives/verbs are generally left
        alone ("better" stays "better").

    Returns
    -------
    str
        The processed tokens joined by single spaces.

    Notes
    -----
    Tokens are split on whitespace only, so attached punctuation is kept
    ("the," is not recognised as the stopword "the").
    """
    words = text.lower().split()

    if remove_stopwords:
        # Build the stopword set once instead of once per row: this function is
        # called for every headline through Series.apply.
        stop_words = _cached_resource(
            "stop_words", lambda: frozenset(stopwords.words("english"))
        )
        words = [word for word in words if word not in stop_words]

    if use_stemming:
        stemmer = _cached_resource("stemmer", PorterStemmer)
        words = [stemmer.stem(word) for word in words]

    if use_lemmatization:
        lemmatizer = _cached_resource("lemmatizer", WordNetLemmatizer)
        words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

In [None]:
# Preprocessing switches for this experiment run
# (stopword removal on, stemming off, lemmatization on).
remove_stopwords, use_stemming, use_lemmatization = True, False, True

# (1, 2): single words and two-word phrases are both used as features during vectorization.
# (1, 1) would restrict the features to uni-grams.
ngram_range = (1, 2)

# Clean every headline with the switches above and store the result next to the raw title.
df["processed_text"] = df["title"].apply(
    preprocess_text,
    remove_stopwords=remove_stopwords,
    use_stemming=use_stemming,
    use_lemmatization=use_lemmatization,
)


In [None]:
# Text-to-feature strategies compared in the experiment, keyed by the label that ends up
# in the "Vectorizer" column of the results table.
vectorizers = dict(
    # Matrix of raw term counts: an n-gram that appears twice in a document gets the value 2.
    CountVectorizer=CountVectorizer(ngram_range=ngram_range),
    # Term counts re-weighted by TF-IDF, which down-weights terms that occur in many documents.
    TFIDF=TfidfVectorizer(ngram_range=ngram_range),
    # LSA: TF-IDF features reduced to 3 components by TruncatedSVD, chained in one pipeline.
    LSA=make_pipeline(TfidfVectorizer(ngram_range=ngram_range), TruncatedSVD(n_components=3)),
    # Stand-alone 3-component SVD; the clustering loop applies it to the fitted TF-IDF matrix
    # rather than to raw text.
    TruncatedSVD=TruncatedSVD(n_components=3),
)

In [74]:
results = []
k_values = [13]
random_state = 26409  # Change to your ERP ID

# Label for the "N-Grams" results column, derived from the configured upper n-gram bound
# so the table cannot go stale when ngram_range is changed.
_NGRAM_NAMES = {1: "uni-grams", 2: "bi-grams", 3: "tri-grams"}
ngram_label = _NGRAM_NAMES.get(ngram_range[1], f"{ngram_range[1]}-grams")

for vec_name, vectorizer in vectorizers.items():
    # Seed every stochastic component of the vectorizer (TruncatedSVD, whether stand-alone or
    # nested in the LSA pipeline) so WSS / silhouette are reproducible across runs, matching
    # the seed already used for KMeans. Estimators without a random_state are left untouched.
    seed_params = {
        name: random_state
        for name in vectorizer.get_params()
        if name == "random_state" or name.endswith("__random_state")
    }
    if seed_params:
        vectorizer.set_params(**seed_params)

    if vec_name == "TruncatedSVD":
        # SVD is applied to the TF-IDF matrix of the processed text, not to the text itself.
        X = vectorizers["TFIDF"].fit_transform(df["processed_text"])
        X = vectorizer.fit_transform(X)  # Apply TruncatedSVD separately
    else:
        X = vectorizer.fit_transform(df["processed_text"])

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=10)
        labels = kmeans.fit_predict(X)

        wss = kmeans.inertia_  # within-cluster sum of squares reported by KMeans
        silhouette = silhouette_score(X, labels)

        # Field order must match the column list used when building results_df.
        results.append([k, vec_name, remove_stopwords, use_stemming, use_lemmatization, ngram_label, wss, silhouette])

In [75]:
# Tabulate the collected runs: one row per (vectorizer, k) combination, with columns in the
# same order as the fields appended to `results`.
result_columns = [
    "Clusters", "Vectorizer", "Stopwords", "Stemming",
    "Lemmatization", "N-Grams", "WSS", "Silhouette",
]
results_df = pd.DataFrame(data=results, columns=result_columns)
print(results_df)


   Clusters       Vectorizer  Stopwords  Stemming  Lemmatization   N-Grams  \
0        13  CountVectorizer       True     False           True  bi-grams   
1        13            TFIDF       True     False           True  bi-grams   
2        13              LSA       True     False           True  bi-grams   
3        13     TruncatedSVD       True     False           True  bi-grams   

           WSS  Silhouette  
0  6735.730287   -0.033216  
1   431.990878    0.004609  
2     0.481857    0.496570  
3     0.486733    0.511863  
