In [14]:
# Import necessary libraries
import pandas as pd

# Load the BBC news dataset (adjust the path when running outside Kaggle)
df = pd.read_csv('/kaggle/input/bbcdata/bbc_news.csv')  # replace with your dataset path

# Notebook-style inspection: preview the first rows and list the column names
df.head()
df.columns

# Description-only text, kept for inspection; it is superseded just below
texts = df['description'].values  # numpy array of all articles
texts[:5]

# Combine title and description into the text used for clustering.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, and a non-string entry would make the regex-based preprocess()
# step raise a TypeError.
texts = (df['title'].fillna('') + " " + df['description'].fillna('')).values

# Check first 5 combined texts
texts[:5]
# Import libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below (stop words, WordNet, multilingual WordNet)
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stop-word set and the WordNet lemmatizer shared by preprocess()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Matches every non-word character (\W: punctuation, whitespace, symbols) and
# every digit (\d). Underscores are word characters, so they are kept.
_NON_WORD_OR_DIGIT_RE = re.compile(r'[\W\d]')


def preprocess(text):
    """Normalise one article into a space-separated string of lemmas.

    Steps: replace non-word characters and digits with spaces, lowercase,
    split on whitespace, drop words found in the module-level ``stop_words``
    set, and lemmatize the remaining words with the module-level
    ``lemmatizer``.

    Args:
        text: The raw article text. Any non-string value (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty document instead of raising ``TypeError``.

    Returns:
        The cleaned text as a single string; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # Substitute first, then lowercase, matching the original order of steps
    cleaned = _NON_WORD_OR_DIGIT_RE.sub(' ', text).lower()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    )

# Run every combined article text through preprocess()
processed_texts = list(map(preprocess, texts))

# Peek at the first 5 cleaned articles
processed_texts[:5]
# Import TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap on the vocabulary size (can be raised, e.g. to 2000, if needed)
MAX_FEATURES = 1000

# Build the TF-IDF vectorizer and turn the cleaned texts into a
# document-term matrix in one step
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X = vectorizer.fit_transform(processed_texts)

# Report the matrix dimensions
print(X.shape)

# Import KMeans
from sklearn.cluster import KMeans

# Number of topics to look for,
# e.g. 5 topics: business, tech, sports, politics, entertainment
num_clusters = 5

# Create the model and fit it on the TF-IDF vectors in a single expression
# (fit() hands back the estimator itself); the seed is pinned via random_state
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X)

# Cluster id assigned to each article
labels = kmeans.labels_

# Peek at the first 10 assignments
labels[:10]

# Vocabulary terms, indexed by TF-IDF column
terms = vectorizer.get_feature_names_out()
# Per centroid: column indices ordered from largest to smallest weight
# (ascending argsort, reversed along the feature axis)
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Show the 10 highest-weighted terms of every cluster
for i, ranked_columns in enumerate(order_centroids[:num_clusters]):
    print(f"Cluster {i}:")
    print([terms[col] for col in ranked_columns[:10]])
    print()
# Hand-picked topic names, listed in cluster-id order (0 through 4)
cluster_names = dict(enumerate([
    'UK News / Politics',
    'War / Ukraine-Russia',
    'Sports / Football',
    'Human-interest / Daily News',
    'Crime / Police Reports',
]))

# Attach each article's cluster id and the matching topic name to the dataframe
df['cluster'] = kmeans.labels_
df['cluster_name'] = df['cluster'].map(cluster_names)

# Preview the first 5 rows
df.head(5)
# For every topic name (in order of first appearance), print its first
# 3 articles
for cluster in df['cluster_name'].unique():
    print(f"\nCluster: {cluster}\n" + "-"*50)
    in_cluster = df['cluster_name'] == cluster
    preview = df.loc[in_cluster, ['title', 'description']].head(3)
    for _, row in preview.iterrows():
        print(f"Title: {row['title']}")
        print(f"Description: {row['description']}\n")

# Write the labelled dataframe to disk, without the index column
df.to_csv('bbc_clustered.csv', index=False)























[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


(42115, 1000)




Cluster 0:
['say', 'uk', 'people', 'new', 'bbc', 'day', 'election', 'could', 'first', 'one']

Cluster 1:
['ukraine', 'war', 'russia', 'russian', 'ukrainian', 'putin', 'say', 'invasion', 'kyiv', 'attack']

Cluster 2:
['world', 'england', 'cup', 'win', 'final', 'league', 'beat', 'champion', 'manchester', 'first']

Cluster 3:
['year', 'old', 'two', 'say', 'last', 'first', 'new', 'uk', 'ago', 'bbc']

Cluster 4:
['police', 'officer', 'say', 'man', 'met', 'woman', 'murder', 'arrested', 'found', 'arrest']


Cluster: War / Ukraine-Russia
--------------------------------------------------
Title: Ukraine: Angry Zelensky vows to punish Russian atrocities
Description: The Ukrainian president says the country will not forgive or forget those who murder its civilians.

Title: War in Ukraine: Taking cover in a town under attack
Description: Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.

Title: Ukraine war 'catastrophic for global food'
Descript