In [4]:
# https://www.kaggle.com/datasets/hgultekin/bbcnewsarchive

import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('names')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [7]:
dataset = pd.read_csv("bbc-news-data.csv", delimiter='	')
dataset.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [8]:
labels = dataset['category'].unique()
labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [9]:
dataset = dataset['content'].to_numpy()
dataset

array([' Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers fo

In [15]:
import re
import string
import numpy as np

data_proc = []
lemmatizer = WordNetLemmatizer()
nomes = set([word.lower() for word in names.words()])

for doc in dataset:
  doc = doc.lower()
  doc = re.sub(f'[{re.escape(string.punctuation)}]', '', doc)
  doc = re.sub(r'(\d+)', ' ', doc)
  doc = re.sub(r'(\s+)', ' ', doc)
  doc_lem = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word not in nomes)
  data_proc.append(doc_lem)

data_proc = np.array(data_proc)

In [16]:
data_proc

array(['quarterly profit at u medium giant timewarner jumped to bn £ m for the three month to december from m yearearlier the firm which is now one of the biggest investor in google benefited from sale of highspeed internet connection and higher advert sale timewarner said fourth quarter sale to bn from bn it profit were buoyed by oneoff gain which offset a profit dip at bros and le user for aol time said on friday that it now owns of searchengine google but it own internet business aol had ha mixed fortune it lost subscriber in the fourth quarter profit were lower than in the preceding three quarter however the company said aols underlying profit before exceptional item on the back of stronger internet advertising revenue it hope to increase subscriber by offering the online service free to timewarner internet customer and try to sign up aols existing customer for highspeed broadband timewarner also ha to restate and result following a probe by the u security exchange commission sec w

In [17]:
tfidf = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')

vect_tfidf = tfidf.fit_transform(data_proc)

In [18]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)

lda.fit(vect_tfidf)

In [19]:
vect_tfidf.shape

(2225, 14606)

In [23]:
lda.components_.shape

(5, 14606)

In [25]:
labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [27]:
terms = tfidf.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
  print("Topic {}:" .format(topic_idx))
  print(" ".join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0:
nogar savoy miyazakis eslates eduvision wham howl exeem tgwu mock
Topic 1:
commodore mirza euronext rusedski nintendo lse henman boerse kilroysilk ukip
Topic 2:
cup play won match player england award best film game
Topic 3:
spyware uwb firefox screensaver bez antivirus printer bluray ballet lycos
Topic 4:
party uk labour firm company government bn new people mr
