In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)

    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]

    return ' '.join(filtered_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
import os

base_path = '/content/drive/MyDrive/bbc/'
# topics = ['tech','politics','sport','business','entertainment']
topics = ['politics']
contents = {}
for topic in topics:
  contents[topic] = []
  folder_path = base_path + topic
  # Get a list of all files in the folder
  all_files = os.listdir(folder_path)
  # Filter out only the files (excluding subdirectories)
  files_only = [file for file in all_files if os.path.isfile(os.path.join(folder_path, file))]
  for file in files_only:
      file_path = os.path.join(folder_path, file)
      with open(file_path, 'r', encoding='utf-8') as txt_file:
            file_content = txt_file.read()
            contents[topic].append(remove_stopwords(file_content))


In [10]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

top_topics = {}
for topic in topics:
  # Create BERTopic model
  model = BERTopic()
  topics, probabilities = model.fit_transform(contents[topic])
  # Get the top topics
  top_topic = model.get_topic_freq().head(5)  # Get the top 5 topics
  top_topics[topic] = {"topics":top_topic, "model":model}

In [11]:
politics = top_topics['politics']
# Print the top topics
print(politics["topics"])
politics["model"].get_topic_info()
# Visualize topics
# politics["model"].visualize_topics()

   Topic  Count
0     -1    114
5      0     59
1      1     58
2      2     49
3      3     18


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,114,-1_said_mr_would_people,"[said, mr, would, people, labour, government, ...",[Howard denies split ID cards Michael Howard d...
1,0,59,0_mr_blair_labour_brown,"[mr, blair, labour, brown, election, minister,...",[Blair dismisses quit claim report Tony Blair ...
2,1,58,1_tax_labour_would_election,"[tax, labour, would, election, mr, brown, said...",[Howard dismisses Tory tax fears Michael Howar...
3,2,49,2_said_police_law_would,"[said, police, law, would, rights, home, human...",[Lords wrong detainees - Straw Jack Straw atta...
4,3,18,3_aid_world_africa_brown,"[aid, world, africa, brown, g8, countries, sai...",['No UK apology ' colonial past days Britain a...
5,4,17,4_asylum_immigration_uk_howard,"[asylum, immigration, uk, howard, said, system...",[Clarke plans migrant point scheme Anyone plan...
6,5,15,5_hunting_ban_hunt_law,"[hunting, ban, hunt, law, hunts, said, dogs, p...",[Minister defends hunting ban law law banning ...
7,6,15,6_age_vote_election_electoral,"[age, vote, election, electoral, ballot, mock,...",[Parties warned 'grey vote' Political parties ...
8,7,15,7_eu_straw_constitution_china,"[eu, straw, constitution, china, referendum, t...",[Straw attacked China arms Moves lift European...
9,8,13,8_ukip_kilroysilk_party_veritas,"[ukip, kilroysilk, party, veritas, mr, robert,...",[Kilroy launches 'Veritas ' party Ex-BBC chat ...


In [37]:
# Get topic names
label_names = politics["model"].generate_topic_labels(nr_words=3, topic_prefix=True, word_length=7, separator='_')
# politics["model"].set_topic_labels(labels)
politics["model"].custom_labels_ = label_names
politics["model"].visualize_barchart(custom_labels=True)

In [40]:
politics["model"].visualize_barchart(topics=[0,1,2,4,5,6,7,12], custom_labels=True)

In [44]:
df = politics["model"].get_document_info(contents['politics'])
df.head()

Unnamed: 0,Document,Topic,Name,CustomName,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Election 'could terror target' Terrorists migh...,-1,-1_said_mr_would_people,-1_said_mr_would,"[said, mr, would, people, labour, government, ...",[Howard denies split ID cards Michael Howard d...,said - mr - would - people - labour - governme...,0.0,False
1,'Debate needed ' donations cap cap donations p...,1,1_tax_labour_would_election,1_tax_labour_would,"[tax, labour, would, election, mr, brown, said...",[Howard dismisses Tory tax fears Michael Howar...,tax - labour - would - election - mr - brown -...,0.983589,False
2,Kennedy begins pre-election tour Liberal Democ...,-1,-1_said_mr_would_people,-1_said_mr_would,"[said, mr, would, people, labour, government, ...",[Howard denies split ID cards Michael Howard d...,said - mr - would - people - labour - governme...,0.0,False
3,'Errors ' doomed first Dome sale initial attem...,-1,-1_said_mr_would_people,-1_said_mr_would,"[said, mr, would, people, labour, government, ...",[Howard denies split ID cards Michael Howard d...,said - mr - would - people - labour - governme...,0.0,False
4,Correction agency plans dropped Plans create s...,2,2_said_police_law_would,2_said_police_law,"[said, police, law, would, rights, home, human...",[Lords wrong detainees - Straw Jack Straw atta...,said - police - law - would - rights - home - ...,0.6661,False


In [45]:
df.to_csv("politics_topics.csv", index=False)