## BERTopic

In [2]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
# Execute the shared utilities notebook inside this kernel's namespace (-i), so
# the names it defines become available to the cells below.
# NOTE(review): `stopwords` and `word_tokenize` are used later but never imported
# in this notebook — presumably lang_utils.ipynb provides them; confirm.
%run -i "../util/lang_utils.ipynb"

In [5]:
# Start from the English stop-word list and add two extra tokens that should
# also be dropped from the articles during cleaning.
stop_words = stopwords.words('english')
stop_words.extend(["said", "mr"])

# Load the BBC articles from the CSV file.
bbc_df = pd.read_csv("../data/bbc-text.csv")

In [8]:
# Membership tests against a list scan it linearly for every token; a set makes
# each lookup constant-time. Kept local so `stop_words` itself stays a list.
_stop_word_set = set(stop_words)


def _clean_text(raw_text):
    """Lower-case, tokenize, drop stop words and re-join one document.

    Args:
        raw_text: the article text as a single string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(raw_text.lower())
    return ' '.join(tok for tok in tokens if tok not in _stop_word_set)


# One pass over the column instead of three separate .apply() sweeps.
bbc_df['text'] = bbc_df['text'].apply(_clean_text)

In [9]:
# Peek at the first five cleaned rows to sanity-check the preprocessing.
bbc_df.head(n=5)

Unnamed: 0,category,text
0,tech,tv future hands viewers home theatre systems p...
1,business,worldcom boss left books alone former worldcom...
2,sport,tigers wary farrell gamble leicester say rushe...
3,sport,yeading face newcastle fa cup premiership side...
4,entertainment,ocean twelve raids box office ocean twelve cri...


In [10]:
# Hold out 10% of the articles for evaluation. The seed is fixed so the same
# rows land in train/test on every Restart & Run All; without it the split —
# and every metric computed from it — changes from run to run.
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1, random_state=42)
len(bbc_train), len(bbc_test)

(2002, 223)

In [11]:
# Training documents handed to the topic model: the cleaned text column of the
# training split, as an array.
docs = bbc_train.loc[:, 'text'].values

In [13]:
# Ask for 6 topics; note that one of them is the outlier bucket with id -1.
topic_model = BERTopic(nr_topics=6)

# Fit on the training documents; returns one topic id per document plus the
# associated probabilities.
topics, probs = topic_model.fit_transform(documents=docs)

In [14]:
# Leave the DataFrame as the cell's last expression so Jupyter renders it as a
# table; print() falls back to the wrapped, truncated plain-text repr.
topic_model.get_topic_info()

   Topic  Count                              Name  \
0     -1    517      -1_would_also_new_government   
1      0    455          0_game_england_win_first   
2      1    291          1_us_year_growth_economy   
3      2    286  2_people_mobile_technology_users   
4      3    262          3_best_film_music_awards   
5      4    191   4_labour_would_government_blair   

                                      Representation  \
0  [would, also, new, government, people, us, yea...   
1  [game, england, win, first, cup, world, club, ...   
2  [us, year, growth, economy, company, market, y...   
3  [people, mobile, technology, users, games, dig...   
4  [best, film, music, awards, show, band, one, a...   
5  [labour, would, government, blair, party, elec...   

                                 Representative_Docs  
0  [blair backs pre-election budget tony blair ba...  
1  [paris promise raises welsh hopes better six n...  
2  [yukos bankruptcy us matter russian authoritie...  
3  [mobiles med

In [17]:
def get_prediction(input_text, model):
    """Return the topic id the model assigns to a single document.

    Args:
        input_text: one document as a string.
        model: an object whose ``transform`` accepts a list of documents and
            returns a sequence whose first element holds the topic ids.

    Returns:
        The topic id predicted for ``input_text``.
    """
    # transform() works on batches, so wrap the text in a one-element list and
    # take the first (only) entry of the returned topic ids.
    predicted_topics = model.transform([input_text])[0]
    return predicted_topics[0]
# Predict every held-out document in ONE batched call. Calling transform() once
# per row (as get_prediction does) repeats the model's whole prediction
# pipeline for each document and is far slower on a full test set.
test_topics, _ = topic_model.transform(bbc_test['text'].tolist())

# .assign returns a new frame, so nothing is written into the frame handed back
# by train_test_split (no SettingWithCopyWarning) and the cell is safe to re-run.
bbc_test = bbc_test.assign(prediction=np.asarray(test_topics))

# Topic ids come out of the clustering step and can be renumbered by a refit,
# so a hand-written id -> category table can silently go stale. Derive it
# instead: each topic is labelled with the most frequent true category among
# the TRAINING documents assigned to it (`topics` lines up row-for-row with
# bbc_train because the model was fitted on bbc_train['text']).
train_topics = pd.Series(topics, index=bbc_train.index, name='topic')
topic_mapping = (
    bbc_train['category']
    .groupby(train_topics)
    .agg(lambda categories: categories.value_counts().idxmax())
    .to_dict()
)
topic_mapping[-1] = 'other'  # for outliers

bbc_test = bbc_test.assign(
    predicted_category=bbc_test['prediction'].map(topic_mapping)
)

# Outliers (-1) carry no category, so they are left out of the report. State
# how many were dropped: the scores below describe only the remaining documents.
test_data = bbc_test.loc[bbc_test['prediction'] != -1]
print(
    f"Scored {len(test_data)} of {len(bbc_test)} test documents "
    f"({len(bbc_test) - len(test_data)} outliers excluded)"
)
print(classification_report(test_data['category'], test_data['predicted_category']))

               precision    recall  f1-score   support

     business       0.92      0.92      0.92        26
entertainment       1.00      0.92      0.96        13
     politics       0.91      0.83      0.87        12
        sport       0.97      1.00      0.98        65
         tech       0.95      0.95      0.95        19

     accuracy                           0.96       135
    macro avg       0.95      0.93      0.94       135
 weighted avg       0.96      0.96      0.96       135



In [18]:
# Take the first held-out article (by position) as a stand-in for unseen text.
new_input = bbc_test['text'].iat[0]
print(new_input)

# transform() expects a batch, hence the one-element list; this prints the raw
# value it returns for that document.
print(topic_model.transform([new_input]))

off-colour gardener storms win britain jason gardener shook upset stomach win 60m sunday leipzig international meeting . gardener clocked 6.56 seconds equal meeting record finished well ahead germany marc blume crossed line 6.67 secs . world indoor champion : got airport stomach upset vomiting . almost went home . felt little better sunday morning decided run main race . everything went perfectly . gardener part great britain 4x100m quartet gold athens olympics turn attention next weekend norwich union european indoor trials sheffield . given still off-colour know plenty tank expect get faster next weeks . case chipping away done previous years results come . scotland ian mackie also action leipzig . stepped favoured 400m 200m finish third 21.72 secs . germany alexander kosenkow race 21.07 secs dutchman patrick van balkom second 21.58 secs . plenty senior british athletes showing indoor form weekend . promising 60m hurdler clocked new uk record 7.98 seconds meeting norway . 24-year-old

In [19]:
# Look up the five topics closest to the search term "entertainment" and pair
# every topic id with its similarity score.
topic, similarity = topic_model.find_topics(search_term="entertainment", top_n=5)
sim_topics = [(topic_id, score) for topic_id, score in zip(topic, similarity)]
print(sim_topics)

[(2, np.float64(0.20587171867731857)), (3, np.float64(0.15695120748189495)), (-1, np.float64(0.05573423153649602)), (0, np.float64(0.021746000971420685)), (1, np.float64(0.02049767330961437))]
