## Community detection clustering with SBERT

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util


In [2]:
%run -i "../util/lang_utils.ipynb"

In [5]:
bbc_df = pd.read_csv("../data/bbc-text.csv")
bbc_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [6]:
# Load the model and set up embedding
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(bbc_df['text'], convert_to_tensor=True)

In [10]:
model, embeddings

(SentenceTransformer(
   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
   (2): Normalize()
 ),
 tensor([[-0.0016, -0.0673,  0.0112,  ..., -0.0152, -0.0637,  0.0846],
         [-0.0835,  0.0595, -0.0132,  ..., -0.0932, -0.0025,  0.0212],
         [-0.0560, -0.0085, -0.0258,  ..., -0.0560,  0.0439, -0.0470],
         ...,
         [ 0.0272, -0.1363,  0.0270,  ..., -0.0920, -0.0774, -0.0078],
         [ 0.0464, -0.0360,  0.0653,  ...,  0.0458, -0.0162,  0.0487],
         [-0.0211,  0.0875, -0.0688,  ..., -0.0633, -0.0909,  0.0512]]))

In [11]:
# Create clusters
# List of lists, where inner lists are indices of texts in the same cluster
clusters = util.community_detection(embeddings, min_community_size=10, threshold=0.7)
print(clusters)

[[1553, 2059, 192, 1208, 493, 827, 1594, 1082, 516, 1938, 1650, 530, 883, 638, 1359, 1154, 2152, 117, 1257, 1898, 168], [178, 1813, 76, 290, 1810, 518, 337, 1172, 1242, 1151, 1057, 1981, 755, 923, 1942, 1560, 497, 1882, 1105], [150, 1645, 1636, 503, 281, 1940, 1633, 758, 1971, 376, 1405, 1156, 900, 1946], [1824, 1014, 2024, 1440, 1018, 565, 389, 1917, 1588, 399, 1259, 791, 1288], [1004, 901, 1621, 1580, 1499, 1751, 1037, 1323, 1534, 2178, 1041, 373], [1244, 42, 2128, 1063, 1597, 2104, 1292, 1915, 959, 2081, 1304], [767, 787, 186, 1625, 1651, 193, 1171, 2148, 1797, 1284], [1476, 2129, 388, 134, 1069, 682, 1680, 2186, 2198, 2106]]


In [22]:
# Get the most frequent words in a list
from nltk.probability import FreqDist
def get_most_frequent_words(text, num_words):
    word_list = word_tokenize(text)
    fdist = FreqDist(word_list)
    most_common = fdist.most_common(num_words)
    return [word[0] for word in most_common]

# Gain some intuition about the clusters, with most frequent words in each cluster
def print_words_by_cluster(clusters, input_df):
    for i, cluster in enumerate(clusters):
        print(f"\nCluster {i+1}, {len(cluster)} documents")
        sentences = input_df.iloc[cluster]['text']
        all_text = ' '.join(sentences).lower()
        freq_words = get_most_frequent_words(all_text, 10)
        print("Most frequent words:", freq_words)

In [23]:
print_words_by_cluster(clusters, bbc_df)


Cluster 1, 21 documents
Most frequent words: ['the', '.', 'to', 'and', 'a', 'of', 'mr', 'in', 'he', 's']

Cluster 2, 19 documents
Most frequent words: ['the', '.', 'to', 'of', 'in', 'a', 'yukos', 'and', 'for', 'is']

Cluster 3, 14 documents
Most frequent words: ['the', '.', 'and', 'to', 'of', 'a', 'in', 'kenteris', 'greek', 'thanou']

Cluster 4, 13 documents
Most frequent words: ['the', '.', 'to', 'and', 'of', 'a', 'he', 'mr', 'on', 'in']

Cluster 5, 12 documents
Most frequent words: ['the', '.', 'and', 'to', 'of', 'for', '-', 'best', 'a', 'in']

Cluster 6, 11 documents
Most frequent words: ['the', 'in', '.', 'of', 'a', '%', 'to', 'and', 'said', 'prices']

Cluster 7, 10 documents
Most frequent words: ['the', '.', 'to', 'a', 'lse', 'deutsche', 'boerse', 'in', 'of', 's']

Cluster 8, 10 documents
Most frequent words: ['the', '.', 'of', 'to', 'dollar', 'in', 'a', 'and', 'us', 's']


In [24]:
from sklearn.model_selection import train_test_split
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1)

In [27]:
from sklearn.cluster import KMeans
bbc_df = pd.read_csv("../data/bbc-text.csv")
documents = bbc_train['text'].values
model = SentenceTransformer('all-MiniLM-L6-v2')
encoded_data = model.encode(documents)
km = KMeans(n_clusters=5, n_init='auto', init='k-means++')
km.fit(encoded_data)

0,1,2
,n_clusters,5
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,
,copy_x,True
,algorithm,'lloyd'


In [None]:
def print_most_common_words_by_cluster(input_df, km, num_clusters):
    clusters = km.labels_.tolist()
    input_df["Cluster"] = clusters
    for cluster in range(0, num_clusters):
        this_cluster_text = input_df[input_df["Cluster"] == cluster]
        all_text = " ".join(this_cluster_text["text_clean"].astype(str))
        most_common_words = get_most_frequent_words(all_text, 10)
        print(cluster)
        print(most_common_words)
    return input_df
print_most_common_words_by_cluster(bbc_train, km, 5)

0
['the', '.', 'to', 'a', 'in', 'and', 'of', 's', 'i', 'for']
1
['the', '.', 'to', 'of', 'in', 'a', 'and', 's', 'said', 'is']
2
['the', '.', 'and', 'of', 'to', 'a', 'in', 's', 'for', 'on']
3
['the', '.', 'to', 'of', 'and', 'a', 'in', 'that', 'is', 'it']
4
['the', '.', 'to', 'of', 'and', 'a', 'in', 'said', 'he', 'for']


Unnamed: 0,category,text,Cluster
1508,politics,kennedy s cautious optimism charles kennedy is...,4
225,entertainment,fox too reliant on reality tv the head of us...,2
31,tech,firefox browser takes on microsoft microsoft s...,3
2127,sport,dunne keen to commit to man city richard dunne...,0
1807,sport,english clubs make euro history all four of en...,0
...,...,...,...
891,politics,councils must find gypsy sites ministers are...,4
1668,sport,johnson too strong for gb runners britain s ka...,0
1373,business,uk economy facing major risks the uk manufac...,1
1570,politics,donor attacks blair-brown feud the reported ...,4


In [29]:
bbc_test["predictions"] = bbc_test["text"].apply(lambda x: km.predict(model.encode([x]))[0])
print(bbc_test)

           category                                               text  \
1711          sport  souness backs smith for scotland graeme sounes...   
869        politics  voters  reject eu by two to one  british voter...   
2145          sport  wales coach elated with win mike ruddock paid ...   
638        politics  could rivalry overshadow election  tony blair ...   
380   entertainment  stallone evicted from big brother jackie stall...   
...             ...                                                ...   
838           sport  spain coach faces racism inquiry spain s footb...   
1823       business  qantas considers offshore option australian ai...   
1          business  worldcom boss  left books alone  former worldc...   
2212       business  christmas shoppers flock to tills shops all ov...   
633   entertainment  gallery unveils interactive tree a christmas t...   

      predictions  
1711            0  
869             4  
2145            0  
638             4  
380        

In [30]:
topic_mapping = {
    0: 'tech',
    1: 'sport',
    2: 'entertainment',
    3: 'politics',
    4: 'business'
}
bbc_test["predicted_topic"] = bbc_test["predictions"].map(topic_mapping)
print(bbc_test[['text', 'predicted_topic']])

                                                   text predicted_topic
1711  souness backs smith for scotland graeme sounes...            tech
869   voters  reject eu by two to one  british voter...        business
2145  wales coach elated with win mike ruddock paid ...            tech
638   could rivalry overshadow election  tony blair ...        business
380   stallone evicted from big brother jackie stall...   entertainment
...                                                 ...             ...
838   spain coach faces racism inquiry spain s footb...            tech
1823  qantas considers offshore option australian ai...           sport
1     worldcom boss  left books alone  former worldc...           sport
2212  christmas shoppers flock to tills shops all ov...           sport
633   gallery unveils interactive tree a christmas t...   entertainment

[223 rows x 2 columns]


In [31]:
from sklearn.metrics import classification_report
print(classification_report(bbc_test['category'], bbc_test['predicted_topic']))

               precision    recall  f1-score   support

     business       0.00      0.00      0.00        52
entertainment       0.94      0.91      0.93        35
     politics       0.00      0.00      0.00        51
        sport       0.00      0.00      0.00        51
         tech       0.00      0.00      0.00        34

     accuracy                           0.14       223
    macro avg       0.19      0.18      0.19       223
 weighted avg       0.15      0.14      0.15       223

