In [1]:
%run -i "../util/util_simple_classifier.ipynb"
%run -i "../util/lang_utils.ipynb"

In [3]:
from nltk import word_tokenize
from sklearn.cluster import KMeans
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
# Fetch the "train" and "test" splits of the SetFit/bbc-news dataset.
train_dataset, test_dataset = (
    load_dataset("SetFit/bbc-news", split=split_name)
    for split_name in ("train", "test")
)

In [6]:
# Convert each split to a pandas DataFrame, then preview the training rows.
train_df, test_df = train_dataset.to_pandas(), test_dataset.to_pandas()
train_df.head()

Unnamed: 0,text,label,label_text
0,wales want rugby league training wales could f...,2,sport
1,china aviation seeks rescue deal scandal-hit j...,1,business
2,rock band u2 break ticket record u2 have smash...,3,entertainment
3,markets signal brazilian recovery the brazilia...,1,business
4,tough rules for ringtone sellers firms that fl...,0,tech


In [8]:
# Per-category row counts: training split first, then the test split.
print(
    *(split_df.groupby('label_text').count() for split_df in (train_df, test_df)),
    sep="\n",
)

               text  label
label_text                
business        286    286
entertainment   210    210
politics        242    242
sport           275    275
tech            212    212
               text  label
label_text                
business        224    224
entertainment   176    176
politics        175    175
sport           236    236
tech            189    189


In [13]:
# Pool the two published splits and carve out a fresh stratified 80/20 split,
# so each label keeps the same share in the new train and test frames.
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(combined_df["text"], combined_df["label"]))

# The splitter returns shuffled row positions; sorting them keeps the rows in
# their original order (combined_df has a fresh 0..n-1 index, so positions and
# index labels coincide).
train_df = combined_df.iloc[sorted(train_index)].copy()
test_df = combined_df.iloc[sorted(test_index)].copy()

# Per-category counts after the re-split: train first, then test.
print(
    *(split_df.groupby('label_text').count() for split_df in (train_df, test_df)),
    sep="\n",
)

               text  label
label_text                
business        408    408
entertainment   309    309
politics        333    333
sport           409    409
tech            321    321
               text  label
label_text                
business        102    102
entertainment    77     77
politics         84     84
sport           102    102
tech             80     80


In [14]:
# Preprocess both splits: tokenize the "text" column, then filter the
# resulting "text_tokenized" column with remove_stopword_punct.
# (Both helpers are presumably defined in the %run utility notebooks.)
train_df = remove_stopword_punct(tokenize(train_df, "text"), "text_tokenized")
test_df = remove_stopword_punct(tokenize(test_df, "text"), "text_tokenized")

In [15]:
# Preview the first rows to check the new text_tokenized column.
train_df.head(n=5)

Unnamed: 0,text,label,label_text,text_tokenized
0,wales want rugby league training wales could f...,2,sport,"[wales, want, rugby, league, training, wales, ..."
1,china aviation seeks rescue deal scandal-hit j...,1,business,"[china, aviation, seeks, rescue, deal, scandal..."
2,rock band u2 break ticket record u2 have smash...,3,entertainment,"[rock, band, u2, break, ticket, record, u2, sm..."
3,markets signal brazilian recovery the brazilia...,1,business,"[markets, signal, brazilian, recovery, brazili..."
4,tough rules for ringtone sellers firms that fl...,0,tech,"[tough, rules, ringtone, sellers, firms, flout..."


In [17]:
# Build the TF-IDF representation of the training documents.
# First re-join the filtered tokens (stopwords/punctuation already removed)
# into one space-separated string per document.
train_df["text_clean"] = train_df["text_tokenized"].map(" ".join)
test_df["text_clean"] = test_df["text_tokenized"].map(" ".join)

# Persist both preprocessed frames as JSON.
train_df.to_json("../data/bbc_train.json")
test_df.to_json("../data/bbc_test.json")

# Fit on the training text only, using unigrams, bigrams and trigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
matrix = vectorizer.fit_transform(train_df["text_clean"])
print(f"TF-IDF matrix shape: {matrix.shape}")

TF-IDF matrix shape: (1780, 665564)


In [18]:
# Cluster the TF-IDF rows into five groups (the dataset has five categories);
# the fixed random_state keeps the run reproducible.
km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(matrix)
km

0,1,2
,n_clusters,5
,init,'k-means++'
,n_init,10
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,0
,copy_x,True
,algorithm,'lloyd'


In [19]:
def get_most_frequent_words(text, num_words):
    """Return the `num_words` most frequent tokens in `text`, most common first.

    The text is split with NLTK's `word_tokenize` and the tokens are counted
    with `FreqDist`; only the tokens (not their counts) are returned.
    """
    token_counts = FreqDist(word_tokenize(text))
    return [token for token, _count in token_counts.most_common(num_words)]

In [20]:
def print_most_common_words_by_cluster(input_df, km, num_clusters, num_words=10):
    """Print the most frequent words of every cluster and return the labelled frame.

    Parameters
    ----------
    input_df : DataFrame with a "text_clean" column. It must hold exactly the
        rows the model was fitted on, in the same order, because the fitted
        labels are attached row by row.
    km : fitted clustering model exposing `labels_`.
    num_clusters : cluster ids 0 .. num_clusters - 1 are reported.
    num_words : how many top words to print per cluster (default 10).

    Returns
    -------
    The same `input_df` object. NOTE: it is modified in place -- a "Cluster"
    column holding each row's cluster id is added (or overwritten).
    """
    input_df["Cluster"] = km.labels_.tolist()
    for cluster in range(num_clusters):
        # Concatenate every document assigned to this cluster into one string.
        cluster_docs = input_df.loc[input_df["Cluster"] == cluster, "text_clean"]
        all_text = " ".join(cluster_docs.astype(str))
        most_common_words = get_most_frequent_words(all_text, num_words)
        print(cluster)
        print(most_common_words)
    return input_df
        

In [21]:
# Print the top words for each fitted cluster; the returned frame (which now
# carries the Cluster column) is displayed as the cell output.
print_most_common_words_by_cluster(input_df=train_df, km=km, num_clusters=km.n_clusters)

0
['mr', 'said', 'would', 'labour', 'party', 'election', 'blair', 'government', 'people', 'brown']
1
['film', 'best', 'said', 'also', 'awards', 'year', 'music', 'one', 'award', 'us']
2
['said', 'game', 'first', 'england', 'would', 'world', 'one', 'last', 'win', 'two']
3
['said', 'us', 'year', 'people', 'mr', 'also', 'would', 'new', 'one', 'could']
4
['said', 'people', 'software', 'mr', 'users', 'would', 'microsoft', 'us', 'also', 'could']


Unnamed: 0,text,label,label_text,text_tokenized,text_clean,Cluster
0,wales want rugby league training wales could f...,2,sport,"[wales, want, rugby, league, training, wales, ...",wales want rugby league training wales could f...,2
1,china aviation seeks rescue deal scandal-hit j...,1,business,"[china, aviation, seeks, rescue, deal, scandal...",china aviation seeks rescue deal scandal-hit j...,3
2,rock band u2 break ticket record u2 have smash...,3,entertainment,"[rock, band, u2, break, ticket, record, u2, sm...",rock band u2 break ticket record u2 smashed ir...,2
3,markets signal brazilian recovery the brazilia...,1,business,"[markets, signal, brazilian, recovery, brazili...",markets signal brazilian recovery brazilian st...,3
4,tough rules for ringtone sellers firms that fl...,0,tech,"[tough, rules, ringtone, sellers, firms, flout...",tough rules ringtone sellers firms flout rules...,3
...,...,...,...,...,...,...
2217,soros group warns of kazakh close the open soc...,1,business,"[soros, group, warns, kazakh, close, open, soc...",soros group warns kazakh close open society in...,3
2218,election could be terror target terrorists m...,4,politics,"[election, could, terror, target, terrorists, ...",election could terror target terrorists might ...,0
2219,lifestyle governs mobile choice faster bett...,0,tech,"[lifestyle, governs, mobile, choice, faster, b...",lifestyle governs mobile choice faster better ...,3
2220,mobile multimedia slow to catch on there is no...,0,tech,"[mobile, multimedia, slow, catch, doubt, mobil...",mobile multimedia slow catch doubt mobile phon...,3


In [23]:
# Take the second held-out article, vectorize it with the already-fitted
# TF-IDF vectorizer, and ask the model which cluster it belongs to.
test_example = test_df["text_clean"].iloc[1]
print(f"Test example text: {test_example}")
vectorized = vectorizer.transform([test_example])
predicted_cluster = km.predict(vectorized)
print(f"Predicted cluster: {predicted_cluster[0]}")


Test example text: lib dems new election pr chief lib dems appointed senior figure bt party new communications chief next general election effort sandy walkington work senior figures matthew taylor completing party manifesto party chief executive lord rennard said appointment significant strengthening lib dem team mr walkington said wanted party ready mischief rivals media tried throw role ensure new public profile effectively communicated levels said also know party put scrutiny media parties never need show ready prepared counter mischief misrepresentation often comes party opponents party already demonstrating every issue effective opposition mr walkington new job title director general election communications
Predicted cluster: 0


In [25]:
from joblib import dump, load

# Persist the fitted TF-IDF vectorizer together with the KMeans model.
# The model alone cannot score new text in a fresh session, because inputs
# must be vectorized with the same fitted vectorizer it was trained on.
dump(km, '../data/kmeans.joblib')
dump(vectorizer, '../data/tfidf_vectorizer.joblib')

# Round-trip check: reload both artifacts and predict from the reloaded pair
# only. NOTE: joblib files are pickle-based -- load only files you trust.
km_ = load('../data/kmeans.joblib')
vectorizer_ = load('../data/tfidf_vectorizer.joblib')
prediction = km_.predict(vectorizer_.transform([test_example]))
print(f"Predicted cluster from loaded model: {prediction[0]}")

Predicted cluster from loaded model: 0
