In [None]:
!pip install bertopic fasttext nepalitokenizer nepali-stemmer pymongo snowballstemmer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nepalitokenizer
  Downloading nepalitokenizer-1.8.6.0-py3-none-any.whl (8.2 kB)
Collecting nepali-stemmer
  Downloading nepali_stemmer-0.0.2-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.0/149.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymongo
  Downloading pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
# Mounting Google Drive for fasttext file
from google.colab import drive

import pandas as pd

drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive/
/content/gdrive/MyDrive


In [None]:
# Load the news headlines from MongoDB into a dataframe named news_text_df
import os

import pymongo
import pandas as pd

# Read the connection string from the environment so real credentials never have
# to be written into the notebook; falls back to the original placeholder value.
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodburl")

myclient = pymongo.MongoClient(MONGODB_URL)
mydb = myclient["major-project"]
mycol = mydb["news_np"]

# Project only the headline field; the resulting frame also carries `_id`.
mydoc = mycol.find({}, {"headline": 1})

# Materialise the cursor once and build the frame in a single step.
news_text_df = pd.DataFrame(list(mydoc))

In [None]:
# Preview the loaded frame (the display is truncated to the first and last rows).
news_text_df

Unnamed: 0,_id,headline
0,6419e7569cd6f156db0ad4c6,"नेपालमा पश्चिमी वायुको प्रभाव, उच्च पहाडी क्षे..."
1,6419e7569cd6f156db0ad4c7,रेडक्रसको भ्रष्टाचारमा अन्देखा किन ?
2,6419e7569cd6f156db0ad4c8,तीन थान एयर गनसहित चार भारतीय नियन्त्रणमा
3,6419e7569cd6f156db0ad4c9,सांसद चौधरीको मुद्दामा उच्च अदालतको कारण देखाउ...
4,6419e7569cd6f156db0ad4ca,राष्ट्रपति पदमा गैरएमालेको खोजी : शेरबहादुर कु...
...,...,...
263870,6419e8ce9cd6f156db0edb86,दाङमा लघुवित्त पीडितले दिए प्रशासनमा धर्ना
263871,6419e8ce9cd6f156db0edb87,दाङमा अटोरिक्सा दुर्घटना हुँदा युवकको मृत्यु
263872,6419e8ce9cd6f156db0edb88,"माधव भन्छन् १११४ भोट, ओली भन्छन् १४३५"
263873,6419e8ce9cd6f156db0edb89,ट्याङकर चोरेर डिजल बेचेको आरोप लागेका १ जना थु...


In [None]:
# All necessary imports
import fasttext
import fasttext.util
from bertopic.backend import BaseEmbedder
import numpy as np
import snowballstemmer
from nepalitokenizer import NepaliTokenizer
from bertopic import BERTopic
# from cuml.cluster import HDBSCAN
# from cuml.manifold import UMAP
from hdbscan import HDBSCAN
from umap import UMAP
# from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# Nepali stop-word list from the NLTK stopwords corpus (downloaded above).
stopword= stopwords.words('nepali')
# Snowball stemmer configured for Nepali; used to stem the tokenized headlines.
stemmer = snowballstemmer.stemmer('nepali')

## Creating custom embedder using fasttext
class CustomEmbedder(BaseEmbedder):
    """BERTopic embedding backend that embeds documents with a fastText model.

    Parameters
    ----------
    embedding_model
        A loaded fastText model. It must provide `get_sentence_vector(text)`;
        `get_dimension()` is only needed when an empty batch is embedded.
    """

    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        """Return one sentence vector per document.

        Parameters
        ----------
        documents
            Iterable of strings (a list or a pandas Series both work).
        verbose
            Accepted for interface compatibility; not used.

        Returns
        -------
        numpy.ndarray
            float64 array of shape `(n_documents, dim)`, where `dim` is the
            dimensionality of the fastText model (no longer hard-coded to 300).
            An empty input yields an array of shape `(0, dim)`.
        """
        # fastText's get_sentence_vector rejects text containing a newline, so
        # newlines are replaced by spaces instead of aborting the whole batch.
        vectors = [
            self.embedding_model.get_sentence_vector(doc.replace("\n", " "))
            for doc in documents
        ]

        if not vectors:
            # Keep the result 2-D even when there is nothing to embed.
            return np.empty((0, self.embedding_model.get_dimension()), dtype=np.float64)

        # float64 matches what the previous tolist()/asarray round trip produced.
        return np.asarray(vectors, dtype=np.float64)

# Create custom backend
# Load the fastText binary model from the mounted Drive folder (presumably the
# pretrained Nepali vectors, judging by the file name — TODO confirm) and wrap it
# in CustomEmbedder so it can be handed to BERTopic as `embedding_model`.
ft_ne = fasttext.load_model("/content/gdrive/MyDrive/major-project/cc.ne.300.bin")
custom_embedder = CustomEmbedder(embedding_model=ft_ne)

## Tokenizing, stemming and removing stop words from the documents

tokenize = NepaliTokenizer()

# A set gives constant-time membership tests; the original list lookup scanned
# the whole stop-word list for every token of every headline.
stopword_set = set(stopword)


def preprocess_headline(text):
    """Tokenize `text`, stem the tokens and drop stems that are stop words.

    Returns the remaining stems joined by single spaces.

    NOTE(review): stop words are matched against the *stemmed* tokens, exactly
    as before. A stop word whose stem differs from its listed form is therefore
    not removed — confirm this ordering is intended.
    """
    stems = stemmer.stemWords(tokenize.tokenizer(text))
    return " ".join(w for w in stems if w not in stopword_set)


# Single pass over the headlines; the column content is the same as the previous
# three-step version produced.
news_text_df["headline_stemmed"] = news_text_df["headline"].apply(preprocess_headline)

# Defining UMAP (dimensionality reduction) and HDBSCAN (clustering)
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)
# BERTopic's `min_topic_size` only configures the HDBSCAN instance that BERTopic
# creates itself; it is not applied to a custom `hdbscan_model`. The minimum
# cluster size is therefore set here, otherwise HDBSCAN falls back to its small
# default and topics with only a handful of documents are produced.
hdbscan_model = HDBSCAN(min_cluster_size=100,
                        min_samples=100,
                        gen_min_span_tree=True,
                        prediction_data=True)

## Creating vectorizer with custom tokenizer


def nepali_tokenizer(text):
    """Tokenize `text` with the shared NepaliTokenizer instance.

    Returns the tokens exactly as `tokenize.tokenizer` produces them; this is
    the callable CountVectorizer uses in place of its built-in tokenization.
    """
    return tokenize.tokenizer(text)


# `tokenize` is the NepaliTokenizer instance already created earlier in this cell
# for preprocessing, so it is not instantiated a second time here.
# token_pattern=None makes it explicit that the default regex is unused when a
# custom tokenizer is supplied (and avoids scikit-learn's warning about it).
vectorizer = CountVectorizer(tokenizer=nepali_tokenizer, token_pattern=None, min_df=10)



## Get sentence embeddings
# Embeddings are computed once up front from the preprocessed headlines and then
# passed to fit_transform, so they can be reused for the visualisations below.
embeddings = custom_embedder.embed(news_text_df["headline_stemmed"])

# Train BERTopic
# NOTE(review): per the BERTopic API, `min_topic_size` only configures the HDBSCAN
# model BERTopic builds by default. Because a custom `hdbscan_model` is supplied
# here, the minimum cluster size has to be set on that HDBSCAN instance instead.
topic_model = BERTopic(embedding_model=custom_embedder,
                       vectorizer_model=vectorizer,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       low_memory=True,
                       calculate_probabilities=False,
                       min_topic_size=100)
# Run BERTopic model
# NOTE(review): per the BERTopic API, fit_transform returns a
# (topics, probabilities) pair, so `topics` holds that whole pair rather than
# only the list of topic ids.
topics = topic_model.fit_transform(news_text_df["headline_stemmed"],embeddings)



# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# random_state is fixed so the 2-D layout (and therefore the saved figure) is
# reproducible from run to run.
reduced_embeddings = UMAP(n_neighbors=10,
                          n_components=2,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=100).fit_transform(embeddings)


# Plot the original (unstemmed) headlines at their 2-D positions.
fig = topic_model.visualize_documents(news_text_df["headline"], reduced_embeddings=reduced_embeddings)





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Render the document map built by visualize_documents in the previous cell.
fig

In [None]:
# Summary table with one row per topic: topic id, document count and generated name.
# In BERTopic, topic -1 collects the documents that were not assigned to any cluster.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,147068,-1_कांग्रेस_प्रधानमन्त्री_सरकार_प्रचण्ड
1,0,11084,0_विश्वकप_क्रिकेट_जित_खेल
2,1,3514,1_बलात्कार_हत्या_आरोप_छोरी
3,2,3502,2_सडक_पुल_अवरुद्ध_राजमार्ग
4,3,3310,3_सर्वोच्च_आदेश_अदालत_न्यायाधीश
...,...,...,...
332,331,5,331_संस्कृति_भाषा_संरक्षण_सशक्त
333,332,5,332_इतिहास_सूर्यबहादुर_बदनाम_रंग
334,333,5,333_रकम_घोटाला_प्रकरणः_उपभोक्ता
335,334,5,334_क्षमता_अस्थिरता_नारा_हटाउ


In [None]:
# Same call, with the same arguments, that produced `fig` in the training cell.
topic_model.visualize_documents(news_text_df["headline"], reduced_embeddings=reduced_embeddings)

In [None]:
# Topic-level overview plot produced by BERTopic for the fitted model.
topic_model.visualize_topics()

In [None]:
# Save the document map as an HTML file on the mounted Drive.
fig.write_html("/content/gdrive/MyDrive/major-project/viz_test.html")

In [None]:
# Persist the fitted model without its embedding model (save_embedding_model=False).
# NOTE(review): the file name says "first_2k_data", but the embeddings used for
# fitting have 263875 rows — confirm the name is still accurate.
# NOTE(review): a model reloaded from this file will presumably need the fastText
# embedder supplied again before it can embed new text — TODO confirm.
topic_model.save("/content/gdrive/MyDrive/major-project/pro_first_2k_data_3.10", save_embedding_model=False)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [None]:
# Sanity check: one row per headline, one column per embedding dimension.
embeddings.shape

(263875, 300)

In [None]:
# Confirm the embeddings are a plain NumPy array before saving them with np.save.
type(embeddings)

numpy.ndarray

In [None]:
# Store the sentence embeddings on Drive so they need not be recomputed.
# np.save appends the ".npy" extension because the given name does not have one.
np.save("/content/gdrive/MyDrive/major-project/pro_first_2k_data_embeddings", embeddings)

In [None]:
# Bar charts of the top terms, limited to 20 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=20)

In [None]:
# Heatmap view of the fitted topics produced by BERTopic.
topic_model.visualize_heatmap()

In [None]:
# New headline to match against the fitted topics
new_news = "कर्णाली सरकारको ४ अर्ब बेरुजु"
# Find topics
num_of_topics = 3
# find_topics returns a pair (topic ids, similarity scores); unpack both so that
# `similarity` is actually defined for the message below.
similar_topics, similarity = topic_model.find_topics(new_news, top_n=num_of_topics)
# NOTE(review): the model was fitted on stemmed, stop-word-filtered headlines while
# this query is raw text — consider applying the same preprocessing to the query.
# Print results
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

Exception: ignored

In [None]:
# Predict the topic of the single new headline.
# NOTE(review): the recorded run of this cell ended in an AttributeError whose cause
# is not visible here. One thing to check is whether `topic_model` still had its
# embedding model attached at that point (it is saved with
# save_embedding_model=False in an earlier cell) — TODO confirm.
topic_model.transform(new_news)

AttributeError: ignored

In [None]:
# Echo the query headline used in the two cells above.
new_news

'कर्णाली सरकारको ४ अर्ब बेरुजु'