In [None]:
!pip install bertopic fasttext nepalitokenizer nepali-stemmer pymongo snowballstemmer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nepalitokenizer
  Downloading nepalitokenizer-1.8.6.0-py3-none-any.whl (8.2 kB)
Collecting nepali-stemmer
  Downloading nepali_stemmer-0.0.2-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.0/149.0 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymongo
  Downloading pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
# Mounting Google Drive for fasttext file
from google.colab import drive

import pandas as pd

drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive/
/content/gdrive/MyDrive


In [None]:
#Import the news headline in dataframe named news_text_df
import os

import pymongo
import pandas as pd

# The connection string can embed credentials, so it is read from the
# MONGODB_URL environment variable instead of being typed into the notebook.
# When the variable is unset this falls back to the previous empty string.
mongodb_url = os.environ.get("MONGODB_URL", "")
myclient = pymongo.MongoClient(mongodb_url)
mydb = myclient["major-project"]
mycol = mydb["news_np"]

# Projection: request only the headline field of every document
# (the preview below shows that _id is returned alongside it).
mydoc = mycol.find({},{"headline":1})

# Materialise the cursor into a DataFrame and preview the first rows.
news_text_df = pd.DataFrame(list(mydoc))
news_text_df.head()

Unnamed: 0,_id,headline
0,6419e7569cd6f156db0ad4c6,"नेपालमा पश्चिमी वायुको प्रभाव, उच्च पहाडी क्षे..."
1,6419e7569cd6f156db0ad4c7,रेडक्रसको भ्रष्टाचारमा अन्देखा किन ?
2,6419e7569cd6f156db0ad4c8,तीन थान एयर गनसहित चार भारतीय नियन्त्रणमा
3,6419e7569cd6f156db0ad4c9,सांसद चौधरीको मुद्दामा उच्च अदालतको कारण देखाउ...
4,6419e7569cd6f156db0ad4ca,राष्ट्रपति पदमा गैरएमालेको खोजी : शेरबहादुर कु...


In [None]:
# All necessary imports
import fasttext
import fasttext.util
from bertopic.backend import BaseEmbedder
import numpy as np
import snowballstemmer
from nepalitokenizer import NepaliTokenizer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# NLTK's Nepali stop-word list; used further down to filter stemmed tokens.
stopword= stopwords.words('nepali')
# Snowball stemmer for Nepali; its stemWords() is applied to token lists below.
stemmer = snowballstemmer.stemmer('nepali')

## Creating custom embedder using fasttext
class CustomEmbedder(BaseEmbedder):
    """BERTopic embedding backend that embeds text with a fastText model.

    Args:
        embedding_model: A loaded fastText model. It must provide
            ``get_sentence_vector(text)`` and ``get_dimension()``.
    """

    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        """Return one fastText sentence vector per document.

        Args:
            documents: Iterable of strings to embed.
            verbose: Accepted for compatibility with the ``BaseEmbedder``
                signature; it is not used here.

        Returns:
            A 2-D float64 array of shape ``(n_documents, dim)``, where ``dim``
            is the model's vector size. An empty ``documents`` yields an array
            of shape ``(0, dim)`` so callers always receive a matrix.
        """
        sentence_vectors = [
            np.asarray(
                # fastText rejects text containing a newline, so embedded line
                # breaks are turned into spaces before the lookup.
                self.embedding_model.get_sentence_vector(sentence.replace("\n", " ")),
                dtype=np.float64,
            ).ravel()
            for sentence in documents
        ]
        if not sentence_vectors:
            # Ask the model for its width instead of assuming 300 dimensions.
            return np.empty((0, self.embedding_model.get_dimension()), dtype=np.float64)
        return np.vstack(sentence_vectors)

# Build the fastText-backed embedder handed to BERTopic.
# The pretrained Nepali vectors are read from the mounted Google Drive.
FASTTEXT_MODEL_PATH = "/content/gdrive/MyDrive/major-project/cc.ne.300.bin"

ft_ne = fasttext.load_model(FASTTEXT_MODEL_PATH)
custom_embedder = CustomEmbedder(embedding_model=ft_ne)

##Removing stop words and stemming the documents

tokenize = NepaliTokenizer()

# Set membership is constant-time; the NLTK stop words arrive as a list, which
# would be scanned once per token across the whole corpus.
_stopword_set = set(stopword)


def _preprocess_headline(headline):
    """Tokenise, stem and stop-word-filter one headline.

    Args:
        headline: Raw headline text.

    Returns:
        The surviving stems joined by single spaces.
    """
    stems = stemmer.stemWords(tokenize.tokenizer(headline))
    # NOTE(review): stop words are matched against the *stemmed* tokens, so a
    # stop word whose stem differs from its listed form is kept - confirm
    # this ordering is intended.
    return " ".join(stem for stem in stems if stem not in _stopword_set)


# Single pass over the raw column; the source "headline" column is left as is.
news_text_df["headline_stemmed"] = news_text_df["headline"].apply(_preprocess_headline)

#Defining UMAP
# Projects the sentence embeddings down to 5 dimensions before clustering;
# random_state is fixed so repeated runs give the same projection.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# min_cluster_size must be set here: HDBSCAN defaults it to 5, and BERTopic does
# not apply its own min_topic_size to a user-supplied hdbscan_model. Without it
# the model yields clusters of only 5 headlines despite min_topic_size=100.
hdbscan_model = HDBSCAN(min_cluster_size=100,
                        min_samples=100,
                        gen_min_span_tree=True,
                        prediction_data=True)

## Creating vectorizer with custom tokenizer


def nepali_tokenizer(text):
    """Tokenise ``text`` with the module-level ``NepaliTokenizer`` instance.

    Serves as the ``tokenizer`` callable given to ``CountVectorizer``; whatever
    ``tokenize.tokenizer`` returns is handed back unchanged.
    """
    return tokenize.tokenizer(text)

# The NepaliTokenizer bound to `tokenize` for the stemming step earlier in this
# cell is reused by nepali_tokenizer, so it is not instantiated a second time.
# min_df=10 drops terms that occur in fewer than 10 documents.
vectorizer = CountVectorizer(tokenizer=nepali_tokenizer, min_df=10)



## Get sentence embeddings
# Computed once and reused for both model fitting and the 2-D projection.
embeddings = custom_embedder.embed(news_text_df["headline_stemmed"])

# Train BERTopic
# Note: min_topic_size only configures BERTopic's default HDBSCAN; it has no
# effect on the custom hdbscan_model passed here.
topic_model = BERTopic(embedding_model=custom_embedder,
                       vectorizer_model=vectorizer,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       low_memory=True,
                       calculate_probabilities=False,
                       min_topic_size=100)
# Run BERTopic model
# Passing the precomputed embeddings stops BERTopic from embedding every
# headline a second time. fit_transform returns a (topics, probabilities) pair,
# which is what `topics` holds.
topics = topic_model.fit_transform(news_text_df["headline_stemmed"], embeddings=embeddings)



# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# random_state is fixed so the 2-D layout is the same on every run.
reduced_embeddings = UMAP(n_neighbors=10,
                          n_components=2,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=100).fit_transform(embeddings)


# The original (unstemmed) headlines are supplied as the documents of the plot.
fig = topic_model.visualize_documents(news_text_df["headline"], reduced_embeddings=reduced_embeddings)





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# New headline to match against the fitted topics
new_news = "कर्णाली सरकारको ४ अर्ब बेरुजु"
# The model was fitted on tokenised, stemmed, stop-word-filtered headlines, so
# the query goes through the same steps before it is embedded and compared.
new_news_stemmed = " ".join(
    stem for stem in stemmer.stemWords(tokenize.tokenizer(new_news))
    if stem not in stopword
)
# Find topics
num_of_topics = 3
similar_topics, similarity = topic_model.find_topics(new_news_stemmed, top_n=num_of_topics)
# Print results
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 3 similar topics are [19, 146, 315], and the similarities are [0.69 0.55 0.55]


In [None]:
# Show the per-topic summary table (Topic id, document Count and generated
# Name); in BERTopic the topic id -1 collects the outlier documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,140642,-1_सरकार_प्रधानमन्त्री_प्रचण्ड_कोरोना
1,0,11140,0_विश्वकप_क्रिकेट_जित_खेल
2,1,5130,1_दुर्घटना_घाइते_मोटरसाइकल_बस
3,2,4360,2_विजयी_मेयर_निर्वाचित_उपमेयर
4,3,4073,3_पक्राउ_किलो_तस्करी_बरामद
...,...,...,...
319,318,5,318_मान्_कार्टुन_होइन’_छोड्
320,319,5,319_एक्लो_त्रसित_चीन_‘नेपाल
321,320,5,320_चरण_अन्तिम_विमानस्थल_बन्दरगाह
322,321,5,321_लकडाउन_हुँदैन’_निस्क_पालना
