In [None]:
!pip install uvicorn gunicorn fastapi pydantic bertopic fasttext nepalitokenizer snowballstemmer pyngrok pymongo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting uvicorn
  Downloading uvicorn-0.22.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gunicorn
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi
  Downloading fastapi-0.95.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Print the Python version of the runtime this notebook is executed on.
!python --version

Python 3.9.16


In [None]:
# Mounting Google Drive for fasttext file
from google.colab import drive

import pandas as pd

drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive

Mounted at /content/gdrive/
/content/gdrive/MyDrive


In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
import fasttext
import numpy as np
from bertopic.backend import BaseEmbedder
from bertopic import BERTopic
from nepalitokenizer import NepaliTokenizer
import snowballstemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from pyngrok import ngrok
import nest_asyncio
import uvicorn
import requests
import pandas as pd
import pymongo
from fastapi.responses import JSONResponse

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

import os

app = FastAPI()

# NOTE(review): wildcard origins together with allow_credentials=True is discouraged
# by the FastAPI/Starlette CORS documentation — list explicit origins before exposing
# this service beyond a demo.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# MongoDB: connection string comes from the environment when provided; the literal
# default is kept so existing setups that edit it in place keep working.
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodburl")
myclient = pymongo.MongoClient(MONGODB_URL)
mydb = myclient["major-project"]
mycol = mydb["news_np_lb_pred"]

# Scrapy Cloud (Zyte) items API. The API key must never live in the notebook itself,
# so it is read from the environment.
# NOTE(review): any key that was ever committed with this notebook should be treated
# as compromised and rotated.
ZYTE_API_KEY = os.environ.get("ZYTE_API_KEY")
if not ZYTE_API_KEY:
    raise RuntimeError(
        "Set the ZYTE_API_KEY environment variable before running this cell."
    )

ZYTE_ITEMS_URL = (
    "https://app.zyte.com/api/items.json"
    "?project=649148&spider={spider}&include_headers=1&apikey={apikey}"
)
ok_url = ZYTE_ITEMS_URL.format(spider="ok_latest", apikey=ZYTE_API_KEY)
ap_url = ZYTE_ITEMS_URL.format(spider="ap_latest", apikey=ZYTE_API_KEY)

# Preprocessing resources for the fetched news titles: the NLTK Nepali stop-word
# list and the Snowball Nepali stemmer.
stopword = stopwords.words('nepali')
stemmer = snowballstemmer.stemmer('nepali')

def nepali_tokenizer(text):
    """Tokenize ``text`` with the module-level ``tokenize`` object.

    Thin wrapper: forwards ``text`` to ``tokenize.tokenizer`` and hands back
    whatever that call returns, unchanged.
    """
    return tokenize.tokenizer(text)

# Shared NepaliTokenizer instance; the preprocessing code calls its `.tokenizer(text)` method.
tokenize = NepaliTokenizer()

## Creating custom embedder using fasttext
class CustomEmbedder(BaseEmbedder):
    """BERTopic embedding backend that maps each document to one sentence vector.

    Args:
        embedding_model: object exposing ``get_sentence_vector(text)``, e.g. a
            loaded fastText model.
    """

    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        """Embed every document and stack the vectors row-wise.

        Args:
            documents: iterable of strings (a list or a pandas Series both work,
                since the documents are only iterated).
            verbose: accepted to match the ``BaseEmbedder.embed`` signature; unused.

        Returns:
            ``np.ndarray`` of dtype float64 with one row per document. The width is
            whatever the embedding model returns, so no dimension is hard-coded.
        """
        sentence_vectors = []
        for sentence in documents:
            # fastText's get_sentence_vector rejects text that contains a newline
            # (it processes one line at a time), so fold line breaks into spaces.
            single_line = sentence.replace("\n", " ")
            vector = self.embedding_model.get_sentence_vector(single_line)
            # float64 keeps the dtype callers of this method have been receiving.
            sentence_vectors.append(np.asarray(vector, dtype=np.float64).ravel())

        return np.array(sentence_vectors)

# Create custom backend: load the fastText binary from the mounted Drive and wrap it
# in CustomEmbedder so it can be used to embed documents.
ft_ne = fasttext.load_model("/content/gdrive/MyDrive/major-project/cc.ne.300.bin")
custom_embedder = CustomEmbedder(embedding_model=ft_ne)

# Load the previously trained BERTopic model from Drive.
# NOTE(review): BERTopic 0.14 presumably deserialises this file via joblib/pickle —
# confirm, and only load artefacts from a trusted source.
topic_model = BERTopic.load("/content/gdrive/MyDrive/major-project/pro_first_2k_data")


# Topic table fetched once at startup; the /predict handler reads its "Topic" and
# "Name" columns to turn predicted topic ids into names.
topic_info = topic_model.get_topic_info()

# Request body of POST /predict: a JSON object with a single "documents" field.
class NewsItem(BaseModel):
    # Texts to classify, one string per document.
    documents: List[str]



@app.post('/predict')
async def predict_endpoint(item:NewsItem):
    """Predict a topic name for every document in the request.

    Each document is tokenized, stemmed and stripped of stop words, embedded with
    ``custom_embedder`` and passed to ``topic_model.transform``.

    Args:
        item: request body carrying ``documents``, the texts to classify.

    Returns:
        dict mapping each original document to the "Name" of its predicted topic.
        Repeated documents collapse into a single key. An empty ``documents`` list
        yields an empty dict without calling the model.
    """
    if not item.documents:
        return {}

    # Same preprocessing chain for every document: tokenize -> stem -> drop stop words.
    cleaned_docs = []
    for document in item.documents:
        stems = stemmer.stemWords(tokenize.tokenizer(document))
        cleaned_docs.append(" ".join(w for w in stems if w not in stopword))

    # Embeddings are computed here and handed to BERTopic alongside the texts.
    doc_embeddings = custom_embedder.embed(cleaned_docs)
    predicted_topics, _ = topic_model.transform(cleaned_docs, doc_embeddings)

    # Build the id -> name lookup once and read names straight from the column,
    # instead of filtering the DataFrame per document and going through
    # Series.to_string formatting.
    name_by_topic = dict(
        zip(topic_info["Topic"].tolist(), topic_info["Name"].tolist())
    )
    names = [name_by_topic.get(int(label), "Unknown") for label in predicted_topics]

    return dict(zip(item.documents, names))

@app.get('/predictandpost')
async def predict_post():
    """Fetch the latest scraped headlines, label the unseen ones and store them.

    Steps:
      1. Download the items of both spiders (``ap_url`` and ``ok_url``).
      2. Drop every item whose ``link`` is already stored in ``mycol``.
      3. Tokenize, stem and remove stop words from the remaining headlines.
      4. Predict a topic per headline and attach ``topic_label`` / ``topic_name``.
      5. Insert the new rows into ``mycol``.

    Returns:
        JSONResponse with a ``message`` telling whether rows were inserted, nothing
        new was found, or the insert was not acknowledged.

    Raises:
        requests.HTTPError / requests.Timeout: when a spider endpoint answers with
        an error status or does not respond in time.
    """
    # Local import keeps this block self-contained; StringIO avoids handing a
    # literal JSON string to read_json, which newer pandas versions deprecate.
    from io import StringIO

    # NOTE(review): requests and the model calls below are blocking and run inside
    # an async handler, so the event loop is occupied for the whole request.
    frames = []
    for url in (ap_url, ok_url):
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        frames.append(pd.read_json(StringIO(response.text), orient="records"))
    news_text_df = pd.concat(frames, ignore_index=True)

    if news_text_df.empty or "headline" not in news_text_df.columns:
        return JSONResponse(content={"message": "No new data to insert"})

    # Filter out already-stored links first, so the embedding and topic prediction
    # below only run on headlines that will actually be inserted.
    stored = mycol.find({"_type":"NewscrawlerItem"},{"link":1,"_id":0})
    known_links = [doc["link"] for doc in stored if "link" in doc]
    if known_links and "link" in news_text_df.columns:
        news_text_df = news_text_df[~news_text_df["link"].isin(known_links)]
    news_text_df = news_text_df.reset_index(drop=True)

    if news_text_df.empty:
        return JSONResponse(content={"message": "No new data to insert"})

    # Preprocess headlines: tokenize -> stem -> drop stop words -> re-join.
    stems = news_text_df["headline"].apply(tokenize.tokenizer).apply(stemmer.stemWords)
    news_text_df["headline_stemmed"] = stems.apply(
        lambda words: " ".join([w for w in words if w not in stopword])
    )

    cleaned_headlines = news_text_df["headline_stemmed"].tolist()
    headline_embeddings = custom_embedder.embed(cleaned_headlines)
    predicted_topics, _ = topic_model.transform(cleaned_headlines, headline_embeddings)
    news_text_df["topic_label"] = predicted_topics

    # Attach the human-readable topic name for each predicted label.
    topic_table = topic_model.get_topic_info()
    name_by_topic = dict(
        zip(topic_table["Topic"].tolist(), topic_table["Name"].tolist())
    )
    news_text_df["topic_name"] = news_text_df["topic_label"].map(name_by_topic)

    # Insert the new rows into the MongoDB collection.
    res = mycol.insert_many(news_text_df.to_dict(orient='records'))

    if res.acknowledged:
        return JSONResponse(content={"message": "Insertion successful."})
    return JSONResponse(content={"message": "Insertion failed."})






In [None]:
# Open an ngrok tunnel to local port 8000 and print the public URL clients should use.
ngrok_tunnel = ngrok.connect(8000)
print("Public URL: ", ngrok_tunnel.public_url)
# Presumably required because the notebook kernel already runs an asyncio event loop,
# and uvicorn needs to start its own loop inside it — TODO confirm.
nest_asyncio.apply()
# Serves `app` and blocks this cell until interrupted; the port must match the tunnel above.
uvicorn.run(app,port=8000)



Public URL:  https://cb6b-34-73-229-95.ngrok.io


INFO:     Started server process [183]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     120.89.104.44:0 - "GET /predictandpost HTTP/1.1" 200 OK
INFO:     120.89.104.44:0 - "GET /predictandpost HTTP/1.1" 200 OK
