# Indexer for Search Engine with Ranking

In [147]:
import os
import string

import bson
import nltk
import numpy as np
import pandas as pd
import pymongo as pm

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package omw-1.4 to /home/suparth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/suparth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/suparth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/suparth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import Websites from json file

In [118]:
# Load the crawl table; reset_index() turns the row labels into an explicit "index" column.
websites = pd.read_json("data/table.json")
df = pd.DataFrame(websites).reset_index()
df.head(3)

Unnamed: 0,index,id,url,status,dispatchToken,contentHashId,createdAt,updatedAt,dispatchAgent,clientEnd,clientStart,serverEnd,serverStart,metadata
0,0,1,https://nepal.gov.np,done,,ba422f11-88b3-43a2-bfe3-147cff6b6f24,2022-12-16T07:52:43.982Z,2022-12-16T07:56:11.957Z,kG_amT2-70kWc2XYAAAB,2022-12-16T07:56:11.535Z,2022-12-16T07:55:55.985Z,2022-12-16T07:56:11.896Z,2022-12-16T07:55:55.982Z,"{'url': 'https://nepal.gov.np', 'title': 'Nepa..."
1,1,2,https://moha.gov.np,done,,6e36b5ec-9419-46de-a4e6-b558beb14eb2,2022-12-16T07:52:44.042Z,2022-12-16T07:58:43.935Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:58:37.772Z,2022-12-16T07:58:12.905Z,2022-12-16T07:58:43.883Z,2022-12-16T07:58:12.903Z,"{'url': 'https://moha.gov.np', 'title': 'गृह म..."
2,2,3,https://p1.gov.np,done,,d15807d0-5043-40b4-9465-18cd0517267f,2022-12-16T07:52:44.132Z,2022-12-16T07:59:18.859Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:59:13.435Z,2022-12-16T07:58:44.037Z,2022-12-16T07:59:18.806Z,2022-12-16T07:58:44.036Z,"{'url': 'https://p1.gov.np', 'title': 'प्रदेश ..."


## Build a dictionary of websites to store in the database, using the following schema

```
string: Website 
Website: {
  url: string,
  outgoingLinks: string[],
  incomingLinks: string[]
}
```

In [119]:
# One record per distinct URL, keyed by URL; the first row seen for a URL wins.
websites_dict = {}

for _, row in df.iterrows():
    url = row['url']
    if url in websites_dict:
        continue

    # dict.fromkeys() de-duplicates while keeping first-seen order, so the stored
    # list is the same on every run (the iteration order of a set of strings can
    # differ between interpreter runs).
    outgoing_links = list(dict.fromkeys(row['metadata']['links']))

    websites_dict[url] = {
        "url": url,
        "outgoingLinks": outgoing_links,
        # Length of the de-duplicated list, so this always equals len(outgoingLinks).
        "outgoingLinksLen": len(outgoing_links),
        # Filled in afterwards, once every URL has a record.
        "incomingLinks": [],
        "incomingLinksLen": 0,
        "contentHashId": row["contentHashId"]
    }

## Assign Incoming Links to the created dictionary

In [120]:
# Register every site as an incoming link on each of its outgoing targets that
# is itself a key of websites_dict; links to unknown URLs are ignored.
for source_url, source_site in websites_dict.items():
    for target_url in source_site["outgoingLinks"]:
        target_site = websites_dict.get(target_url)
        if target_site is None:
            continue
        # Skip sources that are already recorded so re-running the cell adds nothing.
        if source_url in target_site["incomingLinks"]:
            continue
        target_site["incomingLinks"].append(source_url)
        target_site["incomingLinksLen"] += 1

website_pd = pd.DataFrame.from_dict(websites_dict)
website_pd.head(2).T

Unnamed: 0,url,outgoingLinks
https://nepal.gov.np,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...
https://moha.gov.np,https://moha.gov.np,"[https://moha.gov.np/page/main-functions, http..."
https://p1.gov.np,https://p1.gov.np,[https://moh.p1.gov.np/report/quaterly-report/...
http://p2.gov.np,http://p2.gov.np,"[https://madhesh.gov.np/#main-content, https:/..."
http://p3.gov.np,http://p3.gov.np,[]
...,...,...
http://www.ugcnepal.edu.np/,http://www.ugcnepal.edu.np/,"[http://www.ugcnepal.edu.np/tvprograms, http:/..."
http://www.wecs.gov.np/,http://www.wecs.gov.np/,"[http://www.wecs.gov.np/, http://www.wecs.gov...."
http://www.nmc.org.np/,http://www.nmc.org.np/,[https://nmc.org.np/na-pa-l-ma-da-kal-ka-una-s...
http://nhrc.gov.np/,http://nhrc.gov.np/,"[https://nhrc.gov.np/about/executive-board/, h..."


# Create List of Websites from dictionary

In [6]:
# Flatten the URL-keyed dictionary into a plain list of website records.
websites_list = [*websites_dict.values()]

## Save Dictionary Data to MongoDB

### Initialize pymongo

In [2]:
# Connection string for the project database. Set MONGO_URI in the environment to
# use another server or other credentials without editing the notebook; the
# fallback is the local development URI this notebook was written against.
mongouri = os.environ.get(
    "MONGO_URI",
    "mongodb://root:prisma@localhost:27017/db_seven_sem_prj?authSource=admin",
)
client = pm.MongoClient(mongouri)
# Called without a name, get_database() returns the database named in the URI.
database = client.get_database()
print(database)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'db_seven_sem_prj')


In [3]:
# Handle to the "websites" collection of the project database.
website_collection = database['websites']

In [5]:
# Bulk-insert the website records; websites_list must have been built in this kernel session.
website_collection.insert_many(websites_list)

NameError: name 'websites_list' is not defined

In [5]:
# Read every stored website document back into memory and report how many there are.
websites_in_db = list(website_collection.find())
print(len(websites_in_db))

5277


# Export Websites from Database to JSON

In [54]:
import json
from bson.json_util import dumps

def to_json(data, fileName):
    """Write ``data`` to ``data/generated/<fileName>.json`` as UTF-8 JSON.

    Serialisation uses ``dumps`` from ``bson.json_util`` with
    ``ensure_ascii=False``, so non-ASCII text is written as-is instead of
    as ``\\u`` escapes.

    Args:
        data: The object to serialise; passed unchanged to ``dumps``.
        fileName: Output file name without the ``.json`` extension.
    """
    # Serialise before opening the file: if dumps() fails, an existing export
    # is left untouched instead of being truncated to an empty file.
    jsonData = dumps(data, ensure_ascii=False)

    path = f"data/generated/{fileName}.json"
    # Create the output directory on first use.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # The with-block closes the file; no explicit close() is needed.
    with open(path, "w", encoding="utf-8") as f:
        f.write(jsonData)

In [37]:
# Export the website documents read from the database to data/generated/websites.json.
to_json(websites_in_db, "websites")
print("Done")

Done


# Text Processing

## List of All Stop Words in English and Nepali Language

In [6]:
nepaliStopWords = ["अझै","अधिक","अन्य","अन्यत्र","अन्यथा","अब","अरु","अरुलाई","अर्को","अर्थात","अर्थात्","अलग","आए","आजको","आत्म","आदि","आफू","आफूलाई","आफै","आफैलाई","आफैले","आफ्नै","आफ्नो","आयो","उनको","उनले","उनि","उनी","उनीहरु","उप","उसलाई","उस्तै","उहाँ","उहालाई","ऊ","एउटै","एक","एकदम","ओठ","औं","कतै","कसरी","कसै","कसैले","कस्तो","कहाँ","कहाँबाट","कहिले","कहिलेकाहीं","का","कि","किन","किनभने","कुनै","कुरा","कृपया","के","केवल","केहि","केही","को","कोही","गए","गयौ","गर","गरि","गरी","गरे","गरेका","गरेको","गरेर","गरौं","गर्छ","गर्छु","गर्दछ","गर्दै","गर्न","गर्नु","गर्नुपर्छ","गर्ने","गर्नेछन्","गर्नेछौ","गैर","चार","चाले","चाहनुहुन्छ","चाहन्छु","चाहन्छौ","चाहन्छौं","चाहन्थे","चाहिए","छ","छन्","छु","छू","छैन","छौं","जब","जबकि","जसको","जसबाट","जसमा","जसलाई","जसले","जस्तै","जस्तो","जहाँ","जान","जाहिर","जुन","जे","जो","ठीक","त","तत्काल","तथा","तदनुसार","तपाई","तपाईं","तपाईको","तर","तल","तापनी","तिनिहरुलाई","तिनी","तिनीहरुको","तिनीहरू","तिनीहरूको","तिमि","तिमी","तिमीसँग","तिम्रो","तिर","ती","तीन","तुरुन्तै","तेस्कारण","तेस्रो","त्यसपछि","त्यहाँ","त्यो","त्सपछि","त्सैले","थप","थिए","थिएन","थिएनन्","थियो","दिए","दिनुभएको","दिनुहुन्छ","दुई","दुबै","देखि","देखिन्छ","देखियो","देखे","देखेको","देखेर","दोस्रो","द्वारा","धेरै","न","नगर्नुहोस्","नजिकै","नत्र","नयाँ","नि","निम्ति","निम्न","नै","नौ","पक्का","पक्कै","पछि","पछिल्लो","पटक","पनि","पर्छ","पर्थ्यो","पर्याप्त","पहिले","पहिलो","पहिल्यै","पाँच","पाँचौं","पूर्व","प्रति","प्रतेक","प्रत्येक","प्लस","फेरि","फेरी","बने","बन्द","बरु","बाट","बारे","बारेमा","बाहिर","बाहेक","बिरुद्ध","बिशेष","बीच","बीचमा","भए","भएको","भन","भने","भन्","भन्छन्","भन्छु","भन्दा","भन्नुभयो","भन्ने","भर","भित्र","भित्री","म","मँ","मलाई","मा","मात्र","माथि","मार्फत","मुख्य","मेरो","मैले","यति","यथोचित","यदि","यद्यपि","यस","यसको","यसपछि","यसबाहेक","यसरी","यसैले","यसो","यस्तो","यहाँ","यहाँसम्म","या","यी","यो","र","रही","रहेका","रहेको","राखे","राख्छ","राम्रो","रूप","लगभग","लाई","लागि","ले","वरीपरी","वा","वास्तवमा","विरुद्ध","शायद","सकदिन","सकिएन","सक्छ","सक्दैन
","संग","संगै","सट्टा","सधै","सबै","सबैलाई","समय","समयमा","सम्भव","सम्म","सही","साँच्चै","सात","साथ","साथै","सायद","सारा","सो","सोही","स्पष्ट","हरे","हरेक","हामी","हामीसँग","हाम्रो","हुँ","हुँदैन","हुन","हुनु","हुनुहुन्छ","हुने","हुनेछ","हुनेछु","हुन्","हुन्छ","हुन्थे","हो","होइन","हौंअझै","अधिक","अन्य","अन्यत्र","अन्यथा","अब","अरु","अरुलाई","अर्को","अर्थात","अर्थात्","अलग","आए","आजको","आत्म","आदि","आफू","आफूलाई","आफै","आफैलाई","आफैले","आफ्नै","आफ्नो","आयो","उनको","उनले","उनि","उनी","उनीहरु","उप","उसलाई","उस्तै","उहाँ","उहालाई","ऊ","एउटै","एक","एकदम","ओठ","औं","कतै","कसरी","कसै","कसैले","कस्तो","कहाँ","कहाँबाट","कहिले","कहिलेकाहीं","का","कि","किन","किनभने","कुनै","कुरा","कृपया","के","केवल","केहि","केही","को","कोही","गए","गयौ","गर","गरि","गरी","गरे","गरेका","गरेको","गरेर","गरौं","गर्छ","गर्छु","गर्दछ","गर्दै","गर्न","गर्नु","गर्नुपर्छ","गर्ने","गर्नेछन्","गर्नेछौ","गैर","चार","चाले","चाहनुहुन्छ","चाहन्छु","चाहन्छौ","चाहन्छौं","चाहन्थे","चाहिए","छ","छन्","छु","छू","छैन","छौं","जब","जबकि","जसको","जसबाट","जसमा","जसलाई","जसले","जस्तै","जस्तो","जहाँ","जान","जाहिर","जुन","जे","जो","ठीक","त","तत्काल","तथा","तदनुसार","तपाई","तपाईं","तपाईको","तर","तल","तापनी","तिनिहरुलाई","तिनी","तिनीहरुको","तिनीहरू","तिनीहरूको","तिमि","तिमी","तिमीसँग","तिम्रो","तिर","ती","तीन","तुरुन्तै","तेस्कारण","तेस्रो","त्यसपछि","त्यहाँ","त्यो","त्सपछि","त्सैले","थप","थिए","थिएन","थिएनन्","थियो","दिए","दिनुभएको","दिनुहुन्छ","दुई","दुबै","देखि","देखिन्छ","देखियो","देखे","देखेको","देखेर","दोस्रो","द्वारा","धेरै","न","नगर्नुहोस्","नजिकै","नत्र","नयाँ","नि","निम्ति","निम्न","नै","नौ","पक्का","पक्कै","पछि","पछिल्लो","पटक","पनि","पर्छ","पर्थ्यो","पर्याप्त","पहिले","पहिलो","पहिल्यै","पाँच","पाँचौं","पूर्व","प्रति","प्रतेक","प्रत्येक","प्लस","फेरि","फेरी","बने","बन्द","बरु","बाट","बारे","बारेमा","बाहिर","बाहेक","बिरुद्ध","बिशेष","बीच","बीचमा","भए","भएको","भन","भने","भन्","भन्छन्","भन्छु","भन्दा","भन्नुभयो","भन्ने","भर","भित्र","भित्री","म","मँ","मलाई","मा","मात्र","माथि","मार्फत","मुख्य","मेरो","मैले","यति","यथ
ोचित","यदि","यद्यपि","यस","यसको","यसपछि","यसबाहेक","यसरी","यसैले","यसो","यस्तो","यहाँ","यहाँसम्म","या","यी","यो","र","रही","रहेका","रहेको","राखे","राख्छ","राम्रो","रूप","लगभग","लाई","लागि","ले","वरीपरी","वा","वास्तवमा","विरुद्ध","शायद","सकदिन","सकिएन","सक्छ","सक्दैन","संग","संगै","सट्टा","सधै","सबै","सबैलाई","समय","समयमा","सम्भव","सम्म","सही","साँच्चै","सात","साथ","साथै","सायद","सारा","सो","सोही","स्पष्ट","हरे","हरेक","हामी","हामीसँग","हाम्रो","हुँ","हुँदैन","हुन","हुनु","हुनुहुन्छ","हुने","हुनेछ","हुनेछु","हुन्","हुन्छ","हुन्थे","हो","होइन","हौं"]


# Create Dataframe for website list and append tokens column

In [7]:
website_list_df = pd.DataFrame(websites_in_db)
# Add an empty-string "tokens" column as the first column (insert works in place).
website_list_df.insert(loc=0, column="tokens", value="")
website_list_df.head()


Unnamed: 0,tokens,_id,url,outgoingLinks,outgoingLinksLen,incomingLinks,incomingLinksLen,contentHashId
0,,63da9e91df832131f59f8727,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...,3,[],0,ba422f11-88b3-43a2-bfe3-147cff6b6f24
1,,63da9e91df832131f59f8728,https://moha.gov.np,"[https://moha.gov.np/, https://moha.gov.np/gal...",114,[],0,6e36b5ec-9419-46de-a4e6-b558beb14eb2
2,,63da9e91df832131f59f8729,https://p1.gov.np,"[https://p1.gov.np/detail/sewa-prava, https://...",100,[],0,d15807d0-5043-40b4-9465-18cd0517267f
3,,63da9e91df832131f59f872a,http://p2.gov.np,"[https://madhesh.gov.np/node/155, https://madh...",26,[],0,897de6d0-342a-4ed1-9b36-3265340e1705
4,,63da9e91df832131f59f872b,http://p3.gov.np,[],0,[],0,2015b910-06c8-428d-a35f-c6f3427401b3


## Stop Word Removal

In [8]:
def textPreProcessing(text):
    """Turn raw text into a list of normalised tokens.

    Steps, in order: delete punctuation characters, tokenize with NLTK's
    ``word_tokenize``, drop English and Nepali stop words (tokens are
    lower-cased for the comparison), apply the English Snowball stemmer,
    then the WordNet lemmatizer.

    Args:
        text: The text to process, as one string.

    Returns:
        The processed tokens as a list of strings, in their original order.
    """
    # ASCII punctuation plus the Devanagari danda. The characters are deleted,
    # not replaced by spaces.
    charsToStrip = string.punctuation + "।!?"
    stripTable = str.maketrans("", "", charsToStrip)

    stopWordsAll = set(stopwords.words("english")) | set(nepaliStopWords)

    stemmer = SnowballStemmer("english")
    wordnet = WordNetLemmatizer()

    words = word_tokenize(text.translate(stripTable))
    return [
        wordnet.lemmatize(stemmer.stem(word))
        for word in words
        if word.lower() not in stopWordsAll
    ]
    

In [9]:

# For each website, read the crawled text stored under its contentHashId, run the
# text pre-processing on it and collect one {"token", "website"} record per token.
allTokens = []
for websiteId, folderName in zip(website_list_df["_id"], website_list_df["contentHashId"]):
    # Decode explicitly as UTF-8 rather than with the platform default encoding,
    # which is not UTF-8 everywhere. Assumes the crawler wrote UTF-8 - confirm.
    with open(f'data/{folderName}/{folderName}.txt', encoding="utf-8") as txtFile:
        # Flatten the file to a single line by turning newlines into spaces.
        txtInSingleStr = txtFile.read().replace("\n", " ")

    for token in textPreProcessing(txtInSingleStr):
        allTokens.append({
            "token": token,
            "website": websiteId
        })
print("Done")

Done


# Create Dataframe from All Tokens

In [10]:
# One row per token occurrence, with columns "token" and "website".
tokens_df = pd.DataFrame(allTokens)

In [11]:
# Total number of token occurrences across all websites.
print(len(tokens_df))

1821292


# New Dataframe by combining websites

In `tokens_df`, the same token appears in many rows, once for every occurrence on every website. We can shrink the dataframe by grouping the rows by token and collecting the websites of each token into a list.

In [106]:
# Keep one row per token (its first occurrence) BEFORE merging, so the merge handles
# one row per distinct token instead of one row per token occurrence.
first_occurrences = tokens_df.drop_duplicates(subset="token", keep="first")
# Every website id each token was seen on, in occurrence order (duplicates included).
websites_per_token = tokens_df.groupby('token').website.apply(list).reset_index()
# Both frames have a "website" column, so the merge produces website_x (the id from
# the first occurrence) and website_y (the list of ids).
tokens_ws_comb = pd.merge(first_occurrences, websites_per_token, on='token', how='left')
# The merge returns a fresh 0..n-1 index; restore the first-occurrence row labels.
tokens_ws_comb.index = first_occurrences.index


In [107]:
# Number of distinct tokens, followed by a preview of the combined frame.
print(len(tokens_ws_comb))
tokens_ws_comb.head()

62427


Unnamed: 0,token,website_x,website_y
0,नेपाल,63da9e91df832131f59f8727,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
1,सरकारको,63da9e91df832131f59f8727,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
2,आधिकारिक,63da9e91df832131f59f8727,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
3,पोर्टल,63da9e91df832131f59f8727,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
4,offici,63da9e91df832131f59f8727,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."


# Clean new dataframe data
Remove `website_x` and rename `website_y` to `websites`.

In [108]:
# website_x only holds the id from each token's first occurrence; the full list is in website_y.
tokens_ws_comb = tokens_ws_comb.drop(columns=['website_x'])

In [109]:
# Give the list column its final name.
tokens_ws_comb = tokens_ws_comb.rename(columns={'website_y': 'websites'})
tokens_ws_comb.head()

Unnamed: 0,token,websites
0,नेपाल,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
1,सरकारको,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
2,आधिकारिक,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
3,पोर्टल,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."
4,offici,"[63da9e91df832131f59f8727, 63da9e91df832131f59..."


# Remove Duplicate Website Ids from Website Column of Table for each token

In [110]:
def remove_duplicates(row):
    """Return the row's website ids as unique strings, in first-seen order.

    Args:
        row: A mapping or pandas row with a ``'websites'`` entry that holds an
            iterable of ids; each id is converted with ``str()``.

    Returns:
        A list of unique id strings, or ``np.nan`` when ``row['websites']`` is
        not iterable (for example a missing value), in which case the row is
        also printed.

    Raises:
        KeyError: If the row has no ``'websites'`` entry.
    """
    websites = row['websites']
    try:
        # dict.fromkeys() keeps the first occurrence of each key, in order.
        return list(dict.fromkeys(str(oid) for oid in websites))
    except TypeError:
        # Only the "not iterable" case is tolerated; other errors propagate.
        print(f"Error with row: {row}")
        return np.nan


# Keep the id list of one token as it is before the clean-up, so the effect of
# the de-duplication can be checked afterwards.
is_nepal = tokens_ws_comb['token'] == "नेपाल"
lst_w_dup_np = tokens_ws_comb.loc[is_nepal, "websites"].tolist()[0]
tokens_ws_comb['websites'] = tokens_ws_comb.apply(remove_duplicates, axis=1)
print("Done")

Done


## Check for one token.

In [111]:
# Compare the size of the id list for one token before and after the clean-up.
lst_wo_dp_np = tokens_ws_comb.loc[tokens_ws_comb['token'] == "नेपाल", "websites"].tolist()[0]
print(f"Before Duplicate ID Removal for नेपाल, size: {len(lst_w_dup_np)}")
print(f"After Duplicate ID Removal for नेपाल, size: {len(lst_wo_dp_np)}")

Before Duplicate ID Removal for नेपाल, size: 11031
After Duplicate ID Removal for नेपाल, size: 3256


# Save Dataframe to JSON

In [112]:
# One {"token": ..., "websites": [...]} dict per row.
token_list = tokens_ws_comb.to_dict(orient="records")

In [113]:
# Export the token records to data/generated/tokens_websites.json.
to_json(token_list, "tokens_websites")
print("Done")

Done


# Load data from JSON to simplify calculations

In [137]:
# Reload the exported token table from disk.
load_tokens = pd.read_json("data/generated/tokens_websites.json")


def _to_object_ids(id_strings):
    """Convert a list of id strings into a list of bson.ObjectId instances."""
    return [bson.ObjectId(id_string) for id_string in id_strings]


# The export stores website ids as strings; turn them back into ObjectIds.
load_tokens["websites"] = load_tokens["websites"].apply(_to_object_ids)


In [138]:
# Confirming the conversion
temp = load_tokens[load_tokens["token"]=="नेपाल"]
temp_ws = list(temp["websites"])[0]
temp_ws[0:3]

[ObjectId('63da9e91df832131f59f8727'),
 ObjectId('63da9e91df832131f59f8728'),
 ObjectId('63da9e91df832131f59f872f')]

# Save Tokens to Database and join websites with the ids in websites column

In [149]:
tokens_collection = database['tokens']
# One document per token: {"token": ..., "websites": [ObjectId, ...]}.
tokens_to_save = load_tokens.to_dict("records")
tokens_collection.insert_many(tokens_to_save)
# Index the website ids stored in each token's "websites" array.
tokens_collection.create_index([("websites", pm.ASCENDING)])
# Index the token text as well: the lookup query in this notebook matches on
# "token", which would otherwise scan the whole collection.
tokens_collection.create_index([("token", pm.ASCENDING)])

print("Done")

Done


# Get All Tokens

In [150]:
# Fetch every token document; list() drains the cursor into memory.
tokens_in_db = tokens_collection.find()
tokens_list_db = list(tokens_in_db)
len(tokens_list_db)

62427

# Get Single Token and List all websites it has

In [181]:
page_number = 1 #26119481

# Stage 1: select the document of the token being looked up.
match_token = {"$match": {"token": "नेपाल"}}

# Stage 2: replace the array of website ids with the matching documents
# from the "websites" collection.
join_websites = {
    "$lookup": {
        "from": "websites",
        "localField": "websites",
        "foreignField": "_id",
        "as": "websites",
    }
}

# Stage 3: keep the token and, for each joined website, only its url.
project_urls = {"$project": {"token": 1, "websites.url": 1}}

single_token_db_cursor = tokens_collection.aggregate(
    [match_token, join_websites, project_urls]
)
# The first (and expected only) result document for the token.
single_token_db = single_token_db_cursor.next()
print(single_token_db["websites"])

[{'url': 'https://nepal.gov.np'}, {'url': 'https://moha.gov.np'}, {'url': 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=home'}, {'url': 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=business'}, {'url': 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=citizen'}, {'url': 'https://moha.gov.np/contact'}, {'url': 'https://moha.gov.np/complaint'}, {'url': 'https://moha.gov.np/office-directory'}, {'url': 'https://moha.gov.np/directory'}, {'url': 'https://moha.gov.np/#'}, {'url': 'https://moha.gov.np/switch-bandwidth'}, {'url': 'https://moha.gov.np/'}, {'url': 'https://moha.gov.np/offices'}, {'url': 'https://moha.gov.np/page/introduction'}, {'url': 'https://moha.gov.np/page/main-functions'}, {'url': 'https://moha.gov.np/office-layout'}, {'url': 'https://moha.gov.np/page/resources'}, {'url': 'https://moha.gov.np/page/act-regulation'}, {'url': 'https://moha.gov.np/page/directive-framework'}, {'url': 'https://moha.gov.np/page/criteria'}, {'url': 'https://moha.gov.