# Indexer for Search Engine with Ranking

In [1]:
import pandas as pd 
import string 
import numpy as np 
import nltk
import bson
import pymongo as pm
from itertools import chain
import math 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# already downloaded
# nltk.download('omw-1.4')
# nltk.download()("punkt")
# nltk.download()("stopwords")
# nltk.download()("wordnet")

## Import Websites from json file

In [2]:
websites = pd.read_json("data/table.json")
df = pd.DataFrame(websites)
df = df.reset_index()
df.head(3)

Unnamed: 0,index,id,url,status,dispatchToken,contentHashId,createdAt,updatedAt,dispatchAgent,clientEnd,clientStart,serverEnd,serverStart,metadata
0,0,1,https://nepal.gov.np,done,,ba422f11-88b3-43a2-bfe3-147cff6b6f24,2022-12-16T07:52:43.982Z,2022-12-16T07:56:11.957Z,kG_amT2-70kWc2XYAAAB,2022-12-16T07:56:11.535Z,2022-12-16T07:55:55.985Z,2022-12-16T07:56:11.896Z,2022-12-16T07:55:55.982Z,"{'url': 'https://nepal.gov.np', 'title': 'Nepa..."
1,1,2,https://moha.gov.np,done,,6e36b5ec-9419-46de-a4e6-b558beb14eb2,2022-12-16T07:52:44.042Z,2022-12-16T07:58:43.935Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:58:37.772Z,2022-12-16T07:58:12.905Z,2022-12-16T07:58:43.883Z,2022-12-16T07:58:12.903Z,"{'url': 'https://moha.gov.np', 'title': 'गृह म..."
2,2,3,https://p1.gov.np,done,,d15807d0-5043-40b4-9465-18cd0517267f,2022-12-16T07:52:44.132Z,2022-12-16T07:59:18.859Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:59:13.435Z,2022-12-16T07:58:44.037Z,2022-12-16T07:59:18.806Z,2022-12-16T07:58:44.036Z,"{'url': 'https://p1.gov.np', 'title': 'प्रदेश ..."


## Create new Dictionary to store website on database with following schema

```
string: Website 
Website: {
  url: string,
  outgoingLinks: string[],
  incomingLinks: string[]
}
```

In [3]:
websites_dict = {}

for index, row in df.iterrows():
    url = row['url']
    if websites_dict.get(url) is None:
        websites_dict[url] = {
            "url": url,
            "outgoingLinks": list(set(row['metadata']['links'])),
            "outgoingLinksLen":len(row['metadata']['links']),
            "incomingLinks": [],
            "incomingLinksLen": 0,
            "contentHashId":row["contentHashId"]
        }

## Assign Incoming Links to the created dictionary

In [4]:
for website in websites_dict:
    links = websites_dict[website]["outgoingLinks"]
    for link in links:
        linkInDict = websites_dict.get(link)
        if linkInDict is not None:
            if not website in websites_dict[link]["incomingLinks"]: 
                websites_dict[link]["incomingLinks"].append(website)
                websites_dict[link]["incomingLinksLen"] = websites_dict[link]["incomingLinksLen"] + 1
            
            

website_pd = pd.DataFrame.from_dict(websites_dict)
website_pd.head(2).T

Unnamed: 0,url,outgoingLinks
https://nepal.gov.np,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...
https://moha.gov.np,https://moha.gov.np,"[https://moha.gov.np/post/va-ja-niapa-ta-8, ht..."
https://p1.gov.np,https://p1.gov.np,[https://mofelc.p1.gov.np/document/nirdeshika/...
http://p2.gov.np,http://p2.gov.np,"[http://pga.p2.gov.np/, http://moitfe.p2.gov.n..."
http://p3.gov.np,http://p3.gov.np,[]
...,...,...
http://www.ugcnepal.edu.np/,http://www.ugcnepal.edu.np/,"[http://www.ugcnepal.edu.np/division/43, http:..."
http://www.wecs.gov.np/,http://www.wecs.gov.np/,[http://www.wecs.gov.np/pages/organization-cha...
http://www.nmc.org.np/,http://www.nmc.org.np/,"[https://nmc.org.np/searchPractitioner, https:..."
http://nhrc.gov.np/,http://nhrc.gov.np/,"[https://nhrc.gov.np/#greyscale, https://nhrc...."


# Create List of Websites from dictionary

In [5]:
websites_list = list(websites_dict.values())

## Save Dictionary Data to MongoDB

### Initialize pymongo

In [6]:
mongouri = "mongodb://root:prisma@localhost:27017/db_seven_sem_prj?authSource=admin"
client = pm.MongoClient(mongouri)
database = client.get_database()
print(database)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'db_seven_sem_prj')


In [10]:
website_collection = database['websites']

In [11]:
website_collection.insert_many(websites_list)

<pymongo.results.InsertManyResult at 0x7f8d7c1f15c0>

In [12]:
websites_in_db = list(website_collection.find())
print(len(websites_in_db))
print(list(websites_in_db)[0])

5277
{'_id': ObjectId('640230722fde703077dc0b0e'), 'url': 'https://nepal.gov.np', 'outgoingLinks': ['https://nepal.gov.np:8443/NationalPortal/NP?splashAction=home', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=business', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=citizen'], 'outgoingLinksLen': 3, 'incomingLinks': [], 'incomingLinksLen': 0, 'contentHashId': 'ba422f11-88b3-43a2-bfe3-147cff6b6f24'}


# Export Websites from Database to JSON

In [13]:
import json
from bson.json_util import dumps

def to_json(data, fileName):
    with open(f"data/generated/{fileName}.json", "w", encoding="utf-8") as f:
        jsonData = dumps(data, ensure_ascii=False)
        f.write(jsonData)
        f.close()

In [14]:
to_json(websites_in_db, "websites")
print("Done")

Done


# Text Processing

# Create Dataframe for website list and append tokens column

In [15]:
website_list_df = pd.DataFrame(websites_in_db)
# create new column in dataframe
website_list_df.insert(0,"tokens", "")
website_list_df.head()

Unnamed: 0,tokens,_id,url,outgoingLinks,outgoingLinksLen,incomingLinks,incomingLinksLen,contentHashId
0,,640230722fde703077dc0b0e,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...,3,[],0,ba422f11-88b3-43a2-bfe3-147cff6b6f24
1,,640230722fde703077dc0b0f,https://moha.gov.np,"[https://moha.gov.np/post/va-ja-niapa-ta-8, ht...",114,[],0,6e36b5ec-9419-46de-a4e6-b558beb14eb2
2,,640230722fde703077dc0b10,https://p1.gov.np,[https://mofelc.p1.gov.np/document/nirdeshika/...,100,[],0,d15807d0-5043-40b4-9465-18cd0517267f
3,,640230722fde703077dc0b11,http://p2.gov.np,"[http://pga.p2.gov.np/, http://moitfe.p2.gov.n...",26,[],0,897de6d0-342a-4ed1-9b36-3265340e1705
4,,640230722fde703077dc0b12,http://p3.gov.np,[],0,[],0,2015b910-06c8-428d-a35f-c6f3427401b3


## Text Preprocessing

In [16]:
def textPreProcessing(text):
    nepaliStopWords = ["अझै","अधिक","अन्य","अन्यत्र","अन्यथा","अब","अरु","अरुलाई","अर्को","अर्थात","अर्थात्","अलग","आए","आजको","आत्म","आदि","आफू","आफूलाई","आफै","आफैलाई","आफैले","आफ्नै","आफ्नो","आयो","उनको","उनले","उनि","उनी","उनीहरु","उप","उसलाई","उस्तै","उहाँ","उहालाई","ऊ","एउटै","एक","एकदम","ओठ","औं","कतै","कसरी","कसै","कसैले","कस्तो","कहाँ","कहाँबाट","कहिले","कहिलेकाहीं","का","कि","किन","किनभने","कुनै","कुरा","कृपया","के","केवल","केहि","केही","को","कोही","गए","गयौ","गर","गरि","गरी","गरे","गरेका","गरेको","गरेर","गरौं","गर्छ","गर्छु","गर्दछ","गर्दै","गर्न","गर्नु","गर्नुपर्छ","गर्ने","गर्नेछन्","गर्नेछौ","गैर","चार","चाले","चाहनुहुन्छ","चाहन्छु","चाहन्छौ","चाहन्छौं","चाहन्थे","चाहिए","छ","छन्","छु","छू","छैन","छौं","जब","जबकि","जसको","जसबाट","जसमा","जसलाई","जसले","जस्तै","जस्तो","जहाँ","जान","जाहिर","जुन","जे","जो","ठीक","त","तत्काल","तथा","तदनुसार","तपाई","तपाईं","तपाईको","तर","तल","तापनी","तिनिहरुलाई","तिनी","तिनीहरुको","तिनीहरू","तिनीहरूको","तिमि","तिमी","तिमीसँग","तिम्रो","तिर","ती","तीन","तुरुन्तै","तेस्कारण","तेस्रो","त्यसपछि","त्यहाँ","त्यो","त्सपछि","त्सैले","थप","थिए","थिएन","थिएनन्","थियो","दिए","दिनुभएको","दिनुहुन्छ","दुई","दुबै","देखि","देखिन्छ","देखियो","देखे","देखेको","देखेर","दोस्रो","द्वारा","धेरै","न","नगर्नुहोस्","नजिकै","नत्र","नयाँ","नि","निम्ति","निम्न","नै","नौ","पक्का","पक्कै","पछि","पछिल्लो","पटक","पनि","पर्छ","पर्थ्यो","पर्याप्त","पहिले","पहिलो","पहिल्यै","पाँच","पाँचौं","पूर्व","प्रति","प्रतेक","प्रत्येक","प्लस","फेरि","फेरी","बने","बन्द","बरु","बाट","बारे","बारेमा","बाहिर","बाहेक","बिरुद्ध","बिशेष","बीच","बीचमा","भए","भएको","भन","भने","भन्","भन्छन्","भन्छु","भन्दा","भन्नुभयो","भन्ने","भर","भित्र","भित्री","म","मँ","मलाई","मा","मात्र","माथि","मार्फत","मुख्य","मेरो","मैले","यति","यथोचित","यदि","यद्यपि","यस","यसको","यसपछि","यसबाहेक","यसरी","यसैले","यसो","यस्तो","यहाँ","यहाँसम्म","या","यी","यो","र","रही","रहेका","रहेको","राखे","राख्छ","राम्रो","रूप","लगभग","लाई","लागि","ले","वरीपरी","वा","वास्तवमा","विरुद्ध","शायद","सकदिन","सकिएन","सक्छ","सक्दैन","संग","संगै","सट्टा","सधै","सबै","सबैलाई","समय","समयमा","सम्भव","सम्म","सही","साँच्चै","सात","साथ","साथै","सायद","सारा","सो","सोही","स्पष्ट","हरे","हरेक","हामी","हामीसँग","हाम्रो","हुँ","हुँदैन","हुन","हुनु","हुनुहुन्छ","हुने","हुनेछ","हुनेछु","हुन्","हुन्छ","हुन्थे","हो","होइन","हौंअझै","अधिक","अन्य","अन्यत्र","अन्यथा","अब","अरु","अरुलाई","अर्को","अर्थात","अर्थात्","अलग","आए","आजको","आत्म","आदि","आफू","आफूलाई","आफै","आफैलाई","आफैले","आफ्नै","आफ्नो","आयो","उनको","उनले","उनि","उनी","उनीहरु","उप","उसलाई","उस्तै","उहाँ","उहालाई","ऊ","एउटै","एक","एकदम","ओठ","औं","कतै","कसरी","कसै","कसैले","कस्तो","कहाँ","कहाँबाट","कहिले","कहिलेकाहीं","का","कि","किन","किनभने","कुनै","कुरा","कृपया","के","केवल","केहि","केही","को","कोही","गए","गयौ","गर","गरि","गरी","गरे","गरेका","गरेको","गरेर","गरौं","गर्छ","गर्छु","गर्दछ","गर्दै","गर्न","गर्नु","गर्नुपर्छ","गर्ने","गर्नेछन्","गर्नेछौ","गैर","चार","चाले","चाहनुहुन्छ","चाहन्छु","चाहन्छौ","चाहन्छौं","चाहन्थे","चाहिए","छ","छन्","छु","छू","छैन","छौं","जब","जबकि","जसको","जसबाट","जसमा","जसलाई","जसले","जस्तै","जस्तो","जहाँ","जान","जाहिर","जुन","जे","जो","ठीक","त","तत्काल","तथा","तदनुसार","तपाई","तपाईं","तपाईको","तर","तल","तापनी","तिनिहरुलाई","तिनी","तिनीहरुको","तिनीहरू","तिनीहरूको","तिमि","तिमी","तिमीसँग","तिम्रो","तिर","ती","तीन","तुरुन्तै","तेस्कारण","तेस्रो","त्यसपछि","त्यहाँ","त्यो","त्सपछि","त्सैले","थप","थिए","थिएन","थिएनन्","थियो","दिए","दिनुभएको","दिनुहुन्छ","दुई","दुबै","देखि","देखिन्छ","देखियो","देखे","देखेको","देखेर","दोस्रो","द्वारा","धेरै","न","नगर्नुहोस्","नजिकै","नत्र","नयाँ","नि","निम्ति","निम्न","नै","नौ","पक्का","पक्कै","पछि","पछिल्लो","पटक","पनि","पर्छ","पर्थ्यो","पर्याप्त","पहिले","पहिलो","पहिल्यै","पाँच","पाँचौं","पूर्व","प्रति","प्रतेक","प्रत्येक","प्लस","फेरि","फेरी","बने","बन्द","बरु","बाट","बारे","बारेमा","बाहिर","बाहेक","बिरुद्ध","बिशेष","बीच","बीचमा","भए","भएको","भन","भने","भन्","भन्छन्","भन्छु","भन्दा","भन्नुभयो","भन्ने","भर","भित्र","भित्री","म","मँ","मलाई","मा","मात्र","माथि","मार्फत","मुख्य","मेरो","मैले","यति","यथोचित","यदि","यद्यपि","यस","यसको","यसपछि","यसबाहेक","यसरी","यसैले","यसो","यस्तो","यहाँ","यहाँसम्म","या","यी","यो","र","रही","रहेका","रहेको","राखे","राख्छ","राम्रो","रूप","लगभग","लाई","लागि","ले","वरीपरी","वा","वास्तवमा","विरुद्ध","शायद","सकदिन","सकिएन","सक्छ","सक्दैन","संग","संगै","सट्टा","सधै","सबै","सबैलाई","समय","समयमा","सम्भव","सम्म","सही","साँच्चै","सात","साथ","साथै","सायद","सारा","सो","सोही","स्पष्ट","हरे","हरेक","हामी","हामीसँग","हाम्रो","हुँ","हुँदैन","हुन","हुनु","हुनुहुन्छ","हुने","हुनेछ","हुनेछु","हुन्","हुन्छ","हुन्थे","हो","होइन","हौं"]
    punctuation = string.punctuation + "।" + "!" + "?"
    stopWordsEn = set(stopwords.words("english"))
    stopWordsNp = set(nepaliStopWords)
    stopWordsCombined = stopWordsEn.union(stopWordsNp)
    
    # Remove Punctuations
    text = text.translate(str.maketrans("", "", punctuation))
    # Tokenize Word     
    tokens = word_tokenize(text)
    # Remove Stop Words     
    tokens = [token for token in tokens if token.lower() not in stopWordsCombined]
    
    # Stemming     
    stemmerEn = SnowballStemmer("english")
    tokens = [stemmerEn.stem(token) for token in tokens]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    tokens_occ = []
    
    for token in set(tokens):
        occurances = [index for index, value in enumerate(tokens) if value == token]
        tokens_occ.append({"token": token, "occurrences": occurances})
    return tokens_occ
    

In [17]:
# In each row, from contentHashId, bring all text data from file and perform text processing in them, also update website collection to set total tokens in that website
allTokens = []
for idx, row in website_list_df.iterrows():
    row = row.copy();
    folderName = row["contentHashId"]
    with open(f'data/{folderName}/{folderName}.txt') as txtFile:
        text = txtFile.readlines()
        txtInSingleStr = " ".join("".join(text).split("\n"))
        tokens_occs = textPreProcessing(txtInSingleStr)
        # update website column to save total tokens in it
        website_collection.update_one({"_id":row["_id"]}, {"$set":{"total_tokens":[d['token'] for d in tokens_occs]}})
        for tokens_occ in tokens_occs:
            allTokens.append({
                "token":tokens_occ["token"],
                "website":{
                    "_id" : row["_id"],
                    "occurrences":tokens_occ["occurrences"],
                }
            })
print("Done")

Done


# Create Dataframe from All Tokens

In [18]:
tokens_df = pd.DataFrame(allTokens)

In [19]:
print(len(tokens_df))

856899


# New Dataframe by combining websites

In tokens_df, there are multiple rows with same token but with different websites. We can decrease the size of dataframe by grouping them together and appending websites in a list.

In [20]:
tokens_ws_comb = pd.merge(tokens_df, tokens_df.groupby('token').website.apply(list).reset_index(), on='token', how='left')
# Remove Duplicate Columns after merging
tokens_ws_comb = tokens_ws_comb.drop_duplicates(subset="token", keep="first")

In [21]:
print(len(tokens_ws_comb))
tokens_ws_comb.head()

62427


Unnamed: 0,token,website_x,website_y
0,नागरिक,"{'_id': 640230722fde703077dc0b0e, 'occurrences...","[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
1,©,"{'_id': 640230722fde703077dc0b0e, 'occurrences...","[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
2,nepalgovnp,"{'_id': 640230722fde703077dc0b0e, 'occurrences...","[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
3,offici,"{'_id': 640230722fde703077dc0b0e, 'occurrences...","[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
4,portal,"{'_id': 640230722fde703077dc0b0e, 'occurrences...","[{'_id': 640230722fde703077dc0b0e, 'occurrence..."


# Clean new dataframe data
Remove Website_x and rename website_y with websites.

In [22]:
tokens_ws_comb = tokens_ws_comb.drop('website_x', axis=1)

In [23]:
tokens_ws_comb.rename(columns = {'website_y':'websites'}, inplace = True)
tokens_ws_comb.head()

Unnamed: 0,token,websites
0,नागरिक,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
1,©,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
2,nepalgovnp,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
3,offici,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
4,portal,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."


# Save tokens to json

In [24]:
token_list = tokens_ws_comb.to_dict("records")

In [25]:
to_json(token_list, "tokens_websites")
print("Done")

Done


# Load Tokens from Json

In [26]:
load_tokens = pd.read_json("data/generated/tokens_websites.json")

In [27]:
load_tokens.head()

Unnamed: 0,token,websites
0,नागरिक,"[{'_id': {'$oid': '640230722fde703077dc0b0e'},..."
1,©,"[{'_id': {'$oid': '640230722fde703077dc0b0e'},..."
2,nepalgovnp,"[{'_id': {'$oid': '640230722fde703077dc0b0e'},..."
3,offici,"[{'_id': {'$oid': '640230722fde703077dc0b0e'},..."
4,portal,"[{'_id': {'$oid': '640230722fde703077dc0b0e'},..."


# Convert \_id key for each dict for each row of each website to bson.ObjectId 

In [28]:
def convert_to_bson_object_id(row):
    for website in row['websites']:
        website['_id'] = bson.ObjectId(website['_id']['$oid'])
    return row
load_tokens = load_tokens.apply(convert_to_bson_object_id, axis=1)
load_tokens.head()

Unnamed: 0,token,websites
0,नागरिक,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
1,©,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
2,nepalgovnp,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
3,offici,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."
4,portal,"[{'_id': 640230722fde703077dc0b0e, 'occurrence..."


In [29]:
# Confirming the conversion
temp = load_tokens[load_tokens["token"]=="nepal"]
temp_ws = list(temp["websites"])[0]
print(temp_ws[0:3])

[{'_id': ObjectId('640230722fde703077dc0b0e'), 'occurrences': [7, 20]}, {'_id': ObjectId('640230722fde703077dc0b1b'), 'occurrences': [1626]}, {'_id': ObjectId('640230722fde703077dc0b32'), 'occurrences': [54]}]


# Save tokens to database

In [30]:
tokens_collection = database['tokens']

In [31]:
# drop collection for now. later, this will be removed and instead of insert_many, upsert will be used
tokens_collection.drop()
# convert loaded tokens to list of tokens
tokens_to_save = load_tokens.to_dict("records")
tokens_collection.insert_many(tokens_to_save)
# create index on websites array for tokens
tokens_collection.create_index([("websites", pm.ASCENDING)])

print("Done")

Done


# Get All Tokens

In [32]:
all_tokens_in_db = tokens_collection.find()
all_tokens_in_db = list(all_tokens_in_db)

In [33]:
len(all_tokens_in_db)

62427

# Simulate a user query

In [275]:
query = "Driving License" 

# For each word, perform data preprocessing

In [276]:
tokenized_query = textPreProcessing(query)
# extract token strings from list
extracted_tokens = [item['token'] for item in tokenized_query] 
extracted_tokens

['drive', 'licens']

# From database, get all tokens that match these tokens in query

# Use Pandas to create a dataframe for Ranking Documents based on their Standard Deviation from user query

1. Calculate the mean of the occurances of each token in the document.
2. Calculate the variance of the occurances of each token in the document.
3. Take the square root of the variance to get the standard deviation.

Eg for following data: 
```json
{
  "Nepal": [
    { "website": "a", "occurances": [0, 1, 2] },
    { "website": "b", "occurances": [1, 2, 4] },
    { "website": "c", "occurances": [2, 9, 10] }
  ],
  "Drive": [
    { "website": "b", "occurances": [6, 10, 91] },
    { "website": "c", "occurances": [9, 10, 42] }
  ],
  "License": [
    { "website": "a", "occurances": [11, 10, 93] },
    { "website": "c", "occurances": [91, 48, 12] }
  ]
}
```
1. Find Standard Deviation for each token in each website
```yaml
Nepal:
    standard deviation of "Nepal" in website "a" = 0.8164
    standard deviation of "Nepal" in website "b" = 1.2472
    standard deviation of "Nepal" in website "c" = 3.5590
Drive:
    standard deviation of "Drive" in website "b" = 39.1606
    standard deviation of "Drive" in website "c" = 15.3260
License:
  standard deviation of "License" in website "a" = 38.8930
  standard deviation of "License" in website "c" = 32.2937
 ```
 
2. For each website w and token t find weight of each token in each website
    
    Weight is a value that represents the importance of a token in relation to the other tokens in a document. In information retrieval, weighting is often used to prioritize or rank the relevance of documents based on the presence of specific keywords. In the example I provided earlier, the weight of a token is calculated as the inverse of its standard deviation, which is a measure of the spread or dispersion of its occurrences within a set of documents. By assigning higher weights to tokens with low standard deviation, we can prioritize the tokens that are more consistently distributed across the relevant documents, and thus, are more likely to be relevant to a user's query.
```yaml
    Formula:
        weight[w][t] = 1 / SD[t][w]
    Nepal:
        weight of "Nepal" in website "a" = 1.2248
        weight of "Nepal" in website "b" = 0.8017
        weight of "Nepal" in website "c" = 0.2809
    Drive:
        weight of "Drive" in website "b" = 0.0255
        weight of "Drive" in website "c" = 0.0652
    License:
        weight of "License" in website "a" = 0.02571
        weight of "License" in website "c" = 0.0309
```
3. Find final score for each token in each website
```yaml
    Formula:
        score[w][t] = wt[t][w] * SD[t][w] 
    Nepal:
        score of "Nepal" in website "a" = 0.999
        score of "Nepal" in website "b" = 0.999
        score of "Nepal" in website "c" = 0.999
    Drive:
        score of "Drive" in website "b" = 0.9985
        score of "Drive" in website "c" = 0.999
    License:
        score of "License" in website "a" = 0.999
        score of "License" in website "c" = 0.9978
```
4. Sum all scores for website
```yaml
    a: 
        0.999 + 0.999 = 1.998
    b:
        0.999 + 0.9985 = 1.975
    c:
        0.999 + 0.999 + 0.9978 = 2.9958
```
5. Normalize data
```yaml
    Formula:
        x_norm = (x - x_min) / (x_max - x_min)
    a: 1
    b: 0.02
    c: 0
```
Hence, based on distribution of tokens, rank of websites is a > b > c

In [43]:
def calc_std_rank(websites, uniq):
    print(uniq)
    website_weights = []
    for sublist in websites:
        sub_weights = []
        for website in sublist:
            id = website['_id']
            occurances = website['occurrences']
            sd = np.std(occurances)
            weight = 1/(sd) if sd != 0 else 1
            sub_weights.append({'_id': id, 'weight': weight})
        website_weights.append(sub_weights)

    final_weights = {}
    for sub_weights in website_weights:
        for website in sub_weights:
            if website['_id'] in final_weights:
                final_weights[website['_id']] += website['weight']
            else:
                final_weights[website['_id']] = website['weight']

    final_weights = {k: v for k, v in sorted(final_weights.items(), key=lambda item: item[1], reverse=True)}

    return [{'_id': k, 'rank': i + 1} for i, (k, v) in enumerate(final_weights.items())]




In [44]:
def new_calc_std_rank(websites, uniq):
    print(uniq)
    website_weights = []
    for sublist in websites:
        sub_weights = []
        for website in sublist:
            id = website['_id']
            occurances = website['occurrences']
            sd = np.std(occurances)
            oc_q1 = np.percentile(occurances, 25)
            oc_q2 = np.percentile(occurances, 75)
            weight = oc_q2 - oc_q1
#             weight = np.median(occurances) if sd == 0 else 1/(sd + 1)
            sub_weights.append({'_id': id, 'weight': weight})
        website_weights.append(sub_weights)

    final_weights = {}
    for sub_weights in website_weights:
        for website in sub_weights:
            if website['_id'] in final_weights:
                final_weights[website['_id']] += website['weight']
            else:
                final_weights[website['_id']] = website['weight']

    final_weights = {k: v for k, v in sorted(final_weights.items(), key=lambda item: item[1], reverse=True)}

    return [{'_id': k, 'rank': i + 1} for i, (k, v) in enumerate(final_weights.items())]

In [277]:
ws_tokens_in_db = list(tokens_collection.find({
    "token": {"$in": extracted_tokens}
}, {"websites": 1, "_id":0}))
# convert list of dicts to list of 2d arrays
ws_tokens_2d_array = [token["websites"] for token in ws_tokens_in_db]

In [49]:
std_ranks = new_calc_std_rank(ws_tokens_2d_array, "1IQR")
# get first website

# dotm:
# 75e6a88c-3827-46d4-ad85-efe33e5d7099
# 63da9e91df832131f59f9b93

first_rank = std_ranks[0]
std_ranks
ws_db_rnk = website_collection.find({"_id":first_rank["_id"]})
for item in ws_db_rnk:
    print(item)

1IQR
{'_id': ObjectId('63da9e91df832131f59f8cfb'), 'url': 'https://www.apf.gov.np/Pages/AllNews', 'outgoingLinks': ['https://www.apf.gov.np/ReadNews/98', 'https://www.apf.gov.np/ReadNews/553', 'https://www.apf.gov.np/Pages/Development', 'https://www.apf.gov.np/ReadNews/1073', 'https://www.apf.gov.np/ReadNews/825', 'https://www.apf.gov.np/ReadNews/38', 'https://www.apf.gov.np/ReadNews/740', 'https://www.apf.gov.np/ReadNews/256', 'https://www.apf.gov.np/ReadNews/815', 'https://www.apf.gov.np/ReadNews/201', 'https://www.apf.gov.np/ReadNews/824', 'https://www.apf.gov.np/ReadNews/741', 'https://www.apf.gov.np/ReadNews/1255', 'https://www.apf.gov.np/ReadNews/1214', 'https://www.apf.gov.np/ReadNews/668', 'https://www.apf.gov.np/ReadNews/900', 'https://www.apf.gov.np/ReadNews/219', 'https://www.apf.gov.np/ReadNews/963', 'https://www.apf.gov.np/ReadNews/540', 'https://www.apf.gov.np/ReadNews/942', 'https://www.apf.gov.np/ReadNews/748', 'https://www.apf.gov.np/ReadNews/648', 'https://www.apf.gov

# Rank based on TF-IDF Score and Cosine Similarity
The standard deviation approach only looks at the frequency of terms in a document and doesn't consider their importance or relevance to the query. This can result in documents with a large number of terms but low relevance being ranked higher than more relevant documents. By incorporating inverse term frequency, TF-IDF provides a more robust measure of document relevance.

TF-IDF is a widely used information retrieval technique for ranking documents. It takes into account both the frequency of terms in a document and the inverse frequency of those terms across all the documents in a collection. This helps to rank documents based on how relevant they are to a specific query, rather than simply how frequently terms appear in a document.

TF is the number of times a term appears in a document divided by the total number of terms in that document. This reflects the importance of a term in a particular document.

IDF is the logarithm of the total number of documents in the collection divided by the number of documents containing a particular term. This reflects the rarity of a term across the entire collection.

Example, for query "Nepal Driving License", we can calculate the TF-IDF scores for the terms "Nepal", "Driving", and "License" in each document in the collection. We can then sum the TF-IDF scores for each term in the query to get a total score for each document, and sort the documents based on their total scores to get the final ranking.

The formula for TF can be written as:
```
TF(t, d) = (Number of times term t appears in document d) / (Total number of terms in document d)

The formula for IDF can be written as:

IDF(t) = log(N/n)

Where N is the total number of documents in the corpus and n is the number of documents containing the term t.

The formula for TF-IDF can be written as:

TF-IDF(t, d) = TF(t, d) * IDF(t)
```

What is Cosine Similarity?

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. In information retrieval and text mining, it is often used to compare the similarity of documents or text to a given query. It is defined as the cosine of the angle between the two vectors in a multi-dimensional space.

The cosine similarity value lies between -1 and 1. A value of 1 indicates that the vectors are identical, a value of 0 indicates that the vectors are orthogonal (not similar), and a value of -1 indicates that the vectors are diametrically opposite. By taking the cosine of the angle between two vectors, we can determine the similarity between them, regardless of their magnitude.

In the context of text data, cosine similarity can be used to find the similarity between a query document and each document in a document set. The resulting similarity scores can then be used to rank the documents according to their similarity to the query, providing a method of information retrieval that is both fast and effective.

Formula: dot_product(document_vector, query_vector)

In [278]:
# For each token from user query, find its TF and IDF and TF-IDF
# ws_tokens_2d_array
# Find total documents that contain tokens in an extracted tokens
ws_count = []
for token in extracted_tokens:
    ws_w_token = website_collection.count_documents({"total_tokens":{
        "$in":[token]
    }})
    ws_count.append(ws_w_token)


In [279]:
# calculate TF, IDF and TF-IDF
for idx, array in enumerate(ws_tokens_2d_array):
    for ws in array:
        TF = len(ws["occurrences"])/len(websites_in_db)
        IDF = math.log(len(websites_in_db)/ws_count[idx])
        TF_IDF = TF * IDF
        ws["TF"]  = TF
        ws["IDF"] = IDF
        ws["TF_IDF"] = TF_IDF


# Creating Document Vector

In [46]:
# example
e_ws_list = ["a", "b", "c", "d", "e", "f"]
e_two_d_arr = [
    ["a", "b", "c"], #np
    ["c", "b"], #dr
    ["d", "e"] #lc
]
# for each element in ws_list, find indexes where those elements are in 2d array
e_all_idx = [];
for e_ws in e_ws_list:
    e_idx_list = []
    for e_arr_idx, e_arr in enumerate(e_two_d_arr):
        if e_ws in e_arr:
            e_idx_list.append(e_arr_idx)
    e_all_idx.append(e_idx_list)
            
print(e_all_idx)
print(len(e_all_idx))

[[0], [0, 1], [0, 1], [2], [2], []]
6


In [296]:
flat_ws_list = list(chain.from_iterable(ws_tokens_2d_array))
website_ids = list(set([d["_id"] for d in flat_ws_list]))

print(website_ids)


flat_ws_df = pd.DataFrame(flat_ws_list)
flat_ws_df = flat_ws_df.drop_duplicates(subset="_id")

unique_websites = list(flat_ws_df)

document_vector = {}
document_token_idx = {} #used for padding. Explained below
for lst_idx, inner_list in enumerate(ws_tokens_2d_array):
    for dct_idx, dct in enumerate(inner_list):
        # print(extracted_tokens[lst_idx] + " " + str(dct_idx))
        if dct["_id"] in document_vector and dct["_id"] in document_token_idx:
            document_vector[dct["_id"]].append(dct["TF_IDF"])
            document_token_idx[dct["_id"]].append(lst_idx)
        else:
            document_vector[dct["_id"]] = [dct["TF_IDF"]]
            document_token_idx[dct["_id"]] = [lst_idx]

[ObjectId('63da9e91df832131f59f876c'), ObjectId('63da9e91df832131f59f9b3c'), ObjectId('63da9e91df832131f59f90f5'), ObjectId('63da9e91df832131f59f889e'), ObjectId('63da9e91df832131f59f9b41'), ObjectId('63da9e91df832131f59f876e'), ObjectId('63da9e91df832131f59f9ae6'), ObjectId('63da9e91df832131f59f97cf'), ObjectId('63da9e91df832131f59f8e2e'), ObjectId('63da9e91df832131f59f90f7'), ObjectId('63da9e91df832131f59f876f'), ObjectId('63da9e91df832131f59f8e2d'), ObjectId('63da9e91df832131f59f876d'), ObjectId('63da9e91df832131f59f8e83'), ObjectId('63da9e91df832131f59f8e97'), ObjectId('63da9e91df832131f59f8e47'), ObjectId('63da9e91df832131f59f9bb8'), ObjectId('63da9e91df832131f59f8ac8'), ObjectId('63da9e91df832131f59f893b'), ObjectId('63da9e91df832131f59f8ac9'), ObjectId('63da9e91df832131f59f8871'), ObjectId('63da9e91df832131f59f9805'), ObjectId('63da9e91df832131f59f9af7'), ObjectId('63da9e91df832131f59f931f'), ObjectId('63da9e91df832131f59f9b9b'), ObjectId('63da9e91df832131f59f9b93'), ObjectId('6

# Normalization of TF_IDF Scores
Why Normalization is necessary?

In the cosine similarity calculation, the document matrix (which contains the TF-IDF values) is usually normalized to ensure that the magnitude of the vectors representing the documents does not affect the similarity scores. This is because cosine similarity is a measure of the cosine of the angle between two vectors, and the magnitude of the vectors does not play a role in the angle calculation.

If the document matrix is not normalized, then the cosine similarity scores could be skewed towards longer documents, as the vectors representing the longer documents would have a higher magnitude. Normalizing the document matrix eliminates this bias and ensures that the cosine similarity scores are based only on the orientation of the vectors, and not on their magnitude.






# Pad the normalized_doc_vec list with 0 where token-document is not present
document_token_idx hash table represents all those places where token exists. In all those places where token doesnt exist, in normalized vector list, insert 0. For example
if in document a, the value is ```[0, 2]```, it means ```token[1]``` is in document a. Thus, in
``` normalized_doc_vec[a][1]``` we need to insert 1


In [281]:
# normalize each vector and pad them
normalized_document_vector = {}
for key in document_vector:
    values = document_vector[key] 
    total = sum(values)
    norm_val = [value/total for value in values]
    normalized_document_vector[key] = norm_val
#  padding   
max_tokens = len(extracted_tokens)
for key in normalized_document_vector.keys():
    # get the document from document_token_idx using key
    existing_token_idx = document_token_idx[key]
    # find all values that are less than max_tokens that are not in existing tokens
    missing_idxes = [i for i in range(max_tokens) if i not in existing_token_idx and i < max_tokens]
    for idx in missing_idxes:
        normalized_document_vector[key].insert(idx, 0)

In [282]:
normalized_document_vector_list = [{'_id': key, 'dv': value} for key, value in normalized_document_vector.items()]
normalized_document_vector_list[0:3]

[{'_id': ObjectId('63da9e91df832131f59f876c'), 'dv': [1.0, 0]},
 {'_id': ObjectId('63da9e91df832131f59f876d'), 'dv': [1.0, 0]},
 {'_id': ObjectId('63da9e91df832131f59f876e'), 'dv': [1.0, 0]}]

# Calculate Query Vector

In [283]:
query_dict = tokenized_query
for idx, item in enumerate(query_dict):
    # we calculate frequency of the token occurance in the query and normalize it
    TF = len(item)/len(query_dict)
    # ws_tokens_2d_array is a 2d array with each inner array corresponding to the websites that have the respectove token in it
    num_ws_w_token = len(ws_tokens_2d_array[idx])
    IDF = len(websites_in_db)/num_ws_w_token
    TF_IDF = TF*IDF
    
    item["TF"] = TF
    item["IDF"] = IDF
    item["TF_IDF"] = TF_IDF
query_vector = [d["TF_IDF"] for d in query_dict]


# Calculate Cosine Similarity for each document

In [284]:
cosine_similarity = []
np_qv = np.array(query_vector)
for item in normalized_document_vector_list:
    np_dv = np.array(item["dv"])
    cosine_sim = np.dot(np_dv, np_qv)
    cosine_similarity.append({
        "_id": item["_id"],
        "cosine_sim": cosine_sim
    })
cosine_similarity[0:3]


[{'_id': ObjectId('63da9e91df832131f59f876c'),
  'cosine_sim': 135.30769230769232},
 {'_id': ObjectId('63da9e91df832131f59f876d'),
  'cosine_sim': 135.30769230769232},
 {'_id': ObjectId('63da9e91df832131f59f876e'),
  'cosine_sim': 135.30769230769232}]

# Cosine SImilarity Value is the rank value. Sort based on it

In [285]:
ranked_ws = sorted(cosine_similarity, key=lambda x: x['cosine_sim'], reverse=True)

In [303]:
# dotm = website_collection.find({"url":"https://www.dotm.gov.np/"})

# for item in dotm:
#     print(item["_id"])
#     print(item["contentHashId"])
    
#     for tkn in extracted_tokens:
#         if tkn in item["total_tokens"]:
#             print(tkn)
# print("-----")

# find rank of this website

# for item in ranked_ws:
#     print(item)

highest_rank_ws = ranked_ws[:5]
highest_rank_ws_ids = [d["_id"] for d in highest_rank_ws]

highest_rank_ws_db = website_collection.find({"_id":{
    "$in": highest_rank_ws_ids
}})

print("-----")

for item in highest_rank_ws_db:
    print(item["url"])
    print(item["contentHashId"])
    print("-----")

-----
http://daosarlahi.moha.gov.np/
35af855d-f729-4feb-a835-1d8882cef1e7
-----
https://daosarlahi.moha.gov.np/
42d377a5-0d14-45b3-af91-8eab3bdddca1
-----
http://mosd.p1.gov.np/index.php/node/585
ca8d7c71-5d4f-4094-8c49-87e6d41dedb4
-----
http://www.ntc.net.np/
72bda786-1ef4-462f-8bbb-8022793a2140
-----
http://www.nathm.edu.np/
d475a939-f472-4891-a58b-b49ba6137cc4
-----
