# Indexer for Search Engine with Ranking

In [1]:
import pandas as pd 

## Import Websites from json file

In [7]:
websites = pd.read_json("data/table.json")
df = pd.DataFrame(websites)
df = df.reset_index()
df.head(3)

Unnamed: 0,index,id,url,status,dispatchToken,contentHashId,createdAt,updatedAt,dispatchAgent,clientEnd,clientStart,serverEnd,serverStart,metadata
0,0,1,https://nepal.gov.np,done,,ba422f11-88b3-43a2-bfe3-147cff6b6f24,2022-12-16T07:52:43.982Z,2022-12-16T07:56:11.957Z,kG_amT2-70kWc2XYAAAB,2022-12-16T07:56:11.535Z,2022-12-16T07:55:55.985Z,2022-12-16T07:56:11.896Z,2022-12-16T07:55:55.982Z,"{'url': 'https://nepal.gov.np', 'title': 'Nepa..."
1,1,2,https://moha.gov.np,done,,6e36b5ec-9419-46de-a4e6-b558beb14eb2,2022-12-16T07:52:44.042Z,2022-12-16T07:58:43.935Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:58:37.772Z,2022-12-16T07:58:12.905Z,2022-12-16T07:58:43.883Z,2022-12-16T07:58:12.903Z,"{'url': 'https://moha.gov.np', 'title': 'गृह म..."
2,2,3,https://p1.gov.np,done,,d15807d0-5043-40b4-9465-18cd0517267f,2022-12-16T07:52:44.132Z,2022-12-16T07:59:18.859Z,S_eXYDY_dk3-8euAAAAD,2022-12-16T07:59:13.435Z,2022-12-16T07:58:44.037Z,2022-12-16T07:59:18.806Z,2022-12-16T07:58:44.036Z,"{'url': 'https://p1.gov.np', 'title': 'प्रदेश ..."


## Create new Dictionary to store website on database with following schema

```
string: Website 
Website: {
  url: string,
  outgoingLinks: string[],
  incomingLinks: string[]
}
```

In [8]:
websites_dict = {}

for index, row in df.iterrows():
    url = row['url']
    if websites_dict.get(url) is None:
        websites_dict[url] = {
            "url": url,
            "outgoingLinks": list(set(row['metadata']['links'])),
            "outgoingLinksLen":len(row['metadata']['links']),
            "incomingLinks": [],
            "incomingLinksLen": 0,
            "contentHashId":row["contentHashId"]
        }

## Assign Incoming Links to the created dictionary

In [9]:
for website in websites_dict:
    links = websites_dict[website]["outgoingLinks"]
    for link in links:
        linkInDict = websites_dict.get(link)
        if linkInDict is not None:
            if not website in websites_dict[link]["incomingLinks"]: 
                websites_dict[link]["incomingLinks"].append(website)
                websites_dict[link]["incomingLinksLen"] = websites_dict[link]["incomingLinksLen"] + 1
            
            

website_pd = pd.DataFrame.from_dict(websites_dict)
website_pd.head(30).T

Unnamed: 0,url,outgoingLinks,outgoingLinksLen,incomingLinks,incomingLinksLen,contentHashId
https://nepal.gov.np,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...,3,[],0,ba422f11-88b3-43a2-bfe3-147cff6b6f24
https://moha.gov.np,https://moha.gov.np,"[https://moha.gov.np/, https://moha.gov.np/gal...",114,[],0,6e36b5ec-9419-46de-a4e6-b558beb14eb2
https://p1.gov.np,https://p1.gov.np,"[https://p1.gov.np/detail/sewa-prava, https://...",100,[],0,d15807d0-5043-40b4-9465-18cd0517267f
http://p2.gov.np,http://p2.gov.np,"[https://madhesh.gov.np/node/155, https://madh...",26,[],0,897de6d0-342a-4ed1-9b36-3265340e1705
http://p3.gov.np,http://p3.gov.np,[],0,[],0,2015b910-06c8-428d-a35f-c6f3427401b3
...,...,...,...,...,...,...
http://www.ugcnepal.edu.np/,http://www.ugcnepal.edu.np/,"[http://www.ugcnepal.edu.np/downloads/1/5, htt...",118,[https://nepal.gov.np:8443/NationalPortal/view...,3,27100abd-fabc-438a-9c84-615d232cfec8
http://www.wecs.gov.np/,http://www.wecs.gov.np/,"[http://www.wecs.gov.np/pages/documents, https...",56,[https://nepal.gov.np:8443/NationalPortal/view...,5,0f174b39-ac03-4bcc-a1f3-2207dd2648d7
http://www.nmc.org.np/,http://www.nmc.org.np/,"[https://nmc.org.np/cpd-modules, https://nmc.o...",33,[https://nepal.gov.np:8443/NationalPortal/view...,2,f3f47701-3233-4bc5-9ed1-9b4efb92f081
http://nhrc.gov.np/,http://nhrc.gov.np/,[https://nhrc.gov.np/trainings/training-worksh...,79,[https://nepal.gov.np:8443/NationalPortal/view...,6,1e182a96-a375-4c1e-90b6-0fc2f053d256


## Save Dictionary Data to JSON file

In [8]:
    website_pd.to_json("websites.json")

# Create List of Websites from dictionary

In [11]:
websites_list = list(websites_dict.values())
websites_list[2]

{'url': 'https://p1.gov.np',
 'outgoingLinks': ['https://p1.gov.np/detail/sewa-prava',
  'https://mowsie.p1.gov.np/notice/procurement-notice/detail/2079-01-5-supply-delivery-of-tubular-steel-electric-pole-copper-wound-distribution-of-outdoor-transformer-and-acsr-conductor-2079-01-05',
  'http://motmc.p1.gov.np/',
  'https://p1.gov.np/web/sidelink/mantralaya',
  'https://p1.gov.np/report/yearly-report',
  'https://p1.gov.np/detail/suchana',
  'https://p1.gov.np/mantralaya/15',
  'https://p1.gov.np/language/ne',
  'https://motc.p1.gov.np/document/act_rule/detail/2079-08-28-95737',
  'https://p1.gov.np/facebook.com',
  'http://mowsie.p1.gov.np/notice/general-notice/detail/2079-08-21-69475',
  'https://p1.gov.np/mantralaya/5',
  'https://moh.p1.gov.np/report/yearly-report/detail/2079-07-29-46829',
  'https://p1.gov.np/mantralaya/4',
  'https://p1.gov.np/niti/directory',
  'https://p1.gov.np/web/sidelink/e-services',
  'https://p1.gov.np/mantralaya/11',
  'https://moa.p1.gov.np/document/act

## Save Dictionary Data to MongoDB

### Initialize pymongo

In [12]:
import pymongo as pm
mongouri = "mongodb://root:prisma@localhost:27017/db_seven_sem_prj?authSource=admin"
client = pm.MongoClient(mongouri)
database = client.get_database()
print(database)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'db_seven_sem_prj')


In [13]:
website_collection = database['websites']

In [16]:
website_collection.insert_many(websites_list)

<pymongo.results.InsertManyResult at 0x7f3788a7de80>

# Verify Data Inserted by showing the total rows in database

In [17]:
websites_in_db = database["websites"].find()
print(list(websites_in_db)[0])

{'_id': ObjectId('63da9e91df832131f59f8727'), 'url': 'https://nepal.gov.np', 'outgoingLinks': ['https://nepal.gov.np:8443/NationalPortal/NP?splashAction=home', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=business', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=citizen'], 'outgoingLinksLen': 3, 'incomingLinks': [], 'incomingLinksLen': 0, 'contentHashId': 'ba422f11-88b3-43a2-bfe3-147cff6b6f24'}


# For each website, get text data from folder whose name is in contentHashId

## Testing for only one website