In [1]:
import numpy as np
import os

## Page Rank list

In [2]:
def getdata(target_directory="./web-search-files/", n_pages=500):
    '''
    Build the page rank matrix from the page files.

    Every file in ``target_directory`` is expected to be named with a
    four-character prefix followed by its number (e.g. ``page100``) and to
    list its outgoing links, one ``page<N>`` per line, above the dashed
    separator line.

    Parameters
    ----------
    target_directory : str
        Folder that holds the page files.
    n_pages : int
        Length of each link vector. Links to page numbers outside
        ``range(n_pages)`` are ignored in the matrix.

    Returns
    -------
    tuple
        ``(page_mat, connect)``: the transposed numpy.ndarray (one column per
        file, in numeric file order; (500,500) for the default data set) and a
        dictionary mapping each page number to its # of connections.
    '''
    separator = "---------------------"
    # Sort numerically so that page10 does not come before page2.
    file_names = sorted(os.listdir(target_directory), key=lambda s: int(s[4:]))
    page_mat = []
    connect = dict()
    for file_name in file_names:
        with open(os.path.join(target_directory, file_name)) as f:
            data_list = f.read().split("\n")
        # Links are listed before the separator line; blank lines are skipped
        # so they cannot break the int() conversion.
        link_lines = data_list[:data_list.index(separator)]
        pages = [int(line[4:]) for line in link_lines if line.strip()]  # page100 -> 100
        linked = set(pages)  # constant-time membership while filling the vector
        weight = 1 / len(pages) if pages else 0
        page_mat.append([weight if i in linked else 0 for i in range(n_pages)])
        connect[int(file_name[4:])] = len(pages)  # add # of page connection to dict
    # Rows were built per source page, so the matrix has to be transposed.
    return np.array(page_mat).T, connect

In [3]:
def page_rank(d,DIFF,matrix=None):
    """
    Iterate the page rank update until two successive vectors are close.

    Parameters
    ----------
    d : float
        Damping factor applied to the link matrix; ``1 - d`` weights the
        uniform starting vector that is added back on every step.
    DIFF : float
        Stop once the summed absolute change between two successive
        vectors is below this value.
    matrix : numpy.ndarray, optional
        Square link matrix. When omitted, the notebook-level ``page_mat``
        is used.

    Returns
    -------
    tuple
        ``(update, rank)``: the latest score vector and the page indices
        sorted from the highest score to the lowest.
    """
    if matrix is None:
        matrix = page_mat  # fall back to the matrix loaded by the notebook
    n = matrix.shape[0]  # size follows the matrix instead of a fixed 500
    init = np.full(n, 1 / n)
    damped = matrix * d
    teleport = init * (1 - d)
    update = init.copy() # avoid changing init
    while True:
        previous = update
        update = np.dot(damped, previous) + teleport # calculate update
        if np.abs(previous - update).sum() < DIFF:
            break
    return update, update.argsort()[::-1]

In [6]:
def write_Q1_ans(rank,update,d,DIFF,connections=None):
    """
    Write the ranking into the corresponding result file.

    One line is written per page, in the order given by ``rank``:
    ``page<N>``, its # of connections and its score rounded to 8 decimals,
    separated by tabs.

    Parameters
    ----------
    rank : sequence of int
        Page numbers in output order.
    update : sequence of float
        Score of every page, indexed by page number.
    d : float
        Damping factor; appears in the file name as a rounded percentage.
    DIFF : float
        Threshold; appears in the file name as rounded thousandths padded
        to three digits (0.1 -> "100", 0.01 -> "010", 0.001 -> "001").
    connections : dict, optional
        Maps page number to its # of connections. When omitted, the
        notebook-level ``connect`` dictionary is used.
    """
    if connections is None:
        connections = connect  # fall back to the dictionary loaded by the notebook
    ans = [
        f"page{page}\t{connections[page]}\t{round(update[page],8)}"
        for page in rank
    ]
    # Round instead of truncating: 0.29*100 is 28.999999999999996 in floating
    # point, which int() alone would turn into 28.
    d_tag = int(round(float(d) * 100))
    diff_tag = int(round(float(DIFF) * 1000))
    file = f"pr_{d_tag}_{diff_tag:03d}.txt"
    with open(file,"w") as f:
        f.write('\n'.join(ans))

In [7]:
def Q1(d,DIFF):
    """
    Run page rank for one (d, DIFF) setting and save the result to its file.
    """
    scores, order = page_rank(d,DIFF)
    write_Q1_ans(order,scores,d,DIFF)

In [8]:
# Load the link matrix once, then write one result file for every
# combination of damping factor and convergence threshold.
page_mat, connect = getdata()
d_list = [0.25, 0.45, 0.65, 0.85]
DIFF_list = [0.100, 0.010, 0.001]
for damping in d_list:
    for threshold in DIFF_list:
        Q1(damping, threshold)

## Reverse index

In [9]:
def getword(target_directory="./web-search-files/"):
    """
    Collect the words of every page file.

    The words of a page are read from the last non-blank line after the
    dashed separator line and are split on whitespace.

    Parameters
    ----------
    target_directory : str
        Folder that holds the page files, named with a four-character
        prefix followed by a number (e.g. ``page100``).

    Returns
    -------
    tuple
        ``(new_word_list, word_dict)``: the sorted list of distinct words
        over all pages, and a dictionary mapping the position of each file
        (in numeric file-name order, starting at 0) to its list of words.
    """
    separator = "---------------------"
    # Sort numerically so that page10 does not come before page2.
    file_names = sorted(os.listdir(target_directory), key=lambda s: int(s[4:]))
    word_dict = dict()
    for index, file_name in enumerate(file_names):
        with open(os.path.join(target_directory, file_name)) as f:
            data_list = f.read().split("\n")
        after_separator = data_list[data_list.index(separator) + 1:]
        # Taking the last non-blank line works whether or not the file ends
        # with a newline; split() keeps the final word even when the line
        # has no trailing space.
        content_lines = [line for line in after_separator if line.strip()]
        word_line = content_lines[-1] if content_lines else ""
        word_dict[index] = word_line.split()
    vocabulary = set()
    for words in word_dict.values():
        vocabulary.update(words)
    return sorted(vocabulary), word_dict

In [19]:
def find_page(word_list, word_dict, output_path="reverseindex.txt"):
    """
    Write the reverse index: for every word, the pages that contain it.

    Each output line is the word, a tab, and then `` page<j>`` for every
    page index ``j`` (ascending) whose word list contains the word.

    Parameters
    ----------
    word_list : list of str
        Words to list, in output order.
    word_dict : dict
        Maps page index (0 .. len(word_dict) - 1) to that page's words.
    output_path : str
        File the index is written to.
    """
    # Build the word -> pages mapping in one pass over the pages instead of
    # scanning every page again for every word.
    postings = {}
    for j in range(len(word_dict)):
        for word in set(word_dict[j]):  # set(): a page is listed once per word
            postings.setdefault(word, []).append(j)
    ans = [
        f"{word}\t" + "".join(f" page{j}" for j in postings.get(word, []))
        for word in word_list
    ]
    with open(output_path,"w") as f:
        f.write('\n'.join(ans))

In [20]:
def Q2():
    """
    Collect the words of all pages and write the reverse index file.
    """
    find_page(*getword())
Q2()

## Search engine

In [25]:
# Read the queries (one per line) and show them before and after
# multi-word queries are broken into lists of words.
word_list, word_dict = getword()
with open("list.txt") as query_file:
    test = query_file.read().split("\n")
print(test)
test = [query.split(" ") if " " in query else query for query in test]
print()
print(test)

['Baker', 'He', 'It', 'They', 'France', 'abhorrent', 'adjusted', 'alternating', 'around', 'epic', 'balanced', 'buried', 'cocaine', 'daily', "Storm's", 'doings:', 'dubious', 'establishment', 'faculties', 'opinion', 'finds', 'former', 'has', 'him', 'Putin', 'immense', 'intrusions', 'lately', 'lover', 'obama', "men's", 'mission', 'mysteries', 'observer--excellent', 'President', 'one', 'particularly', 'powers', 'reasoner', 'trump', 'save', 'she', 'softer', 'study', 'NASA', 'temperament', 'things', 'trained', 'veil', 'telescope', 'who', 'yet', 'He woman', 'had Hawaii', 'whole I', 'he volcano', 'all which', 'his Blade', 'was as', 'clearing to', 'that for', 'nature one', 'Holmes a and', 'but emotion infinity', 'from have him', 'in of own', 'the up War', 'were who formula', 'with teen Pilot cockpit']

['Baker', 'He', 'It', 'They', 'France', 'abhorrent', 'adjusted', 'alternating', 'around', 'epic', 'balanced', 'buried', 'cocaine', 'daily', "Storm's", 'doings:', 'dubious', 'establishment', 'facu

In [18]:
# print(word_list)
# print(word_dict)

{0: ['her', 'clearing', 'instrument', 'of', 'singular', 'every', 'lenses', 'and', 'He', 'the', 'him', 'at', 'which', 'false', 'successfully', 'to', 'the', 'and', 'In', 'Street'], 1: ['And', 'attracted', 'from', 'that', 'and', 'the', 'any', 'and', 'remained', 'intrusions', 'might', 'was', 'but', 'all', 'of', 'his', 'Atkinson', 'any', 'veil', 'that'], 2: ['occupied', 'a', 'whole', 'with', 'never', 'his', 'of', 'establishment', 'always', 'for', 'his', 'some', 'delicately', 'in', 'sneer', 'observation', 'have', 'of', 'always', 'eclipses'], 3: ['perfect', 'in', 'seen', 'clues', 'seen', 'drifted', 'ever', 'spoke', 'of', 'hopeless', 'to', 'instrument', 'remained', 'I', 'felt', 'save', 'readers', 'world', 'was', 'and'], 4: ['vague', 'He', 'And', 'clearing', 'of', 'brothers', 'which', 'of', 'balanced', 'Bohemian', 'more', 'by', 'To', 'late', 'be', 'and', 'as', 'motives', 'his', 'Atkinson'], 5: ['readers', 'which', 'as', 'home-centred', 'intrusions', 'of', 'cold', 'the', 'were', 'gibe', 'of', 't

In [31]:
# Start one output line per query: the query followed by a tab.
# Single-word queries are strings and multi-word queries are lists; both
# are currently formatted the same way, so lists appear as their repr.
s = "".join(f"{query}\t\n" for query in test)

In [32]:
# Show the accumulated text: one tab-terminated query per line.
print(s)

Baker	
He	
It	
They	
France	
abhorrent	
adjusted	
alternating	
around	
epic	
balanced	
buried	
cocaine	
daily	
Storm's	
doings:	
dubious	
establishment	
faculties	
opinion	
finds	
former	
has	
him	
Putin	
immense	
intrusions	
lately	
lover	
obama	
men's	
mission	
mysteries	
observer--excellent	
President	
one	
particularly	
powers	
reasoner	
trump	
save	
she	
softer	
study	
NASA	
temperament	
things	
trained	
veil	
telescope	
who	
yet	
['He', 'woman']	
['had', 'Hawaii']	
['whole', 'I']	
['he', 'volcano']	
['all', 'which']	
['his', 'Blade']	
['was', 'as']	
['clearing', 'to']	
['that', 'for']	
['nature', 'one']	
['Holmes', 'a', 'and']	
['but', 'emotion', 'infinity']	
['from', 'have', 'him']	
['in', 'of', 'own']	
['the', 'up', 'War']	
['were', 'who', 'formula']	
['with', 'teen', 'Pilot', 'cockpit']	

