In [12]:
import operator
import networkx as nx 

In [13]:
def read_file(filename: str) -> tuple:
    """Load a directed graph from a whitespace-separated edge-list file.

    Each non-blank line is expected to hold a source node followed by a
    target node, separated by any run of whitespace (spaces or tabs). Tokens
    after the second one are ignored. Blank lines are skipped.

    Args:
        filename: Path of the edge-list file to read.

    Returns:
        A ``(graph, edges)`` tuple. ``graph`` maps every node seen in the
        file to the list of its out-neighbours (a node that only ever
        appears as a target maps to an empty list). ``edges`` is the list of
        ``(source, target)`` pairs in file order.

    Raises:
        IndexError: If a non-blank line holds fewer than two tokens.
    """
    graph = {}
    edges = []
    with open(filename, 'r') as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and
            # returns [] for a blank line, so stray empty lines (such as a
            # trailing newline at end of file) no longer crash the loader.
            tokens = line.split()
            if not tokens:
                continue
            source, target = tokens[0], tokens[1]
            edges.append((source, target))
            graph.setdefault(source, []).append(target)
            # Register the target too so every node is a key of the graph.
            graph.setdefault(target, [])
    return graph, edges

In [14]:
def hits(graph: dict, k: int = 20) -> tuple:
    """Compute hub and authority scores with ``k`` rounds of HITS updates.

    Every round first sets each page's authority score to the sum of the hub
    scores of the pages linking to it, normalises the authority scores so
    they sum to 1, then sets each page's hub score to the sum of the
    authority scores of the pages it links to and normalises those as well.

    Args:
        graph: Adjacency mapping ``page -> list of pages it links to``.
            Every page listed as a link target must also be a key of
            ``graph``.
        k: Number of update rounds to run.

    Returns:
        A ``(hub_scores, authority_scores)`` tuple of dicts keyed by page.
        If a round produces scores that sum to zero (for example a graph
        without any edges) the normalisation for that step is skipped, so
        the scores stay at zero instead of raising ``ZeroDivisionError``.
    """
    hub_scores = {page: 1 for page in graph}
    authority_scores = {page: 1 for page in graph}

    # Invert the adjacency lists once instead of rescanning every list for
    # every page on every round. Sources are appended in graph order and
    # de-duplicated per source, so each linking page contributes exactly
    # once to a target's authority, in the same order as a full scan would.
    in_links = {page: [] for page in graph}
    for source, targets in graph.items():
        for target in dict.fromkeys(targets):
            if target in in_links:
                in_links[target].append(source)

    for _ in range(k):
        # Authority update: sum of hub scores over incoming links.
        for page in graph:
            temp_authority = 0
            for in_link in in_links[page]:
                temp_authority += hub_scores[in_link]
            authority_scores[page] = temp_authority

        sum_authorities = sum(authority_scores.values())

        # A zero total means there is nothing to normalise.
        if sum_authorities:
            for page in authority_scores:
                authority_scores[page] /= sum_authorities

        # Hub update: sum of authority scores over outgoing links.
        for page in graph:
            temp_hub = 0
            for out_link in graph[page]:
                temp_hub += authority_scores[out_link]
            hub_scores[page] = temp_hub

        sum_hubs = sum(hub_scores.values())

        if sum_hubs:
            for page in hub_scores:
                hub_scores[page] /= sum_hubs

    return hub_scores, authority_scores

In [15]:
def get_top_k(d: dict, k: int = 10) -> list:
    """Return the ``k`` highest-valued ``(key, value)`` pairs of ``d``.

    Pairs are ordered by value, largest first; pairs with equal values keep
    the relative order they have in ``d``.
    """
    ranked = list(d.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

In [16]:
# Load the edge list and score every page with the hand-written HITS routine.
graph, edges = read_file('data1.txt')
hubs, authorities = hits(graph)

# Build the same graph in networkx to get PageRank scores for comparison.
G = nx.DiGraph()
G.add_edges_from(edges)
pr = nx.pagerank(G)

# Rank each score table once, then print all three with the same layout.
top_10_hubs = get_top_k(hubs)
top_10_authorities = get_top_k(authorities)
top_10_pr = get_top_k(pr)

sections = [
    ("Top 10 Hubs:", top_10_hubs),
    ("Top 10 Authorities:", top_10_authorities),
    ("Top 10 page rank:", top_10_pr),
]
for position, (title, ranking) in enumerate(sections):
    # A blank line separates consecutive sections; none follows the last one.
    if position:
        print()
    print(title)
    for page, score in ranking:
        print(f"{page}: {round(score, 3)}")

Top 10 Hubs:
E: 0.259
G: 0.171
B: 0.158
F: 0.158
D: 0.134
A: 0.046
C: 0.037
H: 0.037

Top 10 Authorities:
C: 0.388
D: 0.135
B: 0.114
F: 0.114
A: 0.109
E: 0.07
H: 0.07
G: 0.0

Top 10 page rank:
A: 0.3
C: 0.292
D: 0.28
E: 0.029
H: 0.029
B: 0.025
F: 0.025
G: 0.019
