In [5]:
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

### Finding the Giant Component

In [6]:
from pathlib import Path

# Edge list: one comma-separated pair of node ids per line.
path = Path('../data/processed_except/pantip_edge_10.csv').resolve()
di_graph = nx.DiGraph()
with open(path, 'r') as input_file:
    # Iterate the file object directly: a blank line in the middle of the
    # data is skipped instead of being mistaken for EOF, which used to drop
    # every edge that followed it.
    for raw_line in input_file:
        line = raw_line.rstrip()
        if not line:
            continue

        splits = line.split(',')
        # prevent self-loop
        if splits[0] != splits[1]:
            # The edge runs from the second column to the first column.
            di_graph.add_edge(splits[1], splits[0])

# Finding the Giant Component
# Connectivity is evaluated on the undirected view; components are ordered
# from most to fewest nodes so that index 0 is the largest one.
undi_graph = di_graph.to_undirected()
graph_components = sorted(
    (undi_graph.subgraph(c) for c in nx.connected_components(undi_graph)),
    key=len,
    reverse=True,
)

# Directed subgraph restricted to the nodes of the largest component.
giant_component = di_graph.subgraph(graph_components[0].nodes())

# Build the frame in a single call; assigning one row at a time through
# `.loc` re-allocates the frame on every edge.
edge_df = pd.DataFrame(
    list(giant_component.edges()),
    columns=['Source', 'Destination'],
)

edge_df.head()

Unnamed: 0,Source,Destination
0,38185138,38184895
1,36815279,31746773
2,32531118,35249916
3,38156318,38155883
4,38163904,38163673


### PageRank

In [7]:
# PageRank score for every node of the giant component (library defaults).
pr = nx.pagerank(giant_component)

# (topic id, score) pairs ordered from the highest score to the lowest.
pr_tuple = sorted(pr.items(), key=lambda x: x[1], reverse=True)

# Build the frame in a single call; assigning one row at a time through
# `.loc` re-allocates the frame on every row.
pr_df = pd.DataFrame(pr_tuple, columns=['TopicID', 'PageRank Value'])
pr_df.head(10)

Unnamed: 0,TopicID,PageRank Value
0,38176352,0.002464
1,34502095,0.002418
2,34222014,0.002349
3,32983135,0.002219
4,32447263,0.002058
5,38179329,0.00202
6,34433699,0.001966
7,36111851,0.001948
8,35268526,0.001925
9,31791000,0.00185


#### Finding Topic Tags

In [8]:
from pathlib import Path
import json

# Maps each record's 'topic_id' to its 'tags'; the file holds one JSON
# object per line.
topic_tags = {}
tag_path = Path('../data/processed_except/topic_tags.json').resolve()
with open(tag_path, 'r', encoding='utf-8') as tf:
    for line in tf:
        # A blank line (e.g. a trailing newline at the end of the file) is
        # not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        cur_json = json.loads(line)
        topic_tags[cur_json['topic_id']] = cur_json['tags']

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\data_mining_project\\data\\processed_except\\topic_tags.json'

#### PageRank Result

In [3]:
from pathlib import Path
import json

# Maps each record's 'topic_id' to its 'title'; the file holds one JSON
# object per line.
topics_title = {}
path = Path('../data/processed_except/topics.json').resolve()
with open(path, 'r', encoding='utf-8') as tf:
    for line in tf:
        # A blank line (e.g. a trailing newline at the end of the file) is
        # not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        cur_json = json.loads(line)
        topics_title[cur_json['topic_id']] = cur_json['title']

In [4]:
# Walk the PageRank ranking from the top and keep a topic only when none of
# its tags has been seen on a previously kept topic.
MAX_SCANNED = 1000      # how far down the ranking to look
MAX_RECOMMENDED = 100   # stop once this many topics have been kept

recommend_set = set()   # every tag of every kept topic
count = 0               # number of kept topics
index = 0               # rank position of the last kept topic

# NOTE(review): ranked ids come from the edge CSV as strings; confirm that
# the keys of topic_tags use the same type, otherwise the lookup raises
# KeyError.
# Slicing (instead of range(1000)) keeps the loop inside the ranking when it
# holds fewer than MAX_SCANNED entries.
for i, (topic_id, score) in enumerate(pr_tuple[:MAX_SCANNED]):
    if count >= MAX_RECOMMENDED:
        break
    cur_set = set(topic_tags[topic_id])
    if recommend_set.isdisjoint(cur_set):
        recommend_set.update(cur_set)
        count += 1
        print(topic_id)
        print(score)
        print(topic_tags[topic_id])
        index = i
        print("===============")
print(index)

NameError: name 'topic_tags' is not defined

### HITS

In [23]:
# Hub and authority scores for every node of the giant component.
hubs, auths = nx.hits(giant_component, max_iter=500)

# topic -> [hub score, authority score]
hits_data = {topic: [hubs[topic], auths[topic]] for topic in hubs}

# Rows of [topic, hub, authority], highest authority first.
hits_data_arr = sorted(
    ([topic, scores[0], scores[1]] for topic, scores in hits_data.items()),
    key=lambda x: x[2],
    reverse=True,
)

# Build the frame in a single call; assigning one row at a time through
# `.loc` re-allocates the frame on every row.
hits_df = pd.DataFrame(hits_data_arr, columns=['TopicID', 'Hub', 'Authority'])

hits_df.head(10)

Unnamed: 0,TopicID,Hub,Authority
0,38122354,0.014925,0.115598
1,38116097,0.01962,0.099313
2,38104881,0.089684,0.09323
3,38128544,0.036114,0.087938
4,38106313,0.045879,0.07835
5,38111299,0.049774,0.077767
6,38140936,0.108599,0.075499
7,38178317,0.094059,0.069434
8,38147148,0.136068,0.065974
9,38173055,0.098851,0.055281
