In [1]:
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

# Import Graph Data

In [2]:
from pathlib import Path

path = Path('../data/processed_except/user_topic_filtered.csv').resolve();
di_graph = nx.DiGraph()
user_set = set()
with open(path, 'r') as input_file:
    # skip first line
    line = input_file.readline().rstrip()
    while True:
        line = input_file.readline().rstrip()

        # EOF
        if line == '':
            break

        splits = line.split(',')
        # prevent self-loop
        if splits[0] != splits[1]:
            user_set.add(splits[1])
            di_graph.add_edge(splits[1], splits[0])

In [3]:
# edge_df = pd.DataFrame(columns=[ 'Source','Destination' ])
# for index, edge in enumerate(di_graph.edges()):
#     (src, dest) = edge
#     edge_df.loc[index] = [ src, dest ]
# edge_df.head()

In [3]:
user_pairs = []
user_list = list(user_set)
for x in range(len(user_list)):
    for y in range(x + 1, len(user_list)):
        user_pairs.append((user_list[x], user_list[y]))

# Jaccard's Coefficient

In [4]:
un_graph = di_graph.to_undirected()

jaccard_result = []
for pair in user_pairs:
    u = pair[0]
    v = pair[1]
    u_set = set(nx.all_neighbors(un_graph, u))
    v_set = set(nx.all_neighbors(un_graph, v))
    uv_com_set = u_set.intersection(v_set)
    uv_un_set = u_set.union(v_set)
    jaccard_coef = 0
    if len(uv_un_set) > 0:
        jaccard_coef = len(uv_com_set) / len(uv_un_set)
    if jaccard_coef > 0:
        jaccard_result.append((u, v, jaccard_coef))

In [5]:
jaccard_result = sorted(jaccard_result, key=lambda x: x[2], reverse=True)

# Result

In [6]:
flitered_jaccard_result = list(filter(lambda x: x [2] >= 0.5, jaccard_result))

In [7]:
user_topics_result = {}
for data in flitered_jaccard_result:
    u = data[0]
    v = data[1]
    if u not in user_topics_result:
        user_topics_result[u] = set()
    if v not in user_topics_result:
        user_topics_result[v] = set()
        
    u_set = set(nx.all_neighbors(un_graph, u))
    v_set = set(nx.all_neighbors(un_graph, v))
    uv_com_set = u_set.intersection(v_set)
    uv_un_set = u_set.union(v_set)
    
    un_dif_com = uv_un_set.difference(uv_com_set)
    user_topics_result[u] = user_topics_result[u].union(un_dif_com.difference(u_set))
    user_topics_result[v] = user_topics_result[v].union(un_dif_com.difference(v_set))

In [8]:
result_df = pd.DataFrame(columns=[ 'User', 'Prediction Topic' ])
for index, utr in enumerate(user_topics_result):
    result_df.loc[index] = [ utr, list(user_topics_result[utr]) ]
result_df.head(50)

Unnamed: 0,User,Prediction Topic
0,oh2radjylBPW4Vduxt,[37700048]
1,olteszesbBJkh3g5q3j,[36894667]
2,pb8oxl3b5QQdNjSQD0D,"[38179746, 38180028, 38167459, 38180612, 38177..."
3,oborxz90uMoIfuZQyvj,"[38181846, 38179746, 38180028, 38181711, 38178..."
4,ozevb32vj5xsix6R3Ze,[36136603]
5,p363sjr6mdU7oi8LH8,"[31177339, 36245659, 35108592, 30769364]"
6,pgs5xc4l1w0koWc444Jv,"[36583753, 31879007, 31440005, 36366065]"
7,ozt5ze11ajWJAJYxtdfa,[30102456]
8,pgwckg3pd3zRh341PPIi,"[38166121, 36386170, 38166177]"
9,nkjl6j972BZq5RO2LRp,"[36531498, 33329584]"
