In [1]:
import csv
import pickle
import random
import networkx

# Read in raw data

In [2]:
# Adjacency dict: first integer on each tab-separated row is the source
# node, the remaining integers are the nodes it links to.
d = {}
with open("./train.txt", 'r') as train_file:
    for row in csv.reader(train_file, delimiter="\t"):
        node_ids = [int(token) for token in row]
        d[node_ids[0]] = node_ids[1:]
        

# Dump dictionary (if needed)

In [6]:
# Optional cache of the parsed adjacency dict so train.txt need not be re-parsed.
with open("./d.pickle", "wb") as pickle_file:
    pickle.dump(d, pickle_file)

# Generate 5k unconnected and 5k connected pairs

In [3]:
random.seed(1)

# Negative samples: 5000 distinct ordered pairs of source nodes with no
# edge recorded in either direction.
#
# random.sample() needs a real sequence: dict views raise TypeError on
# Python 3.11+, and older versions copied the view into a tuple on every
# call. Snapshot the keys once instead (same order, so the same seed still
# yields the same pairs on versions where the original ran).
source_nodes = list(d)

# Set mirror of the result list so the duplicate check is O(1), not a list scan.
seen_unconnected = set()
unconnected_pairs = []
while len(unconnected_pairs) < 5000:
    # generate random sample
    x, y = random.sample(source_nodes, k=2)

    if (x, y) in seen_unconnected:
        continue
    if x in d[y] or y in d[x]:
        continue

    seen_unconnected.add((x, y))
    unconnected_pairs.append((x, y))

print("done")

done


In [4]:
# Positive samples: 5000 distinct (source, sink) pairs taken from the
# adjacency dict. Continues the random stream seeded in the previous cell.
#
# random.sample() needs a real sequence (dict views raise TypeError on
# Python 3.11+), so snapshot the keys once.
source_nodes = list(d)

seen_connected = set()
connected_pairs = []
while len(connected_pairs) < 5000:
    x = random.sample(source_nodes, k=1)[0]

    # A source with no outgoing links cannot supply a sink. The original
    # relied on a bare `except: pass` to skip this case, which also
    # swallowed KeyboardInterrupt and any genuine bug inside the loop.
    if not d[x]:
        continue

    y = random.sample(d[x], k=1)[0]
    if (x, y) not in seen_connected:
        seen_connected.add((x, y))
        connected_pairs.append((x, y))

print("done")
    

done


In [5]:
# Sanity check: should equal 5000, the bound of the sampling loop.
len(connected_pairs)

5000

In [6]:
# Sanity check: should equal 5000, the bound of the sampling loop.
len(unconnected_pairs)

5000

# Combine lists and build graph from original dataset

In [7]:
# Order matters: connected pairs first, then unconnected ones. The label
# vector built before the CSV export (`ones`) relies on this layout.
mixed_pairs = connected_pairs + unconnected_pairs

In [7]:
# Build an undirected graph straight from the adjacency dict
# (node id -> list of neighbour ids); edge direction is discarded here.
# https://stackoverflow.com/questions/26665799/networkx-adding-edges-to-a-graph-from-a-dictionary-with-lists-as-values
G = networkx.Graph(d)

In [9]:
# Print a summary of G (node count, edge count, average degree).
# NOTE(review): networkx.info() is deprecated in later 2.x releases and, I
# believe, removed in NetworkX 3.0 - confirm against the pinned version.
print(networkx.info(G))

Name: 
Type: Graph
Number of nodes: 4867136
Number of edges: 23416061
Average degree:   9.6221


In [None]:
# Directed counterpart of G built from the same adjacency dict: each key
# points at every node in its list.
J = networkx.DiGraph(d)
# Leftover alternative that added edges from the string edge list `z`
# (only defined in a cell near the end of the notebook):
#J.add_edges_from(z)

In [None]:
# PageRank score for every node of the directed graph J.
# Fix: the package is imported as `networkx`; the `nx` alias is never
# defined in this notebook, so the original line raised NameError.
# NOTE(review): pagerank_scipy only exists in NetworkX 2.x; on 3.x the
# equivalent call is networkx.pagerank(J) - confirm the installed version.
page_rank = networkx.pagerank_scipy(J)

# Compute graph metrics based on the 10k sample

In [9]:
# Jaccard coefficient of every sampled pair on the undirected graph G.
# Materialised as a list because the export cell reads it by index (jc[i][2]).
jc = list(networkx.jaccard_coefficient(G, mixed_pairs))

In [16]:
# Peek at the first five pairs; the list starts with the connected samples.
mixed_pairs[:5]

[(1971490, 4521813),
 (2863976, 926731),
 (370525, 3608160),
 (3682375, 2984819),
 (4342778, 94321)]

In [11]:
# Resource-allocation index of every sampled pair on G; a list so the
# export cell can read the score by index (ra[i][2]).
ra = list(networkx.resource_allocation_index(G, mixed_pairs))

In [12]:
# Adamic-Adar index of every sampled pair on G; a list so the export cell
# can read the score by index (aa[i][2]).
aa = list(networkx.adamic_adar_index(G, mixed_pairs))

In [13]:
# Preferential-attachment score of every sampled pair on G; a list so the
# export cell can read the score by index (pa[i][2]).
pa = list(networkx.preferential_attachment(G, mixed_pairs))

In [14]:
# Number of common neighbours per sampled pair, stored as (u, v, count)
# triples to match the layout of the other metric lists.
cn = [
    (u, v, len(list(networkx.common_neighbors(G, u, v))))
    for u, v in mixed_pairs
]

In [24]:
# The Soundarajan-Hopcroft / within-inter-cluster metrics below read a
# 'community' attribute from each node; put every node in community 0.
# Fix: iterate over the real node ids. `range(len(G))` only works when the
# ids are exactly 0..len(G)-1 and raises KeyError on any gap.
networkx.set_node_attributes(G, 0, 'community')

In [25]:
# Soundarajan-Hopcroft common-neighbour score per sampled pair. Relies on
# the 'community' node attribute assigned in the preceding cell (all 0).
cn_sh = list(networkx.cn_soundarajan_hopcroft(G, mixed_pairs))

In [26]:
# Soundarajan-Hopcroft resource-allocation score per sampled pair. Relies
# on the 'community' node attribute assigned earlier (all 0).
ra_sh = list(networkx.ra_index_soundarajan_hopcroft(G, mixed_pairs))

In [27]:
# Within/inter-cluster ratio per sampled pair. Relies on the 'community'
# node attribute assigned earlier (all 0).
wic = list(networkx.within_inter_cluster(G, mixed_pairs))

In [None]:
# PageRank of both endpoints of every sampled pair.
# Fix: `string_pairs` is never defined anywhere in this notebook
# (NameError). J is built from the integer-keyed dict `d`, so the integer
# pairs in `mixed_pairs` are the matching lookup keys.
# NOTE(review): assumes the intent was the 10k sampled pairs - confirm.
page_rank_source = [page_rank[x] for x, _ in mixed_pairs]
page_rank_sink = [page_rank[y] for _, y in mixed_pairs]

In [None]:
# HITS hub and authority scores for every node of the directed graph J.
# Fix: the package is imported as `networkx`; `nx` is never defined.
hub_score, authority_score = networkx.hits(J)

In [None]:
# Hub / authority score of both endpoints of every sampled pair.
# Fix: `string_pairs` is never defined in this notebook (NameError); the
# integer pairs in `mixed_pairs` match the node ids used to build J.
# NOTE(review): assumes the intent was the 10k sampled pairs - confirm.
hub_source = [hub_score[x] for x, _ in mixed_pairs]
hub_sink = [hub_score[y] for _, y in mixed_pairs]
authority_source = [authority_score[x] for x, _ in mixed_pairs]
authority_sink = [authority_score[y] for _, y in mixed_pairs]

In [None]:
# Out- and in-degree centrality for every node of the directed graph J.
# Fix: the package is imported as `networkx`; `nx` is never defined.
out_degree_centrality = networkx.out_degree_centrality(J)
in_degree_centrality = networkx.in_degree_centrality(J)

In [None]:
# Degree-centrality features of both endpoints of every sampled pair.
# Fix: `string_pairs` is never defined in this notebook (NameError); the
# integer pairs in `mixed_pairs` match the node ids used to build J.
# NOTE(review): assumes the intent was the 10k sampled pairs - confirm.
source_out_degree = [out_degree_centrality[x] for x, _ in mixed_pairs]
source_in_degree = [in_degree_centrality[x] for x, _ in mixed_pairs]
sink_out_degree = [out_degree_centrality[y] for _, y in mixed_pairs]
sink_in_degree = [in_degree_centrality[y] for _, y in mixed_pairs]

In [None]:
# Directed shortest-path length from source to sink for every sampled pair.
# Fixes relative to the original cell:
#   * `nx` -> `networkx` (the alias is never defined),
#   * `string_pairs` -> `mixed_pairs` (the former is never defined),
#   * catch only NetworkXNoPath instead of a bare `except`,
#   * unreachable pairs get max(observed) + 1 as the original comment
#     ("find the max and add one") intended; the old placeholder of 0 made
#     unreachable pairs look closer than directly linked ones.
sp = []
for (x, y) in mixed_pairs:
    try:
        sp.append(networkx.shortest_path_length(J, x, y))
    except networkx.NetworkXNoPath:
        sp.append(None)  # filled in below once the maximum is known

no_path_length = max((length for length in sp if length is not None), default=0) + 1
sp = [no_path_length if length is None else length for length in sp]

In [32]:
#pr = networkx.pagerank(G, alpha=0.9)

# Save results to .csv

In [28]:
# Label vector aligned with mixed_pairs: N ones (connected) then N zeros.
N = 5000
ones = [1] * N + [0] * N

In [29]:
# Assemble the training table: one header row, then one row per sampled
# pair holding its label and the third field of each metric triple.
train_header = ("Souce", "Sink", "Connected", "Jaccard", "Resource_alloc", "Adamic_adar", "Preferential_attachment", "Common Neighbours", "CN Sound-Hopcroft", "Resoruce Alloc Index", "Within Inner Cluster")
write_data = [train_header]
write_data.extend(
    (
        source, sink, ones[k],
        jc[k][2], ra[k][2], aa[k][2], pa[k][2],
        cn[k][2], cn_sh[k][2], ra_sh[k][2], wic[k][2],
    )
    for k, (source, sink) in enumerate(mixed_pairs)
)

In [30]:
# Persist the training table; newline="" lets the csv module control line endings.
with open("./trainData.csv", 'w', newline="") as train_csv:
    csv.writer(train_csv).writerows(write_data)
    

## Create features for test data

In [3]:
# Read the public test file and keep columns 1 and 2 of every row as raw
# strings. The header row is still included at this point; a later cell
# removes it and converts the ids to int.
test_list = []
with open("./test-public.txt", 'r') as test_file:
    test_list.extend(
        (row[1], row[2])
        for row in csv.reader(test_file, delimiter='\t')
    )
        

In [15]:
# Drop the header row read from test-public.txt.
# Guarded so the cell is idempotent: an unconditional `test_list[1:]`
# silently discards a real test pair every time the cell is re-run. The
# header is recognised as a first entry that is still a non-numeric string.
if (
    test_list
    and isinstance(test_list[0][0], str)
    and not test_list[0][0].strip().isdigit()
):
    test_list = test_list[1:]

In [18]:
# Convert both node ids of every test pair to int so they match the keys of G.
test_list = [(int(source), int(sink)) for source, sink in test_list]

In [19]:
# Peek at the first five parsed (source, sink) test pairs.
test_list[:5]

[(2184483, 1300190),
 (3151356, 1452193),
 (1579396, 193159),
 (1406432, 2481036),
 (2389638, 593017)]

In [20]:
# Number of test pairs left after header removal and int conversion.
len(test_list)

2000

In [21]:
# Jaccard coefficient of every test pair on G; a list so the export cell
# can read the score by index (jc_test[i][2]).
jc_test = list(networkx.jaccard_coefficient(G, test_list))

In [31]:
# Resource-allocation index of every test pair on G (read later as ra_test[i][2]).
ra_test = list(networkx.resource_allocation_index(G, test_list))

In [22]:
# Adamic-Adar index of every test pair on G (read later as aa_test[i][2]).
aa_test = list(networkx.adamic_adar_index(G, test_list))

In [23]:
# Preferential-attachment score of every test pair on G (read later as pa_test[i][2]).
pa_test = list(networkx.preferential_attachment(G, test_list))

In [24]:
# Number of common neighbours per test pair, stored as (u, v, count)
# triples to match the layout of the other metric lists.
cn_test = [
    (u, v, len(list(networkx.common_neighbors(G, u, v))))
    for u, v in test_list
]

In [25]:
# Ensure every node carries the 'community' attribute (all 0) that the
# Soundarajan-Hopcroft / within-inter-cluster metrics read.
# Fix: iterate over the real node ids. `range(len(G))` only works when the
# ids are exactly 0..len(G)-1 and raises KeyError on any gap.
networkx.set_node_attributes(G, 0, 'community')

In [27]:
# Soundarajan-Hopcroft common-neighbour score per test pair; relies on the
# 'community' node attribute (all 0).
cn_sh_test = list(networkx.cn_soundarajan_hopcroft(G, test_list))

In [28]:
# Soundarajan-Hopcroft resource-allocation score per test pair; relies on
# the 'community' node attribute (all 0).
ra_sh_test = list(networkx.ra_index_soundarajan_hopcroft(G, test_list))

In [29]:
# Within/inter-cluster ratio per test pair; relies on the 'community'
# node attribute (all 0).
wic_test = list(networkx.within_inter_cluster(G, test_list))

In [32]:
# Assemble the test table: same feature columns as the training table but
# without the "Connected" label. Reuses the name `write_data`.
test_header = ("Souce", "Sink", "Jaccard", "Resource_alloc", "Adamic_adar", "Preferential_attachment", "Common Neighbours", "CN Sound-Hopcroft", "Resoruce Alloc Index", "Within Inner Cluster")
write_data = [test_header]
write_data.extend(
    (
        source, sink,
        jc_test[k][2], ra_test[k][2], aa_test[k][2], pa_test[k][2],
        cn_test[k][2], cn_sh_test[k][2], ra_sh_test[k][2], wic_test[k][2],
    )
    for k, (source, sink) in enumerate(test_list)
)

In [33]:
# Persist the test table; newline="" lets the csv module control line endings.
with open("./testData.csv", 'w', newline="") as test_csv:
    csv.writer(test_csv).writerows(write_data)

## node2vec implementation

In [13]:
# Distinct node ids that appear anywhere in the test pairs.
# Fix: iterate over the whole list. The header row is already removed by
# the dedicated cell above, so the original `test_list[1:]` dropped the
# first real test pair when the notebook runs top to bottom. If a header
# were still present, int() would fail loudly instead of hiding it.
test_set = {int(node) for pair in test_list for node in pair}

In [14]:
# Count of distinct node ids collected from the test pairs.
len(test_set)

3948

In [20]:
# Ad-hoc probe: draw two entries from the adjacency list of one hard-coded node id.
random.sample(d[1933312],2)

[3523359, 4740709]

In [77]:
# NOTE(review): `z` is only assigned in a cell further down the notebook, so
# running top to bottom on a fresh kernel raises NameError here (as the
# recorded output shows). Move this below the cell that builds `z`, or drop it.
z[:5]

NameError: name 'z' is not defined

In [56]:
def sink_generator(x):
    """Return two distinct entries drawn at random from the adjacency list d[x]."""
    return random.sample(d[x], 2)

In [84]:
# Breadth step of the edge list: for every test node emit up to two edges.
#   * node has no adjacency list in d -> one edge (reverse_d[node], node)
#   * node has >= 2 listed sinks      -> two edges to randomly drawn sinks
#   * node has exactly 1 listed sink  -> that single edge
#   * node has an empty list          -> nothing
# NOTE(review): `reverse_d` is assigned in a cell further down the notebook;
# that cell must be executed before this one.
my_width = []
for node in test_set:
    if node not in d:
        my_width.append((reverse_d[node], node))
        continue

    out_degree = len(d[node])
    if out_degree >= 2:
        first_sink, second_sink = sink_generator(node)
        my_width.append((node, first_sink))
        my_width.append((node, second_sink))
    elif out_degree == 1:
        my_width.append((node, d[node][0]))

In [85]:
# Number of breadth edges generated for the test nodes.
len(my_width)

6240

In [63]:
# Duplicate of the previous cell. Its recorded output carries an earlier
# execution count (63 vs 85), i.e. it predates the current my_width.
len(my_width)

6247

In [67]:
# Peek at the first breadth edges. The recorded output still shows 1-tuples,
# which only the now commented-out `my_width.append((node,))` branches
# produced, so it is stale relative to the current code.
my_width[:5]

[(1933312, 4568979),
 (1933312, 1470023),
 (3072002,),
 (2351112,),
 (2261005, 571350)]

In [87]:
# Depth step: from the last node of each breadth edge, follow the first
# entry of its adjacency list (when the node has a non-empty list in d).
my_depth = [
    (tail, d[tail][0])
    for tail in (edge[-1] for edge in my_width)
    if tail in d and d[tail]
]

In [88]:
# Number of depth edges generated.
len(my_depth)

1326

In [89]:
# Full edge list for the node2vec input file: breadth edges followed by depth edges.
final_list = my_width + my_depth

In [90]:
# Total number of edges that will be written to the node2vec input file.
len(final_list)

7566

In [94]:
# Write the edge list as one "source sink" pair per line.
# Fix: writelines() expects an iterable of lines. Handing it a single string
# made it iterate character by character; same bytes on disk, but only by
# accident. Feed it one formatted line per edge instead.
with open('./node2vec/node2v.txt', 'w') as edge_file:
    edge_file.writelines(f'{x} {y}\n' for x, y in final_list)

In [78]:
# Re-read both input files as raw rows of strings (no int conversion).
# `csv` is already imported in the first cell of the notebook.
with open('train.txt') as train_file:
    data = list(csv.reader(train_file, delimiter='\t'))

with open('test-public.txt') as public_test_file:
    test_data = list(csv.reader(public_test_file, delimiter='\t'))

In [79]:
# Turn the training rows into a flat list of (source, sink) string tuples,
# one tuple per relationship.
# Cleanup relative to the original cell: iterate rows directly instead of
# by index, build the pairs without a throwaway per-row list, and drop the
# never-used `b`. The contents of `a` and `z` and the printed output are
# unchanged.

# Source id (first column) of every training row.
a = [row[0] for row in data]

# Debug output: number of sinks listed on the first two rows.
for row in data[:2]:
    print(len(row[1:]))

z = [(row[0], sink) for row in data for sink in row[1:]]

143
21


In [82]:
# Sink -> source lookup built from the edge list `z`. When a sink has
# several sources, the one appearing last in `z` wins.
reverse_d = {int(sink): int(source) for source, sink in z}

In [83]:
# Number of distinct nodes that appear as a sink in the training edges.
len(reverse_d)

4867136