# Task 2: Attribute Prediction

In [1]:
import pandas as pd
from math import log
import networkx as nx

In [2]:
network = pd.read_csv('network.tsv',sep = '\t',header=None)
network = network.rename(columns={0: 'node_id1', 1: 'node_id2'})
network['edges'] = list(zip(network['node_id1'], network['node_id2']))
network.head()  

Unnamed: 0,node_id1,node_id2,edges
0,3942361,4009630,"(3942361, 4009630)"
1,1862789,2403557,"(1862789, 2403557)"
2,3559086,3449838,"(3559086, 3449838)"
3,4268588,6041523,"(4268588, 6041523)"
4,3067063,3845402,"(3067063, 3845402)"


In [39]:
# import training data - 5301403 records
train = pd.read_csv('data/labeled-vertices.train.tsv', sep = '\t', header = None)
train = train.rename(columns={0: 'node_id', 1: 'attributes'})
train.head()

Unnamed: 0,node_id,attributes
0,5509623,T0:0 T1:0
1,6334893,T0:0 T1:1
2,1218900,T0:1 T1:2
3,3871398,T0:1 T1:2
4,3942361,T0:0 T1:3


In [3]:
# import test data (662675 rows; type:float64)
test = pd.read_csv('data/unlabeled-vertices.test.txt',header = None)
test = test.rename(columns={0: 'node_id'})
test.head()

Unnamed: 0,node_id
0,4546232
1,3711008
2,6394112
3,5883774
4,2843733


In [4]:
test_nodes = test.node_id.tolist()
include_test_df = network[(network.node_id1.isin(test_nodes) == True) | (network.node_id2.isin(test_nodes) == True)]
include_test_df.head()  # subgraph include all test nodes + its neighborhood
len(include_test_df)

747055

In [7]:
# add nearby train nodes upon include_test_df
train_nodes = train.node_id.tolist()
subset_df = include_test_df[(include_test_df.node_id1.isin(train_nodes) == True) | (include_test_df.node_id2.isin(train_nodes) == True)]


In [8]:
# construct graph
subset_edge_list = subset_df['edges'].tolist()
G = nx.Graph()
G.add_edges_from(subset_edge_list)

In [9]:
len(G.nodes())

1156688

In [10]:
G.add_nodes_from(test_nodes)

In [12]:
# parse attributes to training data nodes (in random graph)

attr_dict = {}
# for (k,v) in zip(train.node_id,train.attributes):
for (k,v) in zip(train_in_graph.node_id,train_in_graph.attributes):
    node_attrs = {}
    attrs = v.split() # ['T0:0', 'T1:0']
    
    node_attrs = {}        
    for attr in attrs:
        key = attr.split(':')[0] # key
        value = attr.split(':')[1] # value
        node_attrs[key] = value
    
    attr_dict[k] = node_attrs

nx.set_node_attributes(G, attr_dict)

In [36]:
def neighbourhood(G, n,k=1):
    if k == 1:
        return G[n]
    dist = nx.single_source_shortest_path_length(G, n, k)
    del dist[n]
    return dist.keys()


def get_node_neighbors(n):
    neighbors = list(neighbourhood(G, n,k=1)) 
    all_lst = []
    for neighbor_node in neighbors:
        neighbor_attrs = G.nodes[neighbor_node]

        lst = []
        for (k, v) in neighbor_attrs.items():
            lst.append((k,v))

        all_lst += lst
        
    return all_lst


In [None]:
def predict_node_attr(n):
    all_lst = get_node_neighbors(n)  # get_node_neighbors <<
    df = pd.DataFrame(all_lst, columns = ['attr','value'])
    target_df = df.groupby('attr').agg(lambda x:x.value_counts().index[0]) 
    
    target_node_attr = {}
    target_df.to_dict()['value']
    target_node_attr[n] = target_df.to_dict()['value']
    
    return target_node_attr

In [30]:
# divide test data into 5 batches and predict each of them
test_nodes[:132535][0]  # -> 4546232   
test_nodes[132535:265070][0]  # -> 647698   
test_nodes[265070:397605][0]  # -> 3529436   
test_nodes[397605:530140] [0]  #  -> 90666   
test_nodes[530140:][0]   # -> 971066  

4546232

In [39]:
from scipy import stats
import time

start_time = time.time() 
predict_attrs_bag5 = []

for test_node in test_nodes[530140:]:
    predict_attrs_bag5.append(predict_node_attr(test_node))

end_time = time.time()
print("predictions_bag5 created. Process took {:.04f} seconds".format(end_time - start_time))

predictions_bag5 created. Process took 1057.9442 seconds


In [40]:
import json
    
with open('task2_outputfile(bag5)', 'w') as fout:
    json.dump(predict_attrs_bag5, fout)