# Task 2: Attribute Prediction

Description: For attribute prediction, I construct the graph using a method similar to the one I used in Task 1. But instead of randomly selecting nodes from the network file, I specifically chose the nodes from the test data first to construct the network, because those are the nodes we actually care about and ultimately want results for. However, simply using the test nodes is not enough: we want each test node to be surrounded by training nodes, so that we can predict the test node's attributes from the training nodes' attributes. So I add the training nodes that are either in the test-node subgraph or in its neighborhood. Then, for each test node, I get the list of its neighboring nodes and use the "majority vote" intuition to decide which attributes the target test node should adopt.

In [1]:
import pandas as pd
from math import log
import networkx as nx

In [2]:
# Load the edge list: a tab-separated file without a header row, one edge per line.
network = pd.read_csv('network.tsv', sep='\t', header=None).rename(
    columns={0: 'node_id1', 1: 'node_id2'}
)
# Store every edge as an (endpoint, endpoint) tuple, the form used later to build the graph.
network['edges'] = list(zip(network.node_id1, network.node_id2))
network.head()

Unnamed: 0,node_id1,node_id2,edges
0,3942361,4009630,"(3942361, 4009630)"
1,1862789,2403557,"(1862789, 2403557)"
2,3559086,3449838,"(3559086, 3449838)"
3,4268588,6041523,"(4268588, 6041523)"
4,3067063,3845402,"(3067063, 3845402)"


In [3]:
# Load the labelled training vertices (5301403 records): a node id plus a
# space-separated string of "type:value" attributes per row.
train = pd.read_csv(
    'data/labeled-vertices.train.tsv', sep='\t', header=None
).rename(columns={0: 'node_id', 1: 'attributes'})
train.head()

Unnamed: 0,node_id,attributes
0,5509623,T0:0 T1:0
1,6334893,T0:0 T1:1
2,1218900,T0:1 T1:2
3,3871398,T0:1 T1:2
4,3942361,T0:0 T1:3


In [4]:
# Load the unlabelled test vertices (662675 rows; type:float64), one node id per row.
test = pd.read_csv(
    'data/unlabeled-vertices.test.txt', header=None
).rename(columns={0: 'node_id'})
test.head()

Unnamed: 0,node_id
0,4546232
1,3711008
2,6394112
3,5883774
4,2843733


In [5]:
train_nodes = train.node_id.tolist()

# Keep only the edges that have at least one labelled (training) endpoint.
# isin() returns a boolean mask, which can be combined and used for indexing directly.
include_train_df = network[
    network.node_id1.isin(train_nodes) | network.node_id2.isin(train_nodes)
]
len(include_train_df)

30812820

In [8]:
# From the edges that touch a training node, keep those that also touch a
# test node: these link each node we must predict to its labelled neighbours.
test_nodes = test.node_id.tolist()
subset_df = include_train_df[
    include_train_df.node_id1.isin(test_nodes) | include_train_df.node_id2.isin(test_nodes)
]


In [9]:
# Build the graph from the edge tuples kept in subset_df.
G = nx.Graph()
subset_edge_list = list(subset_df['edges'])
G.add_edges_from(subset_edge_list)

In [10]:
len(G.nodes())

1156688

In [11]:
# Make sure every test node is a node of G, including any test node that does
# not appear in subset_edge_list, so that it can still be looked up in the graph.
G.add_nodes_from(test_nodes)

In [12]:
# Training rows whose node id is one of the nodes of G.
train_in_graph = train[train.node_id.isin(G.nodes())]

In [13]:
# Turn the attribute string of every training node that is present in G
# (train_in_graph) into a {type: value} dict and attach it to that node.
attr_dict = {}
for node_id, raw_attrs in zip(train_in_graph.node_id, train_in_graph.attributes):
    parsed = {}
    for token in raw_attrs.split():  # e.g. ['T0:0', 'T1:0']
        pieces = token.split(':')
        parsed[pieces[0]] = pieces[1]  # 'T0' -> '0'
    attr_dict[node_id] = parsed

nx.set_node_attributes(G, attr_dict)

In [26]:
train.head(10)

Unnamed: 0,node_id,attributes
0,5509623,T0:0 T1:0
1,6334893,T0:0 T1:1
2,1218900,T0:1 T1:2
3,3871398,T0:1 T1:2
4,3942361,T0:0 T1:3
5,4009630,T0:0 T1:4
6,3890082,T0:0 T1:5 T8:0
7,1010073,T0:0 T1:6
8,3348867,T0:0 T1:7
9,4017062,T0:0 T1:8


In [54]:
# Lazily built {node_id: attribute string} lookup, stored together with the
# DataFrame it was built from so that rebinding `train` triggers a rebuild.
_ATTR_LOOKUP = {'frame': None, 'by_node': {}}


def get_attr(node):
    """Return the attributes of a training node as a list of 'type:value' strings.

    The lookup table is built from the global `train` DataFrame on first use,
    so each call is a dictionary lookup instead of a scan over every training
    row. If a node id is listed more than once, its first row is used.

    Args:
        node: id of the node to look up.

    Returns:
        The node's attribute string split on whitespace, e.g.
        ['T0:0', 'T1:0'], or an empty list when `node` has no training row.
    """
    if _ATTR_LOOKUP['frame'] is not train:
        first_rows = train.drop_duplicates('node_id')
        _ATTR_LOOKUP['by_node'] = dict(zip(first_rows.node_id, first_rows.attributes))
        _ATTR_LOOKUP['frame'] = train

    raw = _ATTR_LOOKUP['by_node'].get(node)
    if raw is None:
        # unlabelled node: it simply contributes no attributes
        return []
    # split() without an argument ignores repeated spaces, so no empty tokens are produced
    return raw.split()

get_attr(5509623)

['T0:0', 'T1:0']

In [55]:
# for test_node in test_nodes:
def get_neighbor_lst(G,node):
    """Return the nodes adjacent to `node` in graph `G` as a list."""
    return list(G.neighbors(node))


def get_attr_from_neighbor(node): # test_node
    """Collect the attributes of every neighbour of `node` in the global graph G.

    Returns one flat list of 'type:value' strings: the attributes of each
    neighbour, appended in neighbour order with repeats kept.
    """
    collected = []
    for adjacent in get_neighbor_lst(G, node):
        collected.extend(get_attr(adjacent))
    return collected
    
get_attr_from_neighbor(4546232)    

['T0:0', 'T1:1766']

In [56]:
for test_node in test_nodes[:3]:
    results = get_attr_from_neighbor(test_node)
    print(results)

['T0:0', 'T1:1766']
['T0:0', 'T1:1762', 'T0:0', 'T1:297', 'T0:0', 'T1:295']
['T0:0', 'T1:1914', 'T8:0']


I define the attributes of a target node as those it adopts from its surrounding nodes. If an attribute type appears with only one value among its neighbors, the target node adopts that attribute with that value; if an attribute type appears with several different values, I take the majority value among the neighbors as the value of that attribute type for the target node.

In [110]:
from collections import Counter, defaultdict


def get_format_result(test_node):
    """Predict the attributes of `test_node` by majority vote over its neighbours.

    Every 'type:value' string collected from the neighbours counts as one vote
    for that value of that attribute type. For each type the value with the
    most votes is chosen; when several values tie, the one seen first wins.

    Args:
        test_node: id of the node whose attributes are predicted.

    Returns:
        A list of 'type:value' strings ordered by attribute type (plain string
        order), or an empty list when the neighbours supply no attributes.
    """
    votes = defaultdict(Counter)
    for attr in get_attr_from_neighbor(test_node):
        parts = attr.split(':')
        if len(parts) < 2:
            continue  # not a 'type:value' token, nothing to vote for
        votes[parts[0]][parts[1]] += 1

    # Repeated votes are kept on purpose: if duplicates were collapsed, every
    # value would count once and the vote could no longer pick the majority.
    return [attr_type + ':' + votes[attr_type].most_common(1)[0][0]
            for attr_type in sorted(votes)]

get_format_result(test_nodes[2])

['T0:0', 'T1:1914', 'T8:0']

In [None]:
import time

# Predict the first batch (the first 132535 test nodes) and time the run.
start_time = time.time()
attr_predictions = [get_format_result(test_node) for test_node in test_nodes[:132535]]
end_time = time.time()

print("predictions created. Process took {:.04f} seconds".format(end_time - start_time))

In [None]:
# attr_predictions may cover only the leading batch of test_nodes, so pair each
# prediction with the node it was computed for; building the frame from columns
# of different lengths raises ValueError.
data = {
    'id': test_nodes[:len(attr_predictions)],
    'attr': attr_predictions,
}

result_df = pd.DataFrame(data)

Since the test dataset is quite large, my computer would effectively run forever if I ran the prediction on all of it at once. So I divide the test set into 5 batches and predict each of them using the functions defined above.

In [30]:
# divide test data into 5 batches and predict each of them
# test_nodes[:132535][0]  # -> 4546232   
# test_nodes[132535:265070][0]  # -> 647698   
# test_nodes[265070:397605][0]  # -> 3529436   
# test_nodes[397605:530140] [0]  #  -> 90666   
# test_nodes[530140:][0]   # -> 971066  

4546232

In [39]:
from scipy import stats
import time

# Predict the fifth batch (test nodes from index 530140 to the end) and time the run.
# NOTE(review): predict_node_attr is not defined in any cell shown in this
# notebook; confirm where it comes from before re-running.
start_time = time.time()
predict_attrs_bag5 = [predict_node_attr(test_node) for test_node in test_nodes[530140:]]
end_time = time.time()

print("predictions_bag5 created. Process took {:.04f} seconds".format(end_time - start_time))

predictions_bag5 created. Process took 1057.9442 seconds


In [40]:
# import json
    
# with open('task2_outputfile(bag5)', 'w') as fout:
#     json.dump(predict_attrs_bag5, fout)

Finally, I output the predictions to a file. Once I had all the batches of predictions, I combined them locally by copying and pasting them into a txt file, then cleaned and reformatted it as required by the instructions in another ipython script (see "formatting.ipython").