# attribute prediction evaluation

In [1]:
import pandas as pd
from math import log
import networkx as nx

In [2]:
network = pd.read_csv('network.tsv',sep = '\t',header=None)
network = network.rename(columns={0: 'node_id1', 1: 'node_id2'})
network['edges'] = list(zip(network['node_id1'], network['node_id2']))
network.head()  

Unnamed: 0,node_id1,node_id2,edges
0,3942361,4009630,"(3942361, 4009630)"
1,1862789,2403557,"(1862789, 2403557)"
2,3559086,3449838,"(3559086, 3449838)"
3,4268588,6041523,"(4268588, 6041523)"
4,3067063,3845402,"(3067063, 3845402)"


In [3]:
# import training data - 5301403 records
train = pd.read_csv('data/labeled-vertices.train.tsv', sep = '\t', header = None)
train = train.rename(columns={0: 'node_id', 1: 'attributes'})
train.head()

Unnamed: 0,node_id,attributes
0,5509623,T0:0 T1:0
1,6334893,T0:0 T1:1
2,1218900,T0:1 T1:2
3,3871398,T0:1 T1:2
4,3942361,T0:0 T1:3


In [22]:
# import test data (662675 rows; type:float64)
# test = pd.read_csv('data/unlabeled-vertices.test.txt',header = None)
# test = test.rename(columns={0: 'node_id'})
# test.head()

In [21]:
dev = pd.read_csv('labeled-vertices.dev.tsv',sep='\t',header=None)
dev = dev.rename(columns = {0:'id',1:'attr'})
dev.head()

Unnamed: 0,id,attr
0,2666403,T0:2 T1:99
1,2627940,T0:0 T1:26
2,4843136,T0:0 T1:26
3,5396835,T0:0 T1:1813
4,5438188,T0:1 T1:1733


In [23]:
dev_nodes = dev.id.tolist()
include_dev_df = network[(network.node_id1.isin(dev_nodes) == True) | (network.node_id2.isin(dev_nodes) == True)]
include_dev_df.head()  # subgraph include all test nodes + its neighborhood
len(include_dev_df)

1009292

In [25]:
include_dev_df.head()

Unnamed: 0,node_id1,node_id2,edges
15590836,2666403,2051632,"(2666403, 2051632)"
15590842,2627940,4843136,"(2627940, 4843136)"
15590862,5396835,2352112,"(5396835, 2352112)"
15590873,5438188,5690565,"(5438188, 5690565)"
15590880,1394998,4497108,"(1394998, 4497108)"


In [26]:
# add nearby train nodes upon include_test_df
train_nodes = train.node_id.tolist()
subset_df = include_dev_df[(include_dev_df.node_id1.isin(train_nodes) == True) | (include_dev_df.node_id2.isin(train_nodes) == True)]


In [27]:
# construct graph
subset_edge_list = subset_df['edges'].tolist()
G = nx.Graph()
G.add_edges_from(subset_edge_list)

I would take the first 50 nodes from dev file to make an error analysis, using the same method that I used to predict attributes for test data.

In [29]:
G.add_nodes_from(dev_nodes[:50])

In [17]:
def get_attr(node):
    attr = train[train.node_id == node].attributes.values
    attr_lst = attr[0].split(' ')
    return attr_lst  # format: TBD



def get_neighbor_lst(G,node):
    neighbor_lst = [neighbor for neighbor in G.neighbors(node)]
    return neighbor_lst


def get_attr_from_neighbor(node): # test_node
   
    neighbor_lst = get_neighbor_lst(G,node)
    attr_lst = []
    for neighbor in neighbor_lst:
        attr = get_attr(neighbor) # get neighbors' attributes
        attr_lst += attr

    return attr_lst

    

In [18]:
def get_format_result(test_node):
    results = get_attr_from_neighbor(test_node)

    dct = {}
    typ_lst = []
    val_lst = []

    for attr in results:
        typ_lst.append(attr.split(':')[0])
        val_lst.append(attr.split(':')[1])

    dct['attr_type'] = typ_lst
    dct['value'] = val_lst

    # parse into df
    df = pd.DataFrame(dct)
    df = df.drop_duplicates()
    target_df = df.groupby('attr_type').agg(lambda x:x.value_counts().index[0])
    target_df

    # format
    lst = []
    for (t,v) in target_df.itertuples():
        output_str = t + ':' + v
        lst.append(output_str)

    return lst

In [39]:
dct = {}

prediction_lst = []
for node_id in dev_nodes[:50]:
    predictions = get_format_result(node_id)
    prediction_lst.append(predictions)
    
dct['dev_node_id'] = dev_nodes[:50]
dct['attr'] = prediction_lst

pred_df = pd.DataFrame(dct)
pred_df.head()

Unnamed: 0,dev_node_id,attr
0,2666403,"[T0:2, T1:99]"
1,2627940,[]
2,4843136,[]
3,5396835,"[T0:0, T1:1813]"
4,5438188,"[T0:1, T1:1733]"


In [40]:
dev.head()  # source file

Unnamed: 0,id,attr
0,2666403,T0:2 T1:99
1,2627940,T0:0 T1:26
2,4843136,T0:0 T1:26
3,5396835,T0:0 T1:1813
4,5438188,T0:1 T1:1733


In [48]:
merge_df = pd.merge(dev.head(50), pred_df, how='left',left_on='id',right_on='dev_node_id')
merge_df = merge_df.drop(columns = ['dev_node_id'])
merge_df = merge_df.rename(columns = {'attr_x':'true attr','attr_y':'pred attr'})
merge_df

Unnamed: 0,id,true attr,pred attr
0,2666403,T0:2 T1:99,"[T0:2, T1:99]"
1,2627940,T0:0 T1:26,[]
2,4843136,T0:0 T1:26,[]
3,5396835,T0:0 T1:1813,"[T0:0, T1:1813]"
4,5438188,T0:1 T1:1733,"[T0:1, T1:1733]"
5,1394998,T0:0 T1:653,"[T0:0, T1:20]"
6,174985,T0:0 T1:87,"[T0:0, T1:87]"
7,3627071,T0:0 T1:119,"[T0:0, T1:1680]"
8,4073409,T0:0 T1:83,"[T0:0, T1:83]"
9,1151948,T0:0 T1:111,"[T0:0, T1:111]"


Comparing the true attributes and the predict attributes, I can see that one main problem is that some of the pred attr are missing, simply because when I built the graph, I only include those training nodes that are perceived as neighbors of the dev nodes. This make the prediction not accuracy enough. So one way I can think of to improve, is to build graph based on training nodes, and then add nodes from test data (or dev data). I have tried that, but unfortunately I don't have enough time to fix this problem and to get the results. Also, I notice that some types are especially hard to predict than the others, e.g., T1:26, T1:98, T1:115, etc.