In [112]:
%matplotlib inline
import scipy
import numpy as np
import pandas as pd
import cPickle as pickle
from scipy.sparse import csr_matrix
from sklearn.cross_validation import train_test_split
import math
import sys

In [2]:
%pdb

Automatic pdb calling has been turned ON


In [171]:
path_kg_in = '/Users/vmisra/data/kg_embed_data/data/traindata_db233_minmentions10_minentity3.pkl'
path_wordnet_in = '/Users/vmisra/data/kg_embed_data/data/noun_relations.pkl'

SEED = 2052016
VALIDATION_HOLDOUT = 0.05
N_TRAINING_SAMPLES = 100000
N_VAL_SAMPLES = -1
DIST_TYPE = 'UNIFORM'

#Construct dataset

###Create positive and negative samples

In [147]:
data,_,_ = pickle.load(open(path_wordnet_in,'r'))

#first, remove all diagonal self-links, as this leads to leakage in the prediction task
data = data[np.nonzero(data[:,0]-data[:,1])]

#next, shuffle it
randstate = np.random.RandomState(SEED)
data = randstate.permutation(data)

#finally, determine the number of entities for future reference
n_entities = max(max(data[:,0]),max(data[:,1]))+1

In [172]:
#next, remove the validation set
data_trainable, data_validate = train_test_split(data,test_size = VALIDATION_HOLDOUT, random_state=SEED)
data_train_pos = data_trainable[:N_TRAINING_SAMPLES].copy()
data_val_pos = data_validate[:N_VAL_SAMPLES].copy()
N_VAL_SAMPLES = len(data_val_pos)

In [174]:
#generate negative samples for training
if DIST_TYPE == 'UNIGRAM':
    randidxs = randstate.permutation(data.flatten())[:N_TRAINING_SAMPLES] #unigram distribution
elif DIST_TYPE == 'UNIFORM':
    randidxs = randstate.randint(n_entities,size=N_TRAINING_SAMPLES)#uniform distribution
else:
    print 'UNRECOGNIZED DISTRIBUTION TYPE!!!'
    
randchoice = randstate.binomial(n=1,p=.5,size=(N_TRAINING_SAMPLES))
data_train_neg = data_train_pos.copy()

for edge,choice,randidx in zip(data_train_neg,randchoice,randidxs):
    edge[choice] = randidx

In [175]:
#generate negative samples for validation
if DIST_TYPE == 'UNIGRAM':
    randidxs = randstate.permutation(data.flatten())[:N_TRAINING_SAMPLES] #unigram distribution
elif DIST_TYPE == 'UNIFORM':
    randidxs = randstate.randint(n_entities,size=N_TRAINING_SAMPLES)#uniform distribution
else:
    print 'UNRECOGNIZED DISTRIBUTION TYPE!!!'
randchoice = randstate.binomial(n=1,p=.5,size=(N_VAL_SAMPLES))
data_val_neg = data_val_pos.copy()

for edge,choice,randidx in zip(data_val_neg,randchoice,randidxs):
    edge[choice] = randidx

###Create graph

In [176]:
rows = np.transpose(data)[0]
cols = np.transpose(data)[1]

graph = csr_matrix((np.ones(len(rows)+len(cols)),(np.concatenate([rows,cols]),np.concatenate([cols,rows]))),shape=(n_entities,n_entities))
graph.data = np.ones(len(graph.data))
#graph.setdiag(values=np.zeros(graph.shape[0])) #no longer necessary --- we found a simpler way of doing this without the slow scipy sparse operations.
#for i in range(graph.shape[0]):
#    graph[i,i] = 0 #remove all self-links, as this ensures no leakage in the training process.
#graph = csr_matrix((np.ones(len(rows)),(rows,cols)),shape=(n_entities,n_entities))

#Feature computation mechanism

##First: common neighbors vector

In [177]:
#common neighbors benchmark
def get_common_neighbors(graph_local, data_local):
    common_neighbors = np.ndarray(len(data_local))

    import time
    start = time.time()
    for i,edge in enumerate(data_local):
        common_neighbors[i] = graph_local.getrow(edge[0]).dot(graph_local.getrow(edge[1]).transpose()).toarray()[0,0]
        
    print time.time()-start, " seconds taken for getting common neighbors"
    sys.stdout.flush()
    return common_neighbors

##Second: Adamic adar

In [178]:
def gen_AA(graph_local, data_local, func=lambda x: 1/(1.0+math.log(x+1))):
    #neighborhood sizes for each node
    n_neighbors = np.squeeze(np.array(graph_local.sum(axis=1)))
    #map it into the AA weights as given by the function argument
    AA_weights = map(func,n_neighbors)
    
    #actually compute AA features
    AA_features = np.ndarray(len(data_local))
    import time
    start = time.time()
    for i,edge in enumerate(data_local):
        AA_features[i] = np.squeeze(np.array(graph_local.getrow(edge[0]).dot(graph_local.getrow(edge[1]).multiply(AA_weights).transpose())))    
    print time.time()-start, "seconds to do AA."
    sys.stdout.flush()
    return AA_features

#Featurize the data

In [179]:
common_neighbors_pos = get_common_neighbors(graph,data_train_pos)
common_neighbors_neg = get_common_neighbors(graph,data_train_neg)

85.9058570862  seconds taken for getting common neighbors
84.8270850182  seconds taken for getting common neighbors


In [180]:
common_neighbors_pos_val = get_common_neighbors(graph,data_val_pos)
common_neighbors_neg_val = get_common_neighbors(graph,data_val_neg)

9.71619200706  seconds taken for getting common neighbors
9.55100798607  seconds taken for getting common neighbors


In [181]:
straight_AA_pos = gen_AA(graph, data_train_pos)
straight_AA_neg = gen_AA(graph, data_train_neg)
sqrt_AA_pos = gen_AA(graph,data_train_pos,lambda x: (1+x)**(-.5))
sqrt_AA_neg = gen_AA(graph,data_train_neg,lambda x: (1+x)**(-.5))
cubert_AA_pos = gen_AA(graph,data_train_pos,lambda x: (1+x)**(-.3))
cubert_AA_neg = gen_AA(graph,data_train_neg,lambda x: (1+x)**(-.3))

394.50590992 seconds to do AA.
396.158967018 seconds to do AA.
922.640341043 seconds to do AA.
914.378906012 seconds to do AA.
915.507543802 seconds to do AA.
917.92914319 seconds to do AA.


In [182]:
straight_AA_pos_val = gen_AA(graph, data_val_pos)
straight_AA_neg_val = gen_AA(graph, data_val_neg)
sqrt_AA_pos_val = gen_AA(graph,data_val_pos,lambda x: (1+x)**(-.5))
sqrt_AA_neg_val = gen_AA(graph,data_val_neg,lambda x: (1+x)**(-.5))
cubert_AA_pos_val = gen_AA(graph,data_val_pos,lambda x: (1+x)**(-.3))
cubert_AA_neg_val = gen_AA(graph,data_val_neg,lambda x: (1+x)**(-.3))

46.387305975 seconds to do AA.
45.9769368172 seconds to do AA.
106.259618998 seconds to do AA.
106.118647099 seconds to do AA.
105.348823071 seconds to do AA.
105.568843126 seconds to do AA.


In [183]:
raw_features_pos = [straight_AA_pos,sqrt_AA_pos,cubert_AA_pos, common_neighbors_pos]
raw_features_neg = [straight_AA_neg,sqrt_AA_neg,cubert_AA_neg, common_neighbors_neg]

In [184]:
raw_features_pos_val = [straight_AA_pos_val,sqrt_AA_pos_val,cubert_AA_pos_val,common_neighbors_pos_val]
raw_features_neg_val = [straight_AA_neg_val,sqrt_AA_neg_val,cubert_AA_neg_val,common_neighbors_neg_val]

In [185]:
map(len,raw_features_pos)

[100000, 100000, 100000, 100000]

In [186]:
transformations = [lambda x: x,
                   lambda x: np.log(x+1),
                   lambda x: x**.5,
                   lambda x: x**.3,
                   lambda x: x**2]

def transform_features(feature_list):
    output_features = []
    for transform in transformations:
        for feature in feature_list:
            output_features.append(transform(feature))
    return output_features

In [187]:
features_pos = np.vstack(transform_features(raw_features_pos)).transpose()
features_neg = np.vstack(transform_features(raw_features_neg)).transpose()
features_train = np.concatenate([features_pos,features_neg],axis=0)
labels_train = np.concatenate([np.ones(len(features_pos)),0*np.ones(len(features_pos))]).astype(np.int)

In [188]:
features_pos_val = np.vstack(transform_features(raw_features_pos_val)).transpose()
features_neg_val = np.vstack(transform_features(raw_features_neg_val)).transpose()
features_val = np.concatenate([features_pos_val,features_neg_val],axis=0)
labels_val = np.concatenate([np.ones(len(features_pos_val)),0*np.ones(len(features_pos_val))]).astype(np.int)

#Train classifier

In [189]:
from sklearn.preprocessing import StandardScaler
featuresScaled_train_val = StandardScaler().fit_transform(np.concatenate([features_train,features_val],axis=0))
scaled_feats_train = featuresScaled_train_val[:len(features_train)]
scaled_feats_val = featuresScaled_train_val[len(features_train):]

In [190]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=10,penalty='l1')
clf.fit(scaled_feats_train,labels_train.transpose())

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l1', random_state=None, tol=0.0001)

In [191]:
clf.coef_

array([[ -4.66273389e-01,  -2.53564458e-01,   4.85040230e-01,
         -4.36690716e-01,  -5.60143261e-01,   7.55117520e-04,
          3.91808745e-01,  -2.00667252e-01,  -2.12913207e-01,
          1.97231585e-01,   4.28340863e-01,  -5.20653450e-02,
          3.42887019e-01,   4.76192743e-01,   2.74870029e-01,
          2.56046196e-01,   0.00000000e+00,  -1.00150826e+00,
          1.23946834e+00,   1.07403587e+00]])

In [192]:
clf.score(scaled_feats_val,labels_val)

0.54470842332613389

In [193]:
scaled_feats_train.shape

(200000, 20)

In [194]:
neglogprobs = -clf.predict_log_proba(scaled_feats_val)
score = 0
for logprobs,label in zip(neglogprobs,labels_val):
    score += logprobs[label]
score/len(labels_val)

0.66291875249218735