In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# read data
data=pd.read_csv('/vol/home/s2465922/data/crime_from_2001.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# extract the crime incidents of recent 6 year
df=data[data.Year>2014]

# extract needed columns
df=df[['Date','IUCR','Location Description','Community Area']]

# remove rows that has nan values
df=df.dropna(how='any')

# convert time string to datetime type
df['Date']=pd.to_datetime(df['Date'],format='%m/%d/%Y %I:%M:%S %p')

# convert community area as str type
df['Community Area']=df['Community Area'].astype(int).astype(str)

# extract hour
df['hour']=df['Date'].dt.hour

In [4]:
# read criminal IUCR codes
iucr=pd.read_csv('/vol/home/s2465922/data/crime_iucr.csv')
iucr['IUCR']=iucr['IUCR'].apply(lambda s: '0'+s if len(s)<4 else s)
iucr.loc[iucr['PRIMARY DESCRIPTION']=='OTHER OFFENSE ','PRIMARY DESCRIPTION']='OTHER OFFENSE'

In [5]:
# add primary and secondary description based on matched IUCR code
df=df.set_index('IUCR')
iucr=iucr.set_index('IUCR')
df['primary_type']=iucr['PRIMARY DESCRIPTION']
df['secondary_description']=iucr['SECONDARY DESCRIPTION']
df=df.reset_index()

In [6]:
# df[df['Col2'].isnull()]
df=df.dropna()

In [7]:
# %%
# generate texts and clean data 

class Processor:
  def __init__(self):
    self.puncs='[+/,.;:?!()]+'
    self.min_word_df=50
    self.min_num_words=3
  
  def make_text(self, df):
    df['text']=df[['primary_type','secondary_description','Location Description']].values.tolist()
    return df
  
  def get_word2inc(self,df):
    '''
    return {word: document frequency}
    '''
    word2inc={}
    inc2text=df['text'].to_dict()
    for i in inc2text:
      for w in inc2text[i]:
        if w in word2inc:
          word2inc[w].add(i)
        else:
          word2inc[w]={i}
    return word2inc
  
  
  def clean_word(self,df):
    '''
    delete words that occur less than min_word_df
    '''
    word2inc=self.get_word2inc(df)
    
    # modify dataframe
    for w in word2inc:
      if len(word2inc[w])<=self.min_word_df:
        for i in word2inc[w]:
          df['text'][i].remove(w)
    return df
  
  def clean_data(self,df):
    '''
    delete examples where number of keywords less than min_num_words
    '''
    tmp=pd.DataFrame(df['text'].apply(len))
    tmp.columns=['numWords']
    deleted=tmp[tmp['numWords']<self.min_num_words].index.tolist()
    df=df.drop(deleted)
    return df

In [8]:
cdf=df.copy()
ps=Processor()
cdf=ps.make_text(cdf)

In [9]:
cdf=ps.clean_word(cdf)
cdf=ps.clean_data(cdf)

In [12]:
# rc=cdf.groupby(by=['primary_type','Community Area']).size().reset_index()
# rc.columns=['primary_type','community_area','count']
# rc['primary_type']=rc['primary_type'].apply(lambda x: x.lower())

In [18]:
# rc[rc['community_area']=='34']

Unnamed: 0,primary_type,community_area,count
27,arson,34,2
104,assault,34,378
181,battery,34,992
258,burglary,34,230
330,concealed carry license violation,34,1
400,crim sexual assault,34,21
477,criminal damage,34,536
554,criminal trespass,34,186
631,deceptive practice,34,470
696,gambling,34,1


In [10]:
train, test=train_test_split(cdf, test_size=0.2)

In [11]:
class CodeTable:
  def __init__(self,df):
    self.df=df
  
  def make_nodetable(self):
    '''
    retuen node2idx, idx2node
    '''
    nodes=[]
    nodes.extend(self.regions)
    nodes.extend(self.hours)
    nodes.extend(self.words)
    self.node2idx=dict((x,i) for i, x in enumerate(nodes))
    self.idx2node=dict((i,x) for i, x in enumerate(nodes))
    
  def make_node2inc(self):
    '''
    return unit2inc
    '''
    self.node2inc={}
    inc2units=[]
    units=['Community Area', 'hour', 'primary_type', 'secondary_description', 'Location Description']
    for u in units:
      inc2units.append(self.df[u].to_dict())

    for i2u in inc2units:
      for k, v in i2u.items():
        vid=self.node2idx[v]
        if vid in self.node2inc:
          self.node2inc[vid].add(k)
        else:
          self.node2inc[vid]={k}
    
  def make_codetable(self):

    self.regions=pd.unique(self.df['Community Area']).tolist()
    self.hours=pd.unique(self.df['hour']).tolist()
    self.words=set(pd.unique(self.df['primary_type']))\
              .union(set(pd.unique(self.df['secondary_description'])))\
              .union(set(pd.unique(self.df['Location Description'])))
    self.words=list(self.words)
    self.make_nodetable()
    self.make_node2inc()
    

In [12]:
ct=CodeTable(train)

In [13]:
ct.make_codetable()

In [14]:
class Cooccur:
  def __init__(self,ct):
    self.ct=ct
  
  def lookup(self, co_type):
    '''
    return the two units based on co_type 
    '''
    if co_type=='rw':
      u1=self.ct.regions
      u2=self.ct.words
      
    if co_type=='rh':
      u1=self.ct.regions
      u2=self.ct.hours
      
    if co_type=='hw':
      u1=self.ct.hours
      u2=self.ct.words
      
    if co_type=='ww':
      u1=self.ct.words
      u2=self.ct.words
    
    return u1,u2
    
  def create_cooccur_matrix(self, co_type):

    u1, u2=self.lookup(co_type)
    
    co=np.zeros((len(u1), len(u2)), dtype=int)
    for i in range(len(u1)):
      for j in range(len(u2)):
        u1_id=self.ct.node2idx[u1[i]]
        u2_id=self.ct.node2idx[u2[j]]
        co[i,j]=len(self.ct.node2inc[u1_id] & self.ct.node2inc[u2_id])
    return co
  
  def create_cooccur_edges(self, co_matrix, edge_type):
    edges=[]
    
    u1, u2 =self.lookup(edge_type)
    
#     norm_co=co_matrix/len(cdf)
        
    for i in range(co_matrix.shape[0]): 
      
      if edge_type=="ww":
        j_range=np.arange(i+1,co_matrix.shape[1])
      else:
        j_range=np.arange(co_matrix.shape[1])
        
      for j in j_range:
        c=co_matrix[i,j]
        if c>0:
          u1_id=self.ct.node2idx[u1[i]]
          u2_id=self.ct.node2idx[u2[j]]
          edges.append((u1_id, u2_id, c))
          edges.append((u2_id, u1_id, c))
        
    return edges
  
  def save_edges(self, edge_type, edge_lst):
    file=edge_type+'_edges.txt'
    with open(file, 'w') as f:
      for edge in  edge_lst:
          f.write("\t".join(str(e) for e in edge))
          f.write("\n")
    f.close()
    

In [15]:
co=Cooccur(ct)

In [16]:
graph_types=['rw','rh','hw','ww']

In [17]:
co_matrix=[]
for i in range(len(graph_types)):
  co_matrix.append(co.create_cooccur_matrix(graph_types[i]))

In [18]:
co_edges=[]
for i in range(len(graph_types)):
  gt=graph_types[i]
  co_edges.append(co.create_cooccur_edges(co_matrix[i], gt))
  co.save_edges(gt,co_edges[-1])

In [19]:
class tfidf:
  def __init__(self, co_matrix):
    self.rw_matrix=co_matrix[0]
    self.rh_matrix=co_matrix[1]
    self.hw_matrix=co_matrix[2]
    self.ww_matrix=co_matrix[3]

  def get_tfidf(self,co):
    '''
    input: an cooccurrence matrix
    return: an matrix with tf-idf weighting by treating each row as a document and 
    each column as a word
    '''
    tf=np.log10(co+1)
    df=np.sum(co!=0,axis=0)
    idf=np.log10(len(co)/df)

    return tf*idf
  
#   def get_pair_tfidf(self, co):
#     tfidf=self.get_tfidf(co)
    
#     tfidf_=self.get_tfidf(co.T)
    
#     return tfidf, tfidf_
  
  def vectorize(self):
    rvec=self.get_tfidf(self.rw_matrix)
    hvec=self.get_tfidf(self.hw_matrix)
    wvec=self.get_tfidf(self.ww_matrix)
    
    self.embeddings=np.r_[rvec,hvec,wvec]

In [20]:
td=tfidf(co_matrix)
td.vectorize() 

In [21]:
from scipy.linalg import svd

class SVD:
  def __init__(self, co_matrix, K=24):
    self.rw_matrix=co_matrix[0]
    self.rh_matrix=co_matrix[1]
    self.hw_matrix=co_matrix[2]
    self.ww_matrix=co_matrix[3]
    self.K=K

  def decompose(self, co):
    u,s,v=svd(co)
    u1_vec=u[:,:self.K]
    u2_vec=v.T[:,:self.K]
    return u1_vec, u2_vec
  
  def get_embeddings(self):
    rw, wr=self.decompose(self.rw_matrix)
    rh, hr=self.decompose(self.rh_matrix)
    hw, wh=self.decompose(self.hw_matrix)
    ww, ww_=self.decompose(self.ww_matrix)
    
#     rvec=np.maximum(rw,rh)
#     hvec=np.maximum(hw,hr)
#     wvec=np.maximum(wr,wh)

    rvec=(rw+rh)/2
    hvec=(hr+hw)/2
    wvec=(wr+wh+ww)/3
  
    
    self.embeddings=np.r_[rvec,hvec,wvec]

In [22]:
sd=SVD(co_matrix)
sd.get_embeddings()

In [23]:
import numpy as np
import networkx as nx
from utils import *
from alias import *

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras import backend as K

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

In [24]:
class BiGraph:
  def __init__(self, graph_type,args):
    self.graph=nx.read_edgelist(graph_type+"_edges.txt", create_using=nx.DiGraph(), 
                  nodetype=int, data=[('weight', np.float)])
    self.args=args
    self.n2i=dict((n,i) for i, n in enumerate(self.graph.nodes()))
    self.i2n=dict((i,n) for i, n in enumerate(self.graph.nodes()))
    
    self.get_sampling_table()
    
  def get_sampling_table(self):
    
    # create sampling table for vertex
    
    numNodes=self.graph.number_of_nodes()
    node_degree=np.zeros(numNodes)
    
    for edge in self.graph.edges():
      node_degree[self.n2i[edge[0]]]+=self.graph[edge[0]][edge[1]].get('weight')
      
    total_sum=sum([pow(node_degree[i], self.args.power) for i in range(numNodes)])
    
    norm_prob=[pow(node_degree[j], self.args.power)/total_sum for j in range(numNodes)]
    
    self.node_accept, self.node_alias=create_alias_table(norm_prob)
    
    # create sampling table for edge
    numEdges=self.graph.number_of_edges()
    total_sum=sum(pow(self.graph[edge[0]][edge[1]].get('weight'),self.args.power)
                  for edge in self.graph.edges())
    norm_prob=[pow(self.graph[edge[0]][edge[1]].get('weight'),self.args.power)
               /total_sum for edge in self.graph.edges()]

    
    self.edge_accept, self.edge_alias=create_alias_table(norm_prob)
  
  def batch_iter(self):
    '''
    batch_size*example. each example: [sourceID, destinationID] [sign]
    '''
    
    data_size=self.graph.number_of_edges()
    shuffle_indice=np.random.permutation(np.arange(data_size))
    edges = [(edge[0], edge[1]) for edge in self.graph.edges()]
    
    # positive or negative mod
    s=[]
    d=[]
    sign=[]
    
    for i in range(self.args.num_group):
      
      # generate positive edges
      if np.random.random()>=self.edge_accept[shuffle_indice[i]]:
        shuffle_indice[i]=self.edge_alias[shuffle_indice[i]]
        
      cur_s=edges[shuffle_indice[i]][0]
      cur_d=edges[shuffle_indice[i]][1]
      
      s.append(cur_s)
      d.append(cur_d)
      sign.append(1)
    
    # generate negative edges, neg_ratio 
      s.extend([cur_s]*self.args.neg_ratio)
      sign.extend([-1]*self.args.neg_ratio)
      for i in range(self.args.neg_ratio):
        while True :
          d_neg=self.i2n[alias_sample(self.node_accept, self.node_alias)]
          if d_neg not in self.graph[s[-1]]:
            d.append(d_neg)
            break
            
    return np.array(s),np.array(d),np.array(sign)
    


In [25]:
class HeGraph:
  def __init__(self, args, nNodes):
    self.args=args
    self.nNodes=nNodes
    self.gt=['rw','rh','hw','ww','rr','hh']
    self.graphs=[]
    for i in range(len(self.gt)):
      self.graphs.append(BiGraph(self.gt[i],args))
    
    self.model=self.create_model()
  
  def line_loss(self,y_true, y_pred):
    '''
    y_true=1 if positive example else -1
    '''
    return -K.mean(K.log(K.sigmoid(y_true*y_pred)))

  def create_model(self):
    '''
    input: two nodes (id)
    output: inner product of embedding vectors for two nodes
    '''
    vi=Input(shape=(1,))
    vj=Input(shape=(1,))

    vi_emb=Embedding(self.nNodes, self.args.embedding_size)(vi)
    vj_emb=Embedding(self.nNodes, self.args.embedding_size)(vj)

    sim=Lambda(lambda x: tf.reduce_sum(x[0]*x[1], axis=-1))([vi_emb, vj_emb])

    model=Model(inputs=[vi,vj], outputs=[sim])

    return model
 
#   def get_graph_sampling_table(self):
#     num_edges=np.zeros(len(self.graphs),dtype=int)
#     for i in range(len(self.graphs)):
#       num_edges[i]=self.graphs[i].graph.number_of_edges()
#       self.gt_accept, self.gt_alias=create_alias_table(pow(num_edges,self.args.power)/sum(pow(num_edges,self.args.power)))
#       self.gt_accept, self.gt_alias=create_alias_table(num_edges/sum(num_edges))
      
#   def train(self):
#     self.model.compile(optimizer=RMSprop(learning_rate=0.001),loss=self.line_loss)
#     batch_size=self.args.num_group*(1+self.args.neg_ratio)
#     self.get_graph_sampling_table()
    
#     for iteration in range(self.args.T):
#       if iteration%100==0:
#         verbose=1
#       else:
#         verbose=0
        
#       gid=alias_sample(self.gt_accept, self.gt_alias)
#       s,d,y=self.graphs[gid].batch_iter()
#       self.model.fit(x=[s,d],y=y, batch_size=batch_size,verbose=verbose)

  def train(self):
    self.model.compile(optimizer=RMSprop(learning_rate=0.001),loss=self.line_loss)
    batch_size=self.args.num_group*(1+self.args.neg_ratio)
    
    for iteration in range(self.args.T):
      if iteration%100==0:
        verbose=1
      else:
        verbose=0
      for j in range(len(self.graphs)):
        s,d,y=self.graphs[j].batch_iter()
        self.model.fit(x=[s,d],y=y, batch_size=batch_size,verbose=verbose)
      
  def get_embeddings(self):
    self.embeddings=self.model.get_weights()[0]

In [61]:
ge_args=dotdict({
  'power':0.75,
  'embedding_size':100,
  'T':10000,
  'neg_ratio':5,
  'num_group':5,
})
nNodes=len(ct.node2idx)

In [62]:
he=HeGraph(ge_args, nNodes)

In [63]:
from time import time
start=time()
he.train()
runtime=(time()-start)/60











In [64]:
he.get_embeddings()

In [60]:
for i in range(len(he.graphs)):
  print(he.graphs[i].graph.number_of_edges())

43374
3696
17082
22772
558
192


In [65]:
runtime

19.106772216161094

In [31]:
from functools import reduce

class Evaluator:
  def __init__(self,test,ct,predict_type,N):
    self.test=test[['Community Area', 'hour', 'text']].values.tolist()
    self.ct=ct
    self.predict_type=predict_type
    self.N=N
    self.batch_size=1000
    self.test=self.to_number(self.test)
  
  def lookup(self, g):
    '''
    input: a crime incident record, predict type
    return: x, y, candidate pool
    '''
    r, h, w=g[0], g[1], g[2]
    if self.predict_type=='w':
      x=[r]+[h]
      y=w
      cand_pool=[self.ct.node2idx[w] for w in self.ct.words]
    if self.predict_type=='r':
#       x=[h]+np.random.choice(w,self.nWords,replace=False).tolist()
      x=[h]+w
      y=[r]
      cand_pool=[self.ct.node2idx[r] for r in self.ct.regions]
    if self.predict_type=='h':
      x=[r]+w
      y=[h]
      cand_pool=[self.ct.node2idx[h] for h in self.ct.hours]
    return x,y,cand_pool
    

  def generate_group(self, inc):
    '''
    input: a criminal incident record
    return: a group (positive + negative examples)
    '''
    # get x,y
    x, y, cand_pool=self.lookup(inc)

    # generate positive examples
    labels=[1]*len(y)
    
    # generate negative examples
    while len(y)<self.N:
      neg=np.random.choice(cand_pool)
      if neg not in y:
        y.append(neg)
        labels.append(0)
    
    g=list(zip([x]*self.N, y, labels))
              
    return g

  def reserve_example(self, inc):
    '''
    discard incident record whose nodes not occur in train set
    return: True or False
    '''
    if inc[0] not in self.ct.node2idx:
      return False
    if inc[1] not in self.ct.node2idx:
      return False
    for w in inc[2]:
      if w not in self.ct.node2idx:
        return False
    return True
      
    
  def to_number(self, data):
    '''
    input: examples set
    convert each word to number
    '''
    lst=[]
    for i in range(len(data)):
      if self.reserve_example(data[i]):
        x=[self.ct.node2idx[data[i][0]], self.ct.node2idx[data[i][1]], [self.ct.node2idx[w] for w in data[i][2]]]
        lst.append(x)  
    return lst

  def generate_test_groups(self):
    gs=[]
    for inc in self.test:
      g=self.generate_group(inc)
      gs.extend(g)
    return gs

  def cos_sim(self, a, b):
    return np.sum(a*b, axis=1)/(np.linalg.norm(a, axis=1)*np.linalg.norm(b,axis=1))
    
  def batch_sim(self, emb_lst, x, y):

    y_emb=np.array([emb_lst[u] for u in y])
    sim=np.zeros((x.shape[1],x.shape[0]))

    for i in range(x.shape[1]):
      x_emb=np.array([emb_lst[u] for u in x[:,i]])
      sim[i]=self.cos_sim(x_emb,y_emb)
 
    return np.mean(sim, axis=0)
  
  def baseline(self, gs):
    ranks=[]
    x, y, label=list(zip(*gs))
    for i in range(0, len(label), self.N):
      l=np.array(label[i:i+self.N])
      x_inter=set.intersection(*list(map(lambda h: self.ct.node2inc[h],x[i])))
      freq=list(map(lambda h: len(x_inter.intersection(self.ct.node2inc[h])), y[i:i+self.N]))
      ranks.append(l[np.argsort(freq)[::-1]])
    return np.array(ranks)

  def get_ranks(self, gs, emb_lst):
    ranks=[]
    sims=[]
    x, y, label=list(zip(*gs))
    x=np.array(x)
    y=np.array(y)
    label=np.array(label)

    for i in range(0, len(y), self.batch_size):
      end=i+self.batch_size
      sim=self.batch_sim(emb_lst, x[i:end], y[i:end])
      sims.extend(sim.tolist())
    
    for i in range(0, len(sims), self.N):
      l=label[i:i+self.N]
      ranks.append(l[np.argsort(sims[i:i+self.N])[::-1]])

    return np.array(ranks)
  
  def F1_score(self, ranks,k=10):
    '''
    for labels prediction
    '''
    relevant=np.sum(ranks[:,:k], axis=1)
    recall=relevant/np.sum(ranks,axis=1)
    precision=relevant/k
    meanF1=np.mean(2*precision*recall/(precision+recall))
    return meanF1

  def MRR(self,ranks):
    '''
    for location and labels prediction
    '''
    pos=np.argmax(ranks>0, axis=1)+1
    mrr=np.mean(1/pos)
    
    return mrr
  
  def MAP(self, ranks):
    aps=[]
    for i in range(len(ranks)):
      recall_pos=np.where(ranks[i]==1)[0]+1
      ap=np.mean((np.arange(len(recall_pos))+1)/recall_pos)
      aps.append(ap)
      
    return np.mean(aps)
      

In [32]:
ew=Evaluator(test, ct, predict_type='w', N=20)
gs_w=ew.generate_test_groups()

In [33]:
er=Evaluator(test, ct, predict_type='r', N=10)
gs_r=er.generate_test_groups()

In [34]:
eh=Evaluator(test, ct, predict_type='h', N=10)
gs_h=eh.generate_test_groups()

In [93]:
ns_w_ranks=ew.baseline(gs_w)
ns_w_mrr=ew.MRR(ns_w_ranks)
ns_w_map=ew.MAP(ns_w_ranks)
print('ns_w_mrr={}'.format(ns_w_mrr))
print('ns_w_map={}'.format(ns_w_map))

In [48]:
ns_r_ranks=er.baseline(gs_r)
ns_r_mrr=er.MRR(ns_r_ranks)
ns_r_map=er.MAP(ns_r_ranks)
print('ns_r_mrr={}'.format(ns_r_mrr))
print('ns_r_map={}'.format(ns_r_map))

ns_r_mrr=0.4479534001279555
ns_r_map=0.4479534001279555


In [47]:
ns_h_ranks=eh.baseline(gs_h)
ns_h_mrr=eh.MRR(ns_h_ranks)
ns_h_map=eh.MAP(ns_h_ranks)
print('ns_h_mrr={}'.format(ns_h_mrr))
print('ns_h_map={}'.format(ns_h_map))

ns_h_mrr=0.3179829099528738
ns_h_map=0.3179829099528738


In [66]:
ge_w_ranks=ew.get_ranks(gs_w, he.embeddings)
ge_w_mrr=ew.MRR(ge_w_ranks)
ge_w_map=ew.MAP(ge_w_ranks)
print('ge_w_mrr={}'.format(ge_w_mrr))
print('ge_w_map={}'.format(ge_w_map))

ge_w_mrr=0.8840342486300676
ge_w_map=0.7331176499544174


In [58]:
ge_r_ranks=er.get_ranks(gs_r, he.embeddings)
ge_r_mrr=er.MRR(ge_r_ranks)
ge_r_map=er.MAP(ge_r_ranks)
print('ge_r_mrr={}'.format(ge_r_mrr))
print('ge_r_map={}'.format(ge_r_map))

ge_r_mrr=0.41747027987171725
ge_r_map=0.41747027987171725


In [59]:
ge_h_ranks=eh.get_ranks(gs_h, he.embeddings)
ge_h_mrr=eh.MRR(ge_h_ranks)
ge_h_map=eh.MAP(ge_h_ranks)
print('ge_h_mrr={}'.format(ge_h_mrr))
print('ge_h_map={}'.format(ge_h_map))

ge_h_mrr=0.2759361753729986
ge_h_map=0.2759361753729986


In [38]:
td_w_ranks=ew.get_ranks(gs_w, td.embeddings)
td_w_mrr=ew.MRR(td_w_ranks)
td_w_map=ew.MAP(td_w_ranks)
print('td_w_mrr={}'.format(td_w_mrr))
print('td_w_map={}'.format(td_w_map))

td_w_mrr=0.8651352129521664
td_w_map=0.7032797058028452


In [39]:
td_r_ranks=er.get_ranks(gs_r, td.embeddings)
td_r_mrr=er.MRR(td_r_ranks)
td_r_map=er.MAP(td_r_ranks)
print('td_r_mrr={}'.format(td_r_mrr))
print('td_r_map={}'.format(td_r_map))

td_r_mrr=0.4457367827745857
td_r_map=0.4457367827745857


In [40]:
td_h_ranks=eh.get_ranks(gs_h, td.embeddings)
td_h_mrr=eh.MRR(td_h_ranks)
td_h_map=eh.MAP(td_h_ranks)
print('td_r_mrr={}'.format(td_h_mrr))
print('td_r_map={}'.format(td_h_map))

td_r_mrr=0.35759084735886265
td_r_map=0.35759084735886265


In [41]:
sd_w_ranks=ew.get_ranks(gs_w, sd.embeddings)
sd_w_mrr=ew.MRR(sd_w_ranks)
sd_w_map=ew.MAP(sd_w_ranks)
print('sd_w_mrr={}'.format(sd_w_mrr))
print('sd_w_map={}'.format(sd_w_map))

sd_w_mrr=0.5501755237735296
sd_w_map=0.42058169107434085


In [42]:
sd_r_ranks=er.get_ranks(gs_r, sd.embeddings)
sd_r_mrr=er.MRR(sd_r_ranks)
sd_r_map=er.MAP(sd_r_ranks)
print('sd_r_mrr={}'.format(sd_r_mrr))
print('sd_r_map={}'.format(sd_r_map))

sd_r_mrr=0.38972503943472275
sd_r_map=0.38972503943472275


In [43]:
sd_h_ranks=eh.get_ranks(gs_h, sd.embeddings)
sd_h_mrr=eh.MRR(sd_h_ranks)
sd_h_map=eh.MAP(sd_h_ranks)
print('sd_r_mrr={}'.format(sd_h_mrr))
print('sd_r_map={}'.format(sd_h_map))

sd_r_mrr=0.32459267364464595
sd_r_map=0.32459267364464595


In [371]:
sd_ranks=e.get_ranks(gs, sd.embeddings)

In [372]:
sd_mrr=e.MRR(sd_ranks)
sd_map=e.MAP(sd_ranks)

In [373]:
sd_mrr

0.47953113704317607

In [367]:
sd_map

0.30086364067530047

In [74]:
train, test=train_test_split(ct.df, test_size=0.2)
test=test[['Community Area', 'hour', 'text']].values.tolist()

In [82]:
cc=list(map(lambda x: [ct.node2idx[x[0]], ct.node2idx[x[1]], [ct.node2idx[w] for w in x[2]]],test))

In [77]:
for r, h, ws in test[0]:
  print(r, h, ws)

ValueError: not enough values to unpack (expected 3, got 2)

In [78]:
test[0]cc

['29', 17, {'battery', 'sidewalk', 'simple'}]

In [84]:
len(cc)

279770

In [85]:
len(ct.df)

1398847