<a href="https://colab.research.google.com/github/karino2/US-patent-analysis/blob/triplet_loss_work/colab/bert_tripletloss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Colab porting of triplet loss trial for bert feature**

In [1]:
import os
import datetime
import pickle
import gzip
import pandas as pd
import numpy as np

import tensorflow as tf

tf.enable_eager_execution()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
tf.__version__

'1.13.1'

In [0]:
!git clone -b docker https://github.com/yoheikikuta/bert.git

In [0]:
!ls

# Data setup

Feature data is created at bert-extract-feature.ipynb.

In [0]:
!mkdir ./bert/data

In [0]:
from google.colab import auth
auth.authenticate_user()

In [4]:
!gsutil cp gs://karino2-uspatent/features/training_app_1000_features.pkl.gz bert/data/training_app_1000_features.pkl.gz
!gsutil cp gs://karino2-uspatent/features/test_app_1000_features.pkl.gz bert/data/test_app_1000_features.pkl.gz
!gsutil cp gs://karino2-uspatent/features/grants_2000_features.pkl.gz bert/data/grants_2000_features.pkl.gz

Copying gs://karino2-uspatent/features/training_app_1000_features.pkl.gz...
/ [0 files][    0.0 B/  3.1 MiB]                                                / [1 files][  3.1 MiB/  3.1 MiB]                                                
Operation completed over 1 objects/3.1 MiB.                                      
Copying gs://karino2-uspatent/features/test_app_1000_features.pkl.gz...
/ [1 files][  3.1 MiB/  3.1 MiB]                                                
Operation completed over 1 objects/3.1 MiB.                                      
Copying gs://karino2-uspatent/features/grants_2000_features.pkl.gz...
/ [1 files][  7.9 MiB/  7.9 MiB]                                                
Operation completed over 1 objects/7.9 MiB.                                      


In [5]:
with gzip.open("./bert/data/training_app_1000_features.pkl.gz", 'rb') as f:
     train_features = pickle.load(f)
with gzip.open("./bert/data/test_app_1000_features.pkl.gz", 'rb') as f:
     test_features = pickle.load(f)
with gzip.open("./bert/data/grants_2000_features.pkl.gz", 'rb') as f:
     grants_features = pickle.load(f)
print(f"{train_features.shape}, {test_features.shape}, {grants_features.shape}")

(1000, 768), (1000, 768), (2524, 768)


In [0]:
!gsutil cp gs://karino2-uspatent/citations_info_2000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/testset_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/training_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/grants_for_2000.df.gz ./bert/data/

In [0]:
citations_info_target = pd.read_pickle("./bert/data/citations_info_2000.df.gz")
train_app_df = pd.read_pickle("./bert/data/training_app_1000.df.gz")
test_app_df = pd.read_pickle("./bert/data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("./bert/data/grants_for_2000.df.gz")

### Prepare data for circulum tripletloss learning

In [0]:
import random
import pandas as pd
import numpy as np
import pickle
from collections import OrderedDict

In [0]:
train_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(train_app_df['app_id'], train_features[:, ])
}
grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_target_df['parsed'], grants_features[:, ])
}

Confirm result

In [19]:
train_app_df.head()

Unnamed: 0,app_id,xml
0,14222691,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12515852,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12033424,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12402344,"<us-patent-application lang=""EN"" dtd-version=""..."
4,12155425,"<us-patent-application lang=""EN"" dtd-version=""..."


In [25]:
train_normalized_feature_dict_1000[14222691][0:5]

array([-0.05969358,  0.05039638, -0.0200557 ,  0.04519269, -0.01494433])

In [26]:
train_features[0, 0:5]

array([-1.04389739,  0.88131171, -0.35072595,  0.79031169, -0.26134047])

In [28]:
(train_features[0, ]/np.linalg.norm(train_features[0, ]))[0:5]

array([-0.05969358,  0.05039638, -0.0200557 ,  0.04519269, -0.01494433])

### Calculate cosin similarity for whole pair

In [142]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(train_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_target_df['parsed'] } 
    for app_id 
    in train_app_df['app_id']
}

CPU times: user 15.4 s, sys: 140 ms, total: 15.5 s
Wall time: 15.5 s


check result

In [32]:
list(sim_dict.keys())[0]

14222691

In [46]:
sorted( sim_dict[12411468].values(), reverse=True )[:5]

[0.8536353801810572,
 0.8500573288849081,
 0.8478197928831097,
 0.8464742417020769,
 0.8457543474536907]

In [36]:
citations_info_target[citations_info_target.app_id == 12411468].parsed

4089    7061154
4090    7061154
Name: parsed, dtype: int64

In [44]:
grants_target_df[grants_target_df.parsed == 7061154].index[0]

540

In [45]:
list(sim_dict[12411468].values())[540]

0.8435186037581648

In [47]:
sorted( sim_dict[12411468].values(), reverse=True ).index(0.8435186037581648)

7

### Create triplet

In [0]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]
  
  
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])
  
def make_uncited_grants_for_app_id(sim_dict, citations_info_target, app_id, sidx, eidx, num, shuffle=True):
    '''
    input:
        sim_dict: 
        citations_info_target:
        app_id: target application id
        sidx: start index to slice the sorted (parsed, sim) list
        eidx: end index to slice the sorted (parsed, sim) list
        num: number of grants that will be returned
    return:
        [parsed_1, parsed_2, ..., parsed_num] that are NOT cited to reject app_id
    '''
    sorted_grants_list = sort_similarity_by_value(sim_dict, app_id)
    sorted_grants_list = sorted_grants_list[sidx:eidx]
    if shuffle:
        random.shuffle(sorted_grants_list)
    
    cited_grants = get_cited_grants(citations_info_target, app_id)
    uncited_grants = []
    
    idx = 0
    while len(uncited_grants) != num:
        grant_id, _ = sorted_grants_list[idx]
        if not grant_id in cited_grants:
            uncited_grants.append(grant_id)
        idx += 1
    
    return uncited_grants  

In [11]:
random.seed(0)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7196326, 6979916, 6971650, 6899075]

In [50]:
random.seed(1)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7108673, 7846876, 7168415, 7052189]

In [0]:
def create_triplet_pairs(sidx, eidx):
    all_elems = []
    
    for app_id in train_app_df['app_id']:
        cited_grants = get_cited_grants(citations_info_target, app_id)
        num_cited_grants = len(cited_grants)
        uncited_grants = make_uncited_grants_for_app_id(
            sim_dict, citations_info_target, app_id, sidx, eidx, num_cited_grants)
        
        for idx, cited in enumerate(cited_grants):
            all_elems.append([app_id, cited, uncited_grants[idx]])
    
    result_df = pd.DataFrame(all_elems)
    result_df.columns = ['app_id', 'cited_grants', 'uncited_grants']
    
    return result_df

In [53]:
%%time

random.seed(0)
test = create_triplet_pairs(0, 100)

CPU times: user 3.19 s, sys: 4.85 ms, total: 3.2 s
Wall time: 3.19 s


In [54]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,7474966
1,14222691,8177561,6925385


In [55]:
len(test)

1282

In [0]:
random.seed(1)
test = create_triplet_pairs(0, 100)

In [57]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,6977086
1,14222691,8177561,7252786


### Train Model

In [13]:
tf.executing_eagerly() 

True

In [0]:
class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tf.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tf.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [0]:
def tripletloss(anchor_out, positive_out, negative_out, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1, reduction=tf.losses.Reduction.NONE)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1, reduction=tf.losses.Reduction.NONE)
    
    loss = tf.maximum(0.0, margin + d_pos - d_neg)
    
    return tf.reduce_mean(loss)

In [0]:
def create_training_input_np(sidx, eidx):
    anchor_list = []
    positive_list = []
    negative_list = []
    
    triplet_pairs = create_triplet_pairs(sidx, eidx)
    
    for row in triplet_pairs.itertuples():
        anchor_list.append(train_normalized_feature_dict_1000[row.app_id])
        positive_list.append(grants_normalized_feature_dict_2000[row.cited_grants])
        negative_list.append(grants_normalized_feature_dict_2000[row.uncited_grants])
    
    return np.array([np.array(anchor_list), np.array(positive_list), np.array(negative_list)])

In [0]:
def calc_loss_with_changing_negative_pair(sidx, eidx, lossfun=tripletloss, batch_size=10):
    seed = 0
    all_loss = []
    random.seed(seed)

    input_data_np = create_training_input_np(sidx, eidx)
    data_num = int(input_data_np.shape[1])
    rand_idx = np.random.permutation(data_num)
    index_data_np = np.array([
        input_data_np[0][rand_idx], 
        input_data_np[1][rand_idx], 
        input_data_np[2][rand_idx]])

    input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
    anchor_data, positive_data, negative_data = input_data

    for iter_id in range(data_num // batch_size):
      anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
      positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
      negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
      curr_loss = lossfun(anchor_out, positive_out, negative_out)
      all_loss.append(curr_loss)

    print(sum(all_loss)/len(all_loss))

In [241]:
calc_loss_with_changing_negative_pair(0, 100)
calc_loss_with_changing_negative_pair(900, 1000)
calc_loss_with_changing_negative_pair(1900, 2000)

tf.Tensor(0.01658673, shape=(), dtype=float32)
tf.Tensor(0.09671563, shape=(), dtype=float32)
tf.Tensor(0.1443036, shape=(), dtype=float32)


In [0]:
def train_with_changing_negative_pair_with_loss(sidx, eidx, lossfun, batch_size, epochs):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    
    seed = 0
    for i in range(epochs):
        seed += 1
        random.seed(seed)
        
        input_data_np = create_training_input_np(sidx, eidx)
        data_num = int(input_data_np.shape[1])
        rand_idx = np.random.permutation(data_num)
        index_data_np = np.array([
            input_data_np[0][rand_idx], 
            input_data_np[1][rand_idx], 
            input_data_np[2][rand_idx]])

        input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
        anchor_data, positive_data, negative_data = input_data

        for iter_id in range(data_num // batch_size):        
            with tf.GradientTape() as tape:
                anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
                positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
                negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
                curr_loss = lossfun(anchor_out, positive_out, negative_out)
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

        if i % 10 == 0:
            print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

def train_with_changing_negative_pair(sidx, eidx, batch_size, epochs):
  train_with_changing_negative_pair_with_loss(sidx, eidx, tripletloss, batch_size, epochs)

In [0]:
# start_end_index_pairs = (
#     (0, 100),
#     (100, 200),
#     (200, 300),
#     (300, 400),
#     (400, 500),
#     (500, 600),
#     (600, 700),
#     (700, 800),
#     (800, 900),
#     (900, 1000),
#     (1000, 1100),
#     (1100, 1200),
#     (1200, 1300),
#     (1300, 1400),
#     (1400, 1500),
#     (1500, 1600),
#     (1600, 1700),
#     (1700, 1800),
#     (1800, 1900),
#     (1900, 2000)
# )

# start_end_index_pairs = (
#     (0, 400),
#     (400, 800),
#     (800, 1200),
#     (1200, 1600),
#     (1600, 2000)
# )

start_end_index_pairs = (
    (0, 100),
    (900, 1000),
    (1900, 2000)
)

In [0]:
model = Model(input_shape=768, output_shape=100)

In [242]:
%%time

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=31)

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=51)

for sidx, eidx in start_end_index_pairs:
    print("   start index: {}, end index: {}".format(sidx,eidx))
    train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=21)

   start index: 0, end index: 100
Loss at step 0: 0.02616
Loss at step 10: 0.00717
Loss at step 20: 0.01298
   start index: 900, end index: 1000
Loss at step 0: 0.09295
Loss at step 10: 0.09062
Loss at step 20: 0.06588
   start index: 1900, end index: 2000
Loss at step 0: 0.09322
Loss at step 10: 0.09928
Loss at step 20: 0.06732
CPU times: user 4min 46s, sys: 4.02 s, total: 4min 50s
Wall time: 4min 34s


In [243]:
calc_loss_with_changing_negative_pair(0, 100)
calc_loss_with_changing_negative_pair(900, 1000)
calc_loss_with_changing_negative_pair(1900, 2000)

tf.Tensor(0.00667771, shape=(), dtype=float32)
tf.Tensor(0.027310502, shape=(), dtype=float32)
tf.Tensor(0.072016366, shape=(), dtype=float32)


In [23]:
# I do not save model these days.

# os.makedirs('../trained_model/tripletloss_circulum', exist_ok=True)
# saver = tf.contrib.eager.Saver(model.variables)
# saver.save("../trained_model/tripletloss_circulum/ckpt")



'../trained_model/tripletloss_circulum/ckpt'

### Inferece with trained model

Saver seems fragile in eager mode. I use trained model on memory.

In [0]:
test_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(test_app_df['app_id'], test_features[:, ])
}


In [0]:
def to_extracted_features(normalized_feature_dict, model):
  # doc_id is app_id for application, pared for grants.
  sorted_keys = sorted(normalized_feature_dict.keys())

  input_tensors = tf.convert_to_tensor(
      np.array([ normalized_feature_dict[k] for k in sorted_keys ]),
      dtype=tf.float32)
  extracted_features = model.frwrd_pass(input_tensors).numpy()
  return pd.DataFrame({ 
    'doc_id':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in extracted_features ]})

In [0]:
test_extracted_features_df = to_extracted_features(test_normalized_feature_dict_1000, model)
grants_extracted_features_df = to_extracted_features(grants_normalized_feature_dict_2000, model)

In [245]:
test_extracted_features_df.head(2)

Unnamed: 0,doc_id,extracted_feature
0,12000862,"[-0.06014522, 0.15319835, -0.094469905, 0.0928..."
1,12003258,"[-0.14375061, 0.054770324, 0.052622166, 0.2140..."


In [246]:
grants_extracted_features_df.head(2)

Unnamed: 0,doc_id,extracted_feature
0,6837383,"[0.17937347, 0.06213054, -0.0074147717, 0.2118..."
1,6837647,"[0.15643527, 0.080645256, -0.041111432, 0.1119..."


In [0]:
def create_sim_dict(test_extracted_df, grants_extracted_df):
  return {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_extracted_df['doc_id'], grants_extracted_df['extracted_feature']) } 
    for app_id, test_f 
    in zip(test_extracted_df['doc_id'], test_extracted_df['extracted_feature'])
  }

In [247]:
%%time

sim_dict_by_extracted = create_sim_dict(test_extracted_features_df, grants_extracted_features_df)

CPU times: user 10.1 s, sys: 153 ms, total: 10.3 s
Wall time: 10.3 s


In [0]:
import collections

def calc_all_ranks(all_app_ids, sim_dic):
  all_ranks = []

  for app_id in all_app_ids:
      cited_grants = get_cited_grants(citations_info_target, app_id)
      sorted_kv = reversed(sort_similarity_by_value(sim_dic, app_id))  # higher score, similar patent

      idx = 1
      for k,v in sorted_kv:
          if k in cited_grants:
              all_ranks.append(idx)
          idx += 1
  return all_ranks

def calc_all_ranks_counter(all_app_ids, sim_dic):
  return collections.Counter(calc_all_ranks(all_app_ids, sim_dic))

In [250]:
counter = calc_all_ranks_counter(test_extracted_features_df['doc_id'], sim_dict_by_extracted)
print(counter)

Counter({1: 131, 2: 61, 3: 44, 7: 27, 6: 27, 4: 26, 5: 25, 11: 19, 10: 16, 15: 15, 22: 15, 9: 13, 19: 13, 34: 12, 25: 11, 31: 11, 18: 11, 21: 11, 12: 10, 8: 10, 24: 10, 23: 10, 17: 9, 27: 9, 13: 9, 14: 8, 26: 7, 55: 7, 43: 7, 41: 7, 38: 7, 30: 7, 36: 6, 16: 6, 69: 5, 28: 5, 37: 5, 62: 5, 49: 5, 54: 5, 182: 5, 134: 5, 50: 5, 51: 5, 130: 5, 84: 5, 33: 5, 48: 4, 186: 4, 52: 4, 35: 4, 127: 4, 91: 4, 32: 4, 20: 4, 65: 4, 68: 4, 46: 4, 120: 4, 449: 4, 101: 4, 56: 4, 57: 3, 286: 3, 61: 3, 63: 3, 106: 3, 128: 3, 53: 3, 129: 3, 125: 3, 206: 3, 76: 3, 108: 3, 167: 3, 39: 3, 107: 3, 190: 3, 73: 3, 168: 3, 191: 3, 117: 3, 110: 3, 132: 3, 44: 3, 72: 3, 243: 3, 149: 3, 40: 3, 112: 3, 196: 3, 171: 3, 159: 3, 29: 3, 151: 3, 672: 2, 60: 2, 161: 2, 266: 2, 118: 2, 96: 2, 162: 2, 443: 2, 166: 2, 193: 2, 201: 2, 1002: 2, 165: 2, 42: 2, 75: 2, 126: 2, 119: 2, 722: 2, 185: 2, 89: 2, 501: 2, 306: 2, 331: 2, 184: 2, 183: 2, 321: 2, 622: 2, 239: 2, 359: 2, 311: 2, 59: 2, 133: 2, 64: 2, 93: 2, 99: 2, 312: 2, 77

In [251]:
# counter is started from 1.

sum(counter[i] for i in range(11))

380

### Use raw feature as is (for comparison)

In [47]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_target_df['parsed'] } 
    for app_id 
    in test_app_df['app_id']
}

CPU times: user 13.3 s, sys: 149 ms, total: 13.4 s
Wall time: 13.4 s


In [48]:
%%time

all_ranks = []

for app_id in test_app_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 2.89 s, sys: 8.73 ms, total: 2.9 s
Wall time: 2.89 s


In [49]:
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 178, 2: 73, 3: 46, 5: 40, 4: 38, 6: 31, 7: 19, 9: 17, 8: 16, 10: 15, 13: 13, 14: 13, 11: 13, 15: 12, 18: 12, 16: 11, 12: 10, 26: 9, 19: 9, 34: 9, 21: 8, 23: 8, 38: 7, 28: 7, 50: 6, 39: 6, 37: 6, 33: 6, 53: 6, 100: 5, 47: 5, 76: 5, 35: 5, 64: 5, 29: 5, 73: 5, 61: 5, 32: 5, 17: 5, 68: 5, 31: 5, 52: 4, 59: 4, 40: 4, 143: 4, 90: 4, 48: 4, 63: 4, 44: 4, 49: 4, 20: 4, 65: 4, 70: 4, 30: 4, 83: 3, 168: 3, 147: 3, 114: 3, 22: 3, 444: 3, 93: 3, 43: 3, 121: 3, 122: 3, 189: 3, 148: 3, 51: 3, 57: 3, 46: 3, 25: 3, 385: 3, 66: 3, 85: 3, 125: 3, 304: 3, 36: 3, 195: 3, 92: 3, 187: 3, 69: 3, 99: 3, 411: 2, 24: 2, 131: 2, 262: 2, 724: 2, 72: 2, 454: 2, 396: 2, 87: 2, 130: 2, 413: 2, 89: 2, 157: 2, 292: 2, 1182: 2, 264: 2, 133: 2, 78: 2, 174: 2, 56: 2, 67: 2, 227: 2, 134: 2, 163: 2, 97: 2, 126: 2, 517: 2, 213: 2, 128: 2, 110: 2, 112: 2, 173: 2, 62: 2, 42: 2, 179: 2, 378: 2, 155: 2, 88: 2, 376: 2, 505: 2, 183: 2, 167: 2, 105: 2, 45: 2, 209: 2, 211: 2, 74: 2, 75: 2, 415: 2, 617: 2, 169: 2, 615: 

In [50]:
sum(counter[i] for i in range(11))

473

### Triplet loss, positive doubled

In [0]:
def tipletloss_pos_ntimes(anchor_out, positive_out, negative_out, ntimes=2, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1, reduction=tf.losses.Reduction.NONE)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1, reduction=tf.losses.Reduction.NONE)
    
    loss = tf.maximum(0.0, margin + 2*d_pos - d_neg)
    
    return tf.reduce_mean(loss)
  
def tripletloss_posdouble(anchor_out, positive_out, negative_out, margin=0.2):
  return tipletloss_pos_ntimes(anchor_out, positive_out, negative_out, 2, margin)

def tripletloss_pos_10times(anchor_out, positive_out, negative_out, margin=0.2):
  return tipletloss_pos_ntimes(anchor_out, positive_out, negative_out, 2, margin)



In [0]:
model = Model(input_shape=768, output_shape=100)

In [215]:
%%time

for sidx, eidx in start_end_index_pairs:
    print("   start index: {}, end index: {}".format(sidx,eidx))
    train_with_changing_negative_pair_with_loss(sidx, eidx, tripletloss_posdouble, batch_size=10, epochs=21)

   start index: 0, end index: 100
Loss at step 0: 0.14810
Loss at step 10: 0.12633
Loss at step 20: 0.10907
   start index: 900, end index: 1000
Loss at step 0: 0.25632
Loss at step 10: 0.24276
Loss at step 20: 0.24377
   start index: 1900, end index: 2000
Loss at step 0: 0.26960
Loss at step 10: 0.26411
Loss at step 20: 0.26362
CPU times: user 4min 50s, sys: 4.33 s, total: 4min 55s
Wall time: 4min 39s


In [0]:
test_extracted_features_df = to_extracted_features(test_normalized_feature_dict_1000, model)

In [219]:
test_extracted_features_df.head()

Unnamed: 0,doc_id,extracted_feature
0,12000862,"[-0.013157393, 0.06550002, -0.17330328, 0.0534..."
1,12003258,"[-0.01271724, -0.021055741, -0.18226379, 0.150..."
2,12004701,"[-0.059054796, -0.022013474, -0.1141999, 0.041..."
3,12007341,"[-0.0034107184, -0.013823663, -0.15175237, 0.0..."
4,12018798,"[-0.017291823, -0.04676495, -0.09946125, 0.064..."


In [0]:
grants_extracted_features_df = to_extracted_features(grants_normalized_feature_dict_2000, model)

In [221]:
grants_extracted_features_df.head()

Unnamed: 0,doc_id,extracted_feature
0,6837383,"[0.01180602, -0.023277633, -0.059893485, 0.048..."
1,6837647,"[0.014063301, 0.02104149, -0.06873307, 0.01846..."
2,6837799,"[0.020941963, -0.0022532253, -0.12189834, 0.00..."
3,6837893,"[0.044268582, 0.018448185, -0.06050285, 0.0270..."
4,6837910,"[0.015180708, 0.11016171, -0.12653644, 0.11364..."


In [223]:
%%time
doubled_sim_dict = create_sim_dict(test_extracted_features_df, grants_extracted_features_df)

CPU times: user 9.99 s, sys: 77.3 ms, total: 10.1 s
Wall time: 10.1 s


In [227]:
counter = calc_all_ranks_counter(test_extracted_features_df['doc_id'], doubled_sim_dict)
print(counter)

Counter({1: 150, 2: 55, 3: 39, 4: 29, 6: 28, 5: 27, 8: 19, 11: 17, 7: 16, 13: 16, 17: 16, 10: 16, 9: 13, 12: 12, 16: 11, 23: 11, 27: 11, 21: 10, 15: 10, 19: 9, 33: 9, 20: 8, 14: 8, 22: 8, 32: 8, 24: 7, 31: 6, 41: 6, 48: 6, 65: 6, 38: 6, 44: 6, 35: 5, 68: 5, 28: 5, 25: 5, 153: 5, 62: 5, 43: 5, 55: 5, 34: 5, 105: 5, 148: 4, 92: 4, 51: 4, 164: 4, 78: 4, 46: 4, 87: 4, 26: 4, 56: 4, 67: 4, 350: 4, 192: 4, 57: 4, 120: 4, 61: 4, 145: 3, 128: 3, 155: 3, 45: 3, 144: 3, 18: 3, 85: 3, 121: 3, 86: 3, 95: 3, 137: 3, 49: 3, 29: 3, 54: 3, 53: 3, 83: 3, 230: 3, 64: 3, 106: 3, 113: 3, 579: 3, 93: 3, 47: 3, 97: 3, 123: 3, 77: 3, 42: 3, 40: 3, 58: 3, 50: 3, 147: 3, 329: 2, 456: 2, 90: 2, 150: 2, 401: 2, 260: 2, 1248: 2, 177: 2, 213: 2, 376: 2, 94: 2, 89: 2, 245: 2, 100: 2, 116: 2, 167: 2, 96: 2, 1140: 2, 66: 2, 216: 2, 232: 2, 426: 2, 172: 2, 251: 2, 139: 2, 347: 2, 156: 2, 72: 2, 366: 2, 195: 2, 122: 2, 539: 2, 169: 2, 574: 2, 288: 2, 88: 2, 509: 2, 80: 2, 52: 2, 749: 2, 173: 2, 103: 2, 1489: 2, 186: 2,

Positive 10 times.

In [0]:
model = Model(input_shape=768, output_shape=100)

In [231]:
%%time

for sidx, eidx in start_end_index_pairs:
    print("   start index: {}, end index: {}".format(sidx,eidx))
    train_with_changing_negative_pair_with_loss(sidx, eidx, tripletloss_pos_10times, batch_size=10, epochs=21)

   start index: 0, end index: 100
Loss at step 0: 0.17560
Loss at step 10: 0.15177
Loss at step 20: 0.12895
   start index: 900, end index: 1000
Loss at step 0: 0.25382
Loss at step 10: 0.24570
Loss at step 20: 0.23713
   start index: 1900, end index: 2000
Loss at step 0: 0.26674
Loss at step 10: 0.26804
Loss at step 20: 0.26128
CPU times: user 4min 45s, sys: 4.21 s, total: 4min 49s
Wall time: 4min 33s


In [0]:
test_extracted_features_df = to_extracted_features(test_normalized_feature_dict_1000, model)
grants_extracted_features_df = to_extracted_features(grants_normalized_feature_dict_2000, model)
ten_sim_dict = create_sim_dict(test_extracted_features_df, grants_extracted_features_df)

In [233]:
counter = calc_all_ranks_counter(test_extracted_features_df['doc_id'], ten_sim_dict)
print(counter)

Counter({1: 137, 2: 60, 3: 42, 4: 42, 7: 26, 5: 23, 6: 23, 9: 20, 8: 15, 11: 15, 10: 14, 23: 13, 15: 12, 18: 11, 14: 11, 12: 9, 59: 9, 20: 8, 55: 8, 26: 8, 16: 8, 19: 7, 49: 7, 38: 7, 17: 7, 22: 7, 35: 7, 56: 7, 43: 7, 32: 7, 25: 7, 13: 6, 77: 6, 24: 6, 34: 6, 30: 6, 48: 6, 39: 6, 61: 6, 42: 5, 21: 5, 40: 5, 58: 5, 27: 5, 69: 5, 72: 5, 29: 5, 80: 5, 36: 5, 143: 4, 146: 4, 75: 4, 33: 4, 65: 4, 54: 4, 71: 4, 50: 4, 37: 4, 46: 4, 31: 4, 63: 4, 92: 3, 68: 3, 103: 3, 94: 3, 145: 3, 60: 3, 82: 3, 170: 3, 109: 3, 123: 3, 62: 3, 227: 3, 126: 3, 90: 3, 99: 3, 288: 3, 147: 3, 52: 3, 98: 3, 28: 3, 174: 3, 249: 3, 137: 3, 47: 3, 121: 2, 86: 2, 155: 2, 87: 2, 144: 2, 51: 2, 177: 2, 599: 2, 141: 2, 1199: 2, 214: 2, 101: 2, 135: 2, 256: 2, 344: 2, 108: 2, 136: 2, 436: 2, 79: 2, 70: 2, 226: 2, 930: 2, 89: 2, 743: 2, 53: 2, 201: 2, 215: 2, 2400: 2, 618: 2, 298: 2, 225: 2, 166: 2, 128: 2, 468: 2, 57: 2, 186: 2, 102: 2, 326: 2, 66: 2, 1224: 2, 267: 2, 369: 2, 78: 2, 786: 2, 100: 2, 152: 2, 591: 2, 164: 2

Training is not enough?

In [234]:
sidx, eidx = 1900, 2000
print("   start index: {}, end index: {}".format(sidx,eidx))
train_with_changing_negative_pair_with_loss(sidx, eidx, tripletloss_pos_10times, batch_size=10, epochs=51)

   start index: 1900, end index: 2000
Loss at step 0: 0.25474
Loss at step 10: 0.25733
Loss at step 20: 0.25183
Loss at step 30: 0.24196
Loss at step 40: 0.23452
Loss at step 50: 0.23559


In [0]:
test_extracted_features_df = to_extracted_features(test_normalized_feature_dict_1000, model)
grants_extracted_features_df = to_extracted_features(grants_normalized_feature_dict_2000, model)
ten_sim_dict = create_sim_dict(test_extracted_features_df, grants_extracted_features_df)

In [236]:
counter = calc_all_ranks_counter(test_extracted_features_df['doc_id'], ten_sim_dict)
print(counter)

Counter({1: 145, 2: 64, 3: 52, 4: 34, 5: 28, 6: 25, 9: 24, 11: 22, 7: 19, 12: 17, 8: 16, 25: 14, 10: 12, 22: 10, 18: 10, 28: 10, 16: 10, 19: 9, 26: 9, 14: 9, 43: 9, 33: 9, 61: 7, 15: 7, 35: 7, 56: 7, 41: 7, 24: 7, 17: 7, 48: 7, 20: 7, 65: 6, 42: 6, 32: 6, 57: 6, 39: 6, 54: 6, 180: 6, 31: 6, 13: 6, 60: 6, 29: 6, 58: 5, 49: 5, 102: 5, 37: 5, 105: 5, 191: 5, 88: 4, 69: 4, 27: 4, 79: 4, 23: 4, 40: 4, 52: 4, 86: 4, 38: 4, 76: 4, 62: 4, 113: 4, 629: 3, 218: 3, 270: 3, 21: 3, 87: 3, 148: 3, 64: 3, 562: 3, 45: 3, 141: 3, 116: 3, 46: 3, 85: 3, 78: 3, 92: 3, 89: 3, 172: 3, 77: 3, 104: 3, 106: 3, 47: 3, 59: 3, 167: 3, 96: 3, 185: 3, 55: 3, 66: 3, 833: 2, 97: 2, 698: 2, 238: 2, 132: 2, 94: 2, 513: 2, 186: 2, 99: 2, 51: 2, 188: 2, 75: 2, 158: 2, 154: 2, 201: 2, 209: 2, 1372: 2, 208: 2, 91: 2, 125: 2, 63: 2, 112: 2, 293: 2, 567: 2, 335: 2, 1201: 2, 160: 2, 482: 2, 407: 2, 928: 2, 451: 2, 262: 2, 768: 2, 164: 2, 385: 2, 139: 2, 354: 2, 153: 2, 448: 2, 53: 2, 312: 2, 230: 2, 267: 2, 72: 2, 126: 2, 138

### Investigation

In [0]:
random.seed(0)
trip = create_triplet_pairs(0, 100)

In [0]:
trip.shape

(1282, 3)

In [0]:
trip[0:5]

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,7474966
1,14222691,8177561,6925385
2,14222691,8179692,7735351
3,12515852,7235710,7271332
4,12033424,6950953,7371046


In [0]:
random.seed(0)
test = create_training_input_np(0, 100)

In [0]:
test.shape

(3, 1282, 768)

In [0]:
test_ten = tf.convert_to_tensor(test, dtype=tf.float32)
anchor_data, positive_data, negative_data = test_ten

In [0]:
anchor_data[0, 0:5]

<tf.Tensor: id=4237669, shape=(5,), dtype=float32, numpy=
array([-0.05969358,  0.05039638, -0.02005569,  0.0451927 , -0.01494433],
      dtype=float32)>

In [0]:
train_normalized_feature_dict_1000[14222691][0:5]

array([-0.05969358,  0.05039638, -0.0200557 ,  0.04519269, -0.01494433])

In [0]:
positive_data[0, 0:5]

<tf.Tensor: id=4237674, shape=(5,), dtype=float32, numpy=
array([-0.04778686,  0.03372966, -0.01699614,  0.0232354 ,  0.00648623],
      dtype=float32)>

In [0]:
grants_normalized_feature_dict_2000[8206188][0:5]

array([-0.04778686,  0.03372966, -0.01699614,  0.0232354 ,  0.00648623])

In [0]:
negative_data[0, 0:5]

<tf.Tensor: id=4237679, shape=(5,), dtype=float32, numpy=
array([-0.01261457,  0.01773252, -0.03752513,  0.02603934, -0.01327848],
      dtype=float32)>

In [0]:
grants_normalized_feature_dict_2000[7474966][0:5]

array([-0.01261457,  0.01773252, -0.03752513,  0.02603934, -0.01327848])

In [0]:
data_num = int(test.shape[1])

In [0]:
random.seed(0)
rand_idx = np.random.permutation(data_num)

In [0]:
rand_idx[0:5]

array([1102,  653,  475,   81,  404])

In [0]:
test2 = np.array([
    test[0][rand_idx], 
    test[1][rand_idx], 
    test[2][rand_idx]])

In [0]:
trip[1102:1103]

Unnamed: 0,app_id,cited_grants,uncited_grants
1102,12450212,7719957,7506657


In [0]:
test_ten2 = tf.convert_to_tensor(test2, dtype=tf.float32)
anchor_data2, positive_data2, negative_data2 = test_ten2

In [0]:
anchor_data2[0, 0:5]

<tf.Tensor: id=4237697, shape=(5,), dtype=float32, numpy=
array([-0.03815173,  0.04558022, -0.03306705,  0.00874347, -0.01623795],
      dtype=float32)>

In [0]:
train_normalized_feature_dict_1000[12450212][0:5]

array([-0.03815173,  0.04558022, -0.03306705,  0.00874347, -0.01623795])

In [0]:
positive_data2[0, 0:5]

<tf.Tensor: id=4237702, shape=(5,), dtype=float32, numpy=
array([-0.03455767,  0.03205481, -0.05553311, -0.00180341, -0.02068364],
      dtype=float32)>

In [0]:
grants_normalized_feature_dict_2000[7719957][0:5]

array([-0.03455768,  0.03205481, -0.05553311, -0.00180341, -0.02068364])

In [0]:
grants_normalized_feature_dict_2000[7506657][0:5]

array([-0.03321561,  0.02758702,  0.02755731,  0.05383265,  0.01525151])

In [0]:
negative_data2[0, 0:5]

<tf.Tensor: id=4237707, shape=(5,), dtype=float32, numpy=
array([-0.03321561,  0.02758702,  0.02755731,  0.05383265,  0.01525151],
      dtype=float32)>

In [0]:
anchor_out2 = model.frwrd_pass(anchor_data2[0:5])
positive_out2 = model.frwrd_pass(positive_data2[0:5])
negative_out2 = model.frwrd_pass(negative_data2[0:5])
curr_loss2 = tripletloss(anchor_out2, positive_out2, negative_out2)

In [0]:
curr_loss2

<tf.Tensor: id=9827650, shape=(), dtype=float32, numpy=0.0>

In [0]:
curr_loss2

<tf.Tensor: id=6980813, shape=(), dtype=float32, numpy=0.0>

In [0]:
norm_a_out2 = tf.nn.l2_normalize(anchor_out2, axis=1)
norm_p_out2 = tf.nn.l2_normalize(positive_out2, axis=1)
norm_n_out2 = tf.nn.l2_normalize(negative_out2, axis=1)



In [0]:
d_pos2 = tf.losses.cosine_distance(norm_a_out2, norm_p_out2, axis=1, reduction=tf.losses.Reduction.NONE)
d_neg2 = tf.losses.cosine_distance(norm_a_out2, norm_n_out2, axis=1, reduction=tf.losses.Reduction.NONE)

In [0]:
d_pos2

<tf.Tensor: id=9827686, shape=(5, 1), dtype=float32, numpy=
array([[0.31181705],
       [0.2618196 ],
       [0.29379976],
       [0.10545641],
       [0.17845315]], dtype=float32)>

In [0]:
d_pos2

<tf.Tensor: id=6980849, shape=(5, 1), dtype=float32, numpy=
array([[0.32095897],
       [0.3099807 ],
       [0.24314821],
       [0.13694352],
       [0.18507373]], dtype=float32)>

In [0]:
d_neg2

<tf.Tensor: id=6980863, shape=(5, 1), dtype=float32, numpy=
array([[0.63673747],
       [0.5831015 ],
       [0.5028086 ],
       [0.7062503 ],
       [0.48968685]], dtype=float32)>

In [0]:
test_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [0]:
list(reversed(sort_similarity_by_value(sim_dict_by_extracted, 14307191)))

[(7675444, 0.81486243),
 (8213360, 0.8095555),
 (8295300, 0.7975527),
 (7440728, 0.7911943),
 (7181534, 0.79116535),
 (7015791, 0.78744763),
 (7047561, 0.7843642),
 (7496099, 0.7797317),
 (6856519, 0.7784024),
 (7400687, 0.77812225),
 (7596647, 0.7777683),
 (6943925, 0.77549464),
 (6968158, 0.770937),
 (7430610, 0.77051735),
 (8149834, 0.7702616),
 (7197008, 0.76929677),
 (7080160, 0.76841784),
 (6927664, 0.768098),
 (7787480, 0.7665925),
 (7460369, 0.76627505),
 (7774833, 0.76552117),
 (6947376, 0.76504576),
 (6943668, 0.76469994),
 (7684209, 0.75894296),
 (7898105, 0.7584627),
 (7359643, 0.75701934),
 (7184876, 0.7568906),
 (7215144, 0.7568878),
 (7733589, 0.7557938),
 (7388946, 0.75565314),
 (7484979, 0.75525403),
 (7418208, 0.7550416),
 (7019617, 0.7520639),
 (6895220, 0.7517824),
 (7408379, 0.75157136),
 (7050761, 0.75155663),
 (7715309, 0.75128585),
 (7949309, 0.7512685),
 (8126094, 0.75113165),
 (8051342, 0.7495631),
 (7215775, 0.74950296),
 (7843906, 0.74846643),
 (7145862, 0.7

In [0]:
get_cited_grants(citations_info_target, 14307191)

{7576688}

In [0]:
sim_dict_by_extracted[14307191][7576688]

0.5689387

In [0]:
grants_extracted_features_df['parsed'].head()

0    6837383
1    6837647
2    6837799
3    6837893
4    6837910
Name: parsed, dtype: int64

In [0]:
grants_extracted_features_df[grants_extracted_features_df['parsed']==7576688].index[0]

1533

In [0]:
norm_p_out3 = grants_extracted_features_df['extracted_feature'][1533]

In [0]:
sum(norm_p_out3.shape)

100

In [0]:
test_extracted_features_df[test_extracted_features_df['app_id'] == 14307191].index[0]

890

In [0]:
norm_a_out3 = test_extracted_features_df['extracted_feature'][890]
norm_a_out3.shape

(100,)

In [0]:
tf.losses.cosine_distance(norm_a_out3, norm_p_out3, axis=0) 

<tf.Tensor: id=9827739, shape=(), dtype=float32, numpy=0.4310614>

In [0]:
0.5689387+tf.losses.cosine_distance(norm_a_out3, norm_p_out3, axis=0)

<tf.Tensor: id=9827768, shape=(), dtype=float32, numpy=1.0>

In [0]:
grants_extracted_features[0].sum()

7.989149