<a href="https://colab.research.google.com/github/karino2/US-patent-analysis/blob/triplet_loss_colab/colab/bert_tripletloss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Colab porting of triplet loss trial for bert feature**

In [1]:
import os
import datetime
import pickle
import gzip
import pandas as pd
import numpy as np

import tensorflow as tf

tf.enable_eager_execution()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
tf.__version__

'1.13.1'

In [0]:
!git clone -b docker https://github.com/yoheikikuta/bert.git

In [0]:
!ls

# Data setup

Feature data is created at bert-extract-feature.ipynb.

In [0]:
!mkdir ./bert/data

In [0]:
from google.colab import auth
auth.authenticate_user()

In [4]:
!gsutil cp gs://karino2-uspatent/features/training_app_1000_features.pkl.gz bert/data/training_app_1000_features.pkl.gz
!gsutil cp gs://karino2-uspatent/features/test_app_1000_features.pkl.gz bert/data/test_app_1000_features.pkl.gz
!gsutil cp gs://karino2-uspatent/features/grants_2000_features.pkl.gz bert/data/grants_2000_features.pkl.gz

Copying gs://karino2-uspatent/features/training_app_1000_features.pkl.gz...
/ [0 files][    0.0 B/  3.1 MiB]                                                / [1 files][  3.1 MiB/  3.1 MiB]                                                
Operation completed over 1 objects/3.1 MiB.                                      
Copying gs://karino2-uspatent/features/test_app_1000_features.pkl.gz...
/ [1 files][  3.1 MiB/  3.1 MiB]                                                
Operation completed over 1 objects/3.1 MiB.                                      
Copying gs://karino2-uspatent/features/grants_2000_features.pkl.gz...
/ [1 files][  7.9 MiB/  7.9 MiB]                                                
Operation completed over 1 objects/7.9 MiB.                                      


In [5]:
with gzip.open("./bert/data/training_app_1000_features.pkl.gz", 'rb') as f:
     train_features = pickle.load(f)
with gzip.open("./bert/data/test_app_1000_features.pkl.gz", 'rb') as f:
     test_features = pickle.load(f)
with gzip.open("./bert/data/grants_2000_features.pkl.gz", 'rb') as f:
     grants_features = pickle.load(f)
print(f"{train_features.shape}, {test_features.shape}, {grants_features.shape}")

(1000, 768), (1000, 768), (2524, 768)


In [0]:
!gsutil cp gs://karino2-uspatent/citations_info_2000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/testset_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/training_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/grants_for_2000.df.gz ./bert/data/

In [0]:
citations_info_target = pd.read_pickle("./bert/data/citations_info_2000.df.gz")
train_app_df = pd.read_pickle("./bert/data/training_app_1000.df.gz")
test_app_df = pd.read_pickle("./bert/data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("./bert/data/grants_for_2000.df.gz")

### Prepare data for circulum tripletloss learning

In [0]:
import random
import pandas as pd
import numpy as np
import pickle
from collections import OrderedDict

In [0]:
train_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(train_app_df['app_id'], train_features[:, ])
}
grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_target_df['parsed'], grants_features[:, ])
}

Confirm result

In [19]:
train_app_df.head()

Unnamed: 0,app_id,xml
0,14222691,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12515852,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12033424,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12402344,"<us-patent-application lang=""EN"" dtd-version=""..."
4,12155425,"<us-patent-application lang=""EN"" dtd-version=""..."


In [25]:
train_normalized_feature_dict_1000[14222691][0:5]

array([-0.05969358,  0.05039638, -0.0200557 ,  0.04519269, -0.01494433])

In [26]:
train_features[0, 0:5]

array([-1.04389739,  0.88131171, -0.35072595,  0.79031169, -0.26134047])

In [28]:
(train_features[0, ]/np.linalg.norm(train_features[0, ]))[0:5]

array([-0.05969358,  0.05039638, -0.0200557 ,  0.04519269, -0.01494433])

### Calculate cosin similarity for whole pair

In [45]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(train_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_target_df['parsed'] } 
    for app_id 
    in train_app_df['app_id']
}

CPU times: user 13.6 s, sys: 146 ms, total: 13.7 s
Wall time: 13.8 s


check result

In [32]:
list(sim_dict.keys())[0]

14222691

In [46]:
sorted( sim_dict[12411468].values(), reverse=True )[:5]

[0.8536353801810572,
 0.8500573288849081,
 0.8478197928831097,
 0.8464742417020769,
 0.8457543474536907]

In [36]:
citations_info_target[citations_info_target.app_id == 12411468].parsed

4089    7061154
4090    7061154
Name: parsed, dtype: int64

In [44]:
grants_target_df[grants_target_df.parsed == 7061154].index[0]

540

In [45]:
list(sim_dict[12411468].values())[540]

0.8435186037581648

In [47]:
sorted( sim_dict[12411468].values(), reverse=True ).index(0.8435186037581648)

7

### Create triplet

In [0]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]
  
  
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])
  
def make_uncited_grants_for_app_id(sim_dict, citations_info_target, app_id, sidx, eidx, num, shuffle=True):
    '''
    input:
        sim_dict: 
        citations_info_target:
        app_id: target application id
        sidx: start index to slice the sorted (parsed, sim) list
        eidx: end index to slice the sorted (parsed, sim) list
        num: number of grants that will be returned
    return:
        [parsed_1, parsed_2, ..., parsed_num] that are NOT cited to reject app_id
    '''
    sorted_grants_list = sort_similarity_by_value(sim_dict, app_id)
    sorted_grants_list = sorted_grants_list[sidx:eidx]
    if shuffle:
        random.shuffle(sorted_grants_list)
    
    cited_grants = get_cited_grants(citations_info_target, app_id)
    uncited_grants = []
    
    idx = 0
    while len(uncited_grants) != num:
        grant_id, _ = sorted_grants_list[idx]
        if not grant_id in cited_grants:
            uncited_grants.append(grant_id)
        idx += 1
    
    return uncited_grants  

In [11]:
random.seed(0)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7196326, 6979916, 6971650, 6899075]

In [50]:
random.seed(1)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7108673, 7846876, 7168415, 7052189]

In [0]:
def create_triplet_pairs(sidx, eidx):
    all_elems = []
    
    for app_id in train_app_df['app_id']:
        cited_grants = get_cited_grants(citations_info_target, app_id)
        num_cited_grants = len(cited_grants)
        uncited_grants = make_uncited_grants_for_app_id(
            sim_dict, citations_info_target, app_id, sidx, eidx, num_cited_grants)
        
        for idx, cited in enumerate(cited_grants):
            all_elems.append([app_id, cited, uncited_grants[idx]])
    
    result_df = pd.DataFrame(all_elems)
    result_df.columns = ['app_id', 'cited_grants', 'uncited_grants']
    
    return result_df

In [53]:
%%time

random.seed(0)
test = create_triplet_pairs(0, 100)

CPU times: user 3.19 s, sys: 4.85 ms, total: 3.2 s
Wall time: 3.19 s


In [54]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,7474966
1,14222691,8177561,6925385


In [55]:
len(test)

1282

In [0]:
random.seed(1)
test = create_triplet_pairs(0, 100)

In [57]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,6977086
1,14222691,8177561,7252786


### Train Model

In [13]:
tf.executing_eagerly() 

True

In [0]:
class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tf.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tf.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [0]:
def tripletloss(anchor_out, positive_out, negative_out, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1)
    
    loss = tf.maximum(0.0, margin + d_pos - d_neg)
    
    return tf.reduce_mean(loss)

In [0]:
def create_training_input_np(sidx, eidx):
    anchor_list = []
    positive_list = []
    negative_list = []
    
    triplet_pairs = create_triplet_pairs(sidx, eidx)
    
    for row in triplet_pairs.itertuples():
        anchor_list.append(train_normalized_feature_dict_1000[row.app_id])
        positive_list.append(grants_normalized_feature_dict_2000[row.cited_grants])
        negative_list.append(grants_normalized_feature_dict_2000[row.uncited_grants])
    
    return np.array([np.array(anchor_list), np.array(positive_list), np.array(negative_list)])

In [0]:
def train_with_changing_negative_pair(sidx, eidx, batch_size, epochs):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    
    seed = 0
    for i in range(epochs):
        seed += 1
        random.seed(seed)
        
        input_data_np = create_training_input_np(sidx, eidx)
        data_num = int(input_data_np.shape[1])
        rand_idx = np.random.permutation(data_num)
        index_data_np = np.array([
            input_data_np[0][rand_idx], 
            input_data_np[1][rand_idx], 
            input_data_np[2][rand_idx]])

        input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
        anchor_data, positive_data, negative_data = input_data

        for iter_id in range(data_num // batch_size):        
            with tf.GradientTape() as tape:
                anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
                positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
                negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
                curr_loss = tripletloss(anchor_out, positive_out, negative_out)
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

        if i % 10 == 0:
            print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

In [0]:
# start_end_index_pairs = (
#     (0, 100),
#     (100, 200),
#     (200, 300),
#     (300, 400),
#     (400, 500),
#     (500, 600),
#     (600, 700),
#     (700, 800),
#     (800, 900),
#     (900, 1000),
#     (1000, 1100),
#     (1100, 1200),
#     (1200, 1300),
#     (1300, 1400),
#     (1400, 1500),
#     (1500, 1600),
#     (1600, 1700),
#     (1700, 1800),
#     (1800, 1900),
#     (1900, 2000)
# )

# start_end_index_pairs = (
#     (0, 400),
#     (400, 800),
#     (800, 1200),
#     (1200, 1600),
#     (1600, 2000)
# )

start_end_index_pairs = (
    (0, 100),
    (900, 1000),
    (1900, 2000)
)

In [0]:
model = Model(input_shape=768, output_shape=100)

In [20]:
%%time

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=31)

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=51)

for sidx, eidx in start_end_index_pairs:
    print("   start index: {}, end index: {}".format(sidx,eidx))
    train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=21)

   start index: 0, end index: 100
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Loss at step 0: 0.02995
Loss at step 10: 0.00330
Loss at step 20: 0.00000
   start index: 900, end index: 1000
Loss at step 0: 0.12552
Loss at step 10: 0.07029
Loss at step 20: 0.05886
   start index: 1900, end index: 2000
Loss at step 0: 0.10404
Loss at step 10: 0.08103
Loss at step 20: 0.05531
CPU times: user 4min 53s, sys: 4.54 s, total: 4min 57s
Wall time: 4min 42s


In [21]:
sidx, eidx = 1900, 2000
print("   start index: {}, end index: {}".format(sidx,eidx))
train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=21)

   start index: 1900, end index: 2000
Loss at step 0: 0.02760
Loss at step 10: 0.00558
Loss at step 20: 0.00000


In [23]:
os.makedirs('../trained_model/tripletloss_circulum', exist_ok=True)
saver = tf.contrib.eager.Saver(model.variables)
saver.save("../trained_model/tripletloss_circulum/ckpt")



'../trained_model/tripletloss_circulum/ckpt'

### Inferece with trained model

Saver seems fragile in eager mode. I use trained model on memory.

In [0]:
test_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(test_app_df['app_id'], test_features[:, ])
}


In [0]:
sorted_keys = sorted(test_normalized_feature_dict_1000.keys())

test_feature_tensors = tf.convert_to_tensor(
    np.array([ test_normalized_feature_dict_1000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [0]:
test_extracted_features = model.frwrd_pass(test_feature_tensors).numpy()

In [27]:
test_extracted_features.shape

(1000, 100)

In [0]:
test_extracted_features_df = pd.DataFrame({ 
    'app_id':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in test_extracted_features ]
})

In [29]:
test_extracted_features_df.head(2)

Unnamed: 0,app_id,extracted_feature
0,12000862,"[-0.018699147, 0.022539414, 0.08703015, -0.094..."
1,12003258,"[0.11247784, 0.027767131, 0.012807915, 0.21946..."


In [0]:
sorted_keys = sorted(grants_normalized_feature_dict_2000.keys())

grants_feature_tensors = tf.convert_to_tensor(
    np.array([ grants_normalized_feature_dict_2000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [0]:
grants_extracted_features = model.frwrd_pass(grants_feature_tensors).numpy()

In [32]:
grants_extracted_features.shape

(2524, 100)

In [0]:
grants_extracted_features_df = pd.DataFrame({ 
    'parsed':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in grants_extracted_features ]
})

In [34]:
grants_extracted_features_df.head(2)

Unnamed: 0,extracted_feature,parsed
0,"[0.0040069236, 0.047635674, -0.0546313, 0.0289...",6837383
1,"[0.043915585, -0.16155677, -0.02712263, 0.0224...",6837647


In [35]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_extracted_features_df['parsed'], grants_extracted_features_df['extracted_feature']) } 
    for app_id, test_f 
    in zip(test_extracted_features_df['app_id'], test_extracted_features_df['extracted_feature'])
}

CPU times: user 9.85 s, sys: 96.2 ms, total: 9.95 s
Wall time: 9.95 s


In [36]:
%%time

all_ranks = []

for app_id in test_extracted_features_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 2.88 s, sys: 8.71 ms, total: 2.89 s
Wall time: 2.88 s


In [37]:
import collections
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 145, 2: 70, 4: 41, 3: 38, 5: 32, 7: 29, 6: 25, 13: 18, 12: 17, 17: 16, 8: 15, 15: 15, 10: 14, 14: 14, 11: 13, 18: 13, 9: 12, 22: 12, 30: 11, 27: 10, 40: 10, 35: 9, 29: 9, 31: 9, 20: 9, 41: 8, 19: 8, 37: 8, 55: 8, 26: 7, 32: 7, 51: 7, 25: 7, 21: 6, 46: 6, 44: 6, 16: 6, 47: 6, 139: 6, 73: 5, 23: 5, 52: 5, 53: 5, 271: 5, 54: 5, 62: 4, 28: 4, 112: 4, 24: 4, 109: 4, 83: 4, 126: 4, 110: 4, 107: 4, 75: 4, 256: 4, 33: 4, 36: 4, 127: 3, 59: 3, 60: 3, 93: 3, 58: 3, 91: 3, 63: 3, 77: 3, 67: 3, 48: 3, 72: 3, 117: 3, 89: 3, 76: 3, 43: 3, 74: 3, 135: 3, 79: 3, 86: 3, 158: 3, 227: 3, 45: 3, 87: 3, 49: 3, 120: 3, 174: 3, 134: 3, 292: 3, 39: 3, 357: 3, 154: 2, 333: 2, 220: 2, 128: 2, 84: 2, 260: 2, 81: 2, 190: 2, 68: 2, 208: 2, 287: 2, 124: 2, 237: 2, 42: 2, 166: 2, 1078: 2, 397: 2, 57: 2, 643: 2, 66: 2, 270: 2, 92: 2, 148: 2, 155: 2, 238: 2, 50: 2, 80: 2, 1148: 2, 384: 2, 105: 2, 64: 2, 195: 2, 90: 2, 189: 2, 144: 2, 113: 2, 153: 2, 235: 2, 302: 2, 359: 2, 341: 2, 641: 2, 34: 2, 100: 2, 10

In [41]:
# counter is started from 1.

sum(counter[i] for i in range(11))

421

### Use raw feature as is (for comparison)

In [47]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_target_df['parsed'] } 
    for app_id 
    in test_app_df['app_id']
}

CPU times: user 13.3 s, sys: 149 ms, total: 13.4 s
Wall time: 13.4 s


In [48]:
%%time

all_ranks = []

for app_id in test_app_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 2.89 s, sys: 8.73 ms, total: 2.9 s
Wall time: 2.89 s


In [49]:
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 178, 2: 73, 3: 46, 5: 40, 4: 38, 6: 31, 7: 19, 9: 17, 8: 16, 10: 15, 13: 13, 14: 13, 11: 13, 15: 12, 18: 12, 16: 11, 12: 10, 26: 9, 19: 9, 34: 9, 21: 8, 23: 8, 38: 7, 28: 7, 50: 6, 39: 6, 37: 6, 33: 6, 53: 6, 100: 5, 47: 5, 76: 5, 35: 5, 64: 5, 29: 5, 73: 5, 61: 5, 32: 5, 17: 5, 68: 5, 31: 5, 52: 4, 59: 4, 40: 4, 143: 4, 90: 4, 48: 4, 63: 4, 44: 4, 49: 4, 20: 4, 65: 4, 70: 4, 30: 4, 83: 3, 168: 3, 147: 3, 114: 3, 22: 3, 444: 3, 93: 3, 43: 3, 121: 3, 122: 3, 189: 3, 148: 3, 51: 3, 57: 3, 46: 3, 25: 3, 385: 3, 66: 3, 85: 3, 125: 3, 304: 3, 36: 3, 195: 3, 92: 3, 187: 3, 69: 3, 99: 3, 411: 2, 24: 2, 131: 2, 262: 2, 724: 2, 72: 2, 454: 2, 396: 2, 87: 2, 130: 2, 413: 2, 89: 2, 157: 2, 292: 2, 1182: 2, 264: 2, 133: 2, 78: 2, 174: 2, 56: 2, 67: 2, 227: 2, 134: 2, 163: 2, 97: 2, 126: 2, 517: 2, 213: 2, 128: 2, 110: 2, 112: 2, 173: 2, 62: 2, 42: 2, 179: 2, 378: 2, 155: 2, 88: 2, 376: 2, 505: 2, 183: 2, 167: 2, 105: 2, 45: 2, 209: 2, 211: 2, 74: 2, 75: 2, 415: 2, 617: 2, 169: 2, 615: 

In [50]:
sum(counter[i] for i in range(11))

473