# Tripletloss model with circulum learning

- Extract features using pretrained BERT
- Prepare data for circulum tripletloss learning

## Extract features using pretrained BERT

You need execute the following cells just once.

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
training_app_df.head(2)

Unnamed: 0,app_id,xml
0,14222691,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12515852,"<us-patent-application lang=""EN"" dtd-version=""..."


In [4]:
testset_app_df.head(2)

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."


In [5]:
grants_target_df.head(2)

Unnamed: 0,parsed,xml
0,6837383,"<us-patent-grant lang=""EN"" dtd-version=""v40 20..."
1,6837647,"<us-patent-grant lang=""EN"" dtd-version=""v40 20..."


In [6]:
import re
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
TAG_PAT = re.compile(r"<.*?>")
LB_PAT = re.compile(r'[\t\n\r\f\v][" "]*')

def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

def remove_linebreak_from_claim(claim):
    '''
    Remove line break symbol "\n" with space(s).
    '''
    return LB_PAT.sub('', claim)

In [7]:
training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)
testset_app_df["claim"] = testset_app_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)
grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)

Check duplication.

In [8]:
len(training_app_df.drop_duplicates(keep='first'))

1000

Use BERT for feature extractions.

In [9]:
import sys
sys.path.append("../notebook/bert")

In [10]:
import os
import re
import csv
import time
import codecs
import collections
import tempfile

import modeling
import optimization
import tokenization
import tensorflow as tf

In [11]:
tf.__version__

'1.12.0'

In [12]:
from extract_features import convert_examples_to_features
from extract_features import InputExample
from extract_features import read_examples
from extract_features import _truncate_seq_pair
from extract_features import InputFeatures
from extract_features import input_fn_builder
from extract_features import model_fn_builder

In [13]:
def create_tempfile_of_claim(df):
    tmpf = tempfile.NamedTemporaryFile(mode='r+')
    df['claim'].to_csv(tmpf.name, header=None, index=None)
    # Remove double quotations
    !sed -i -e 's/\"//g' {tmpf.name}
    
    return tmpf

In [14]:
train_tmpf = create_tempfile_of_claim(training_app_df)
test_tmpf = create_tempfile_of_claim(testset_app_df)
grants_tmpf = create_tempfile_of_claim(grants_target_df)

In [15]:
class FLAGS(object):
    '''Parameters.'''
    def __init__(self):
        self.vocab_file = "./bert/model/uncased_L-12_H-768_A-12/vocab.txt"
        self.do_lower_case = True
        self.use_tpu = False
        self.layers = "-1"
        self.bert_config_file = "./bert/model/uncased_L-12_H-768_A-12/bert_config.json"
        self.max_seq_length = 512
        self.init_checkpoint = "./bert/model/uncased_L-12_H-768_A-12/bert_model.ckpt"
        self.use_one_hot_embeddings = False
        self.batch_size = 16
        
        # The following parameters are not used in predictions.
        # Just use to create RunConfig.
        self.master = None
        self.save_checkpoints_steps = 1
        self.iterations_per_loop = 1
        self.num_tpu_cores = 1
        self.learning_rate = 0
        self.num_warmup_steps = 0
        self.num_train_steps = 0
        self.train_batch_size = 0
        self.eval_batch_size = 0

FLAGS = FLAGS()

In [16]:
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

tokenizer = tokenization.FullTokenizer(
  vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  master=FLAGS.master,
  tpu_config=tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=is_per_host))

In [17]:
model_fn = model_fn_builder(
  bert_config=bert_config,
  init_checkpoint=FLAGS.init_checkpoint,
  layer_indexes=layer_indexes,
  use_tpu=FLAGS.use_tpu,
  use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

In [18]:
estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=FLAGS.use_tpu,
  model_fn=model_fn,
  config=run_config,
  predict_batch_size=FLAGS.batch_size)

INFO:tensorflow:Using config: {'_service': None, '_master': '', '_device_fn': None, '_model_dir': '/tmp/tmpq1_gmf5h', '_task_type': 'worker', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f551c731a58>, '_protocol': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_eval_distribute': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_num_worker_replicas': 1, '_num_ps_replicas': 0, '_task_id': 0, '_experimental_distribute': None, '_cluster': None, '_keep_checkpoint_max': 5, '_train_distribute': None, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_log_step_count_steps': None, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_evaluation_master': '', '_global_id_i

In [19]:
def extract_features(fpath):
    examples = read_examples(fpath)
    features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

    results =  np.empty((0,768), float)

    for result in estimator.predict(input_fn, yield_single_examples=True):
        results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )
    
    return results

In [27]:
training_app_feature_df = pd.DataFrame({
    'app_id' : training_app_df['app_id'],
    'feature' : [elem for elem in extract_features(train_tmpf.name)]
})

training_app_feature_df.to_pickle("../data/training_app_feature_1000.pkl")

In [26]:
testset_app_feature_df = pd.DataFrame({
    'app_id' : testset_app_df['app_id'],
    'feature' : [elem for elem in extract_features(test_tmpf.name)]
})

testset_app_feature_df.to_pickle("../data/testset_app_feature_1000.pkl")

In [25]:
grants_app_feature_df = pd.DataFrame({
    'parsed' : grants_target_df['parsed'],
    'feature' : [elem for elem in extract_features(grants_tmpf.name)]
})

grants_app_feature_df.to_pickle("../data/grants_feature_2000.pkl")

## Prepare data for circulum tripletloss learning

You need execute the following cells everytime.

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import OrderedDict

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
train_app_feature_1000 = pd.read_pickle("../data/training_app_feature_1000.pkl")
test_app_feature_1000 = pd.read_pickle("../data/testset_app_feature_1000.pkl")
grants_feature_2000 = pd.read_pickle("../data/grants_feature_2000.pkl")

In [3]:
citations_info_target.head(2)

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
0,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7391316,7391316,H20LX5QGPXXIFW4,103.0,a,1,0,1,...,1,0,1,0,0,0,0,1,2,0
1,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,102.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0


In [4]:
train_app_feature_1000.head(2)

Unnamed: 0,app_id,feature
0,14222691,"[-0.8012277483940125, 0.3682347536087036, -0.4..."
1,12515852,"[-1.1272308826446533, -0.17207229137420654, -0..."


In [5]:
grants_feature_2000.head(2)

Unnamed: 0,parsed,feature
0,6837383,"[-0.9908803701400757, 0.2743351459503174, -0.0..."
1,6837647,"[-0.6510910391807556, 0.13378533720970154, -0...."


In [6]:
train_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(train_app_feature_1000['app_id'], train_app_feature_1000['feature'])
}

In [7]:
grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_feature_2000['parsed'], grants_feature_2000['feature'])
}

In [8]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(train_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_feature_2000['parsed'] } 
    for app_id 
    in train_app_feature_1000['app_id']
}

CPU times: user 19.4 s, sys: 194 ms, total: 19.6 s
Wall time: 19.6 s


In [9]:
list(sim_dict.keys())[0]

12411468

In [10]:
sorted( sim_dict[12411468].values() )[:5]

[0.5762169750690092,
 0.6097554784789876,
 0.6223759222249304,
 0.6421321976972119,
 0.6643671272890019]

In [11]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]

In [12]:
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])

In [13]:
def make_uncited_grants_for_app_id(sim_dict, citations_info_target, app_id, sidx, eidx, num):
    '''
    input:
        sim_dict: 
        citations_info_target:
        app_id: target application id
        sidx: start index to slice the sorted (parsed, sim) list
        eidx: end index to slice the sorted (parsed, sim) list
        num: number of grants that will be returned
    return:
        [parsed_1, parsed_2, ..., parsed_num] that are NOT cited to reject app_id
    '''
    sorted_grants_list = sort_similarity_by_value(sim_dict, app_id)
    sorted_grants_list = sorted_grants_list[sidx:eidx]
    
    cited_grants = get_cited_grants(citations_info_target, app_id)
    uncited_grants = []
    
    idx = 0
    while idx < num:
        grant_id, _ = sorted_grants_list[idx]
        if not grant_id in cited_grants:
            uncited_grants.append(grant_id)
        idx += 1
    
    return uncited_grants

In [14]:
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7214786, 7355042, 7129390, 7384920]

In [15]:
def create_triplet_pairs(sidx, eidx):
    all_elems = []
    
    for app_id in train_app_feature_1000['app_id']:
        cited_grants = get_cited_grants(citations_info_target, app_id)
        num_cited_grants = len(cited_grants)
        uncited_grants = make_uncited_grants_for_app_id(
            sim_dict, citations_info_target, app_id, sidx, eidx, num_cited_grants)
        
        for idx, cited in enumerate(cited_grants):
            all_elems.append([app_id, cited, uncited_grants[idx]])
    
    result_df = pd.DataFrame(all_elems)
    result_df.columns = ['app_id', 'cited_grants', 'uncited_grants']
    
    return result_df

In [16]:
%%time

test = create_triplet_pairs(1900, 2000)

CPU times: user 4.25 s, sys: 93.7 ms, total: 4.34 s
Wall time: 4.11 s


In [17]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,8064198
1,14222691,8177561,7682966


In [18]:
len(test)

1282

## Train model

In [19]:
import os
import tensorflow as tf

tf.enable_eager_execution()
tfe = tf.contrib.eager

In [20]:
class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tfe.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tfe.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [21]:
def tripletloss(anchor_out, positive_out, negative_out, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1)
    
    loss = tf.maximum(0.0, margin + d_pos - d_neg)
    
    return tf.reduce_mean(loss)

In [22]:
def train(input_data_np, batch_size, epochs):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    data_num = int(input_data_np.shape[1])

    for i in range(epochs):
        rand_idx = np.random.permutation(data_num)
        index_data_np = np.array([
            input_data_np[0][rand_idx], 
            input_data_np[1][rand_idx], 
            input_data_np[2][rand_idx]])

        input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
        anchor_data, positive_data, negative_data = input_data

        for iter_id in range(data_num // batch_size):        
            with tf.GradientTape() as tape:
                anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
                positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
                negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
                curr_loss = tripletloss(anchor_out, positive_out, negative_out)
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

        if i % 10 == 0:
            print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

In [23]:
def create_training_input_np(sidx, eidx):
    anchor_list = []
    positive_list = []
    negative_list = []
    
    triplet_pairs = create_triplet_pairs(sidx, eidx)
    
    for row in triplet_pairs.itertuples():
        anchor_list.append(train_normalized_feature_dict_1000[row.app_id])
        positive_list.append(grants_normalized_feature_dict_2000[row.cited_grants])
        negative_list.append(grants_normalized_feature_dict_2000[row.uncited_grants])
    
    return np.array([np.array(anchor_list), np.array(positive_list), np.array(negative_list)])

In [24]:
%time

test = create_training_input_np(1900, 2000)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs


In [25]:
test[:,0:5]

array([[[-0.04901844,  0.02252829, -0.02457967, ..., -0.00074622,
         -0.0088564 ,  0.00703908],
        [-0.04901844,  0.02252829, -0.02457967, ..., -0.00074622,
         -0.0088564 ,  0.00703908],
        [-0.04901844,  0.02252829, -0.02457967, ..., -0.00074622,
         -0.0088564 ,  0.00703908],
        [-0.06715591, -0.01025138, -0.02449416, ..., -0.03210071,
         -0.00928766,  0.02977114],
        [-0.06563298, -0.00142188, -0.03430097, ..., -0.02237011,
         -0.0149513 ,  0.0319426 ]],

       [[-0.05816157,  0.00442324, -0.01381707, ...,  0.00989325,
         -0.00308412,  0.01191179],
        [-0.06699397,  0.00166932, -0.02735891, ..., -0.00586027,
         -0.02177938,  0.03040199],
        [-0.04891689,  0.00812041, -0.03092893, ..., -0.00889881,
         -0.01907724,  0.02222425],
        [-0.06176343, -0.00969397, -0.03617494, ..., -0.03812347,
          0.01204742,  0.04816776],
        [-0.07063474, -0.00655207, -0.02171202, ..., -0.0406709 ,
         -0.01

In [26]:
model = Model(input_shape=768, output_shape=100)

In [131]:
# %%time

# ### EASY examples
# train(test, 10, 50)

Loss at step 0: 0.09596
Loss at step 10: 0.04093
Loss at step 20: 0.00000
Loss at step 30: 0.00000
Loss at step 40: 0.00000
CPU times: user 1min 30s, sys: 5.2 s, total: 1min 35s
Wall time: 1min 19s


In [27]:
%%time

### HARD examples
train(test, 10, 50)

Loss at step 0: 0.19671
Loss at step 10: 0.19589
Loss at step 20: 0.19483
Loss at step 30: 0.19342
Loss at step 40: 0.19149
CPU times: user 1min 32s, sys: 5.38 s, total: 1min 38s
Wall time: 1min 20s


# ===== Trial and error =====

In [52]:
s = sort_similarity_by_value(sim_dict, 12411468)

In [53]:
s[0:5]

[(7214786, 0.5762169750690092),
 (7355042, 0.6097554784789876),
 (7129390, 0.6223759222249304),
 (7384920, 0.6421321976972119),
 (7007804, 0.6643671272890019)]

In [59]:
test = [(idx,parsed) for idx,(parsed,sim) in enumerate(s) if parsed == 7061154]

In [60]:
test

[(1733, 7061154)]

In [61]:
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 1730, 1750, 10)

[7326126,
 6980185,
 7722528,
 7404613,
 7578602,
 7715103,
 7347024,
 7484979,
 7248463]

In [34]:
citations_info_target[citations_info_target['app_id'] == 12411468]

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
4089,12411468,/work/data/apps/2010/ipa100930/F_2468.xml,7061154,7061154,GJZUK1ELPPOPPY5,102.0,b,1,0,1,...,1,0,0,0,0,1,0,1,2,3
4090,12411468,/work/data/apps/2010/ipa100930/F_2468.xml,7061154,7061154,GJZUK1ELPPOPPY5,103.0,a,1,0,1,...,1,0,0,0,0,1,0,1,2,3


In [37]:
set(citations_info_target[citations_info_target['app_id'] == 12411468]['parsed'])

{7061154}

In [39]:
get_cited_grants(citations_info_target, 12411468)

{7061154}