# Tripletloss model with circulum learning

- Extract features using pretrained BERT
- Prepare data for circulum tripletloss learning

## Extract features using pretrained BERT

You need execute the following cells just once.

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
training_app_df.head(2)

Unnamed: 0,app_id,xml
0,14222691,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12515852,"<us-patent-application lang=""EN"" dtd-version=""..."


In [4]:
testset_app_df.head(2)

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."


In [5]:
grants_target_df.head(2)

Unnamed: 0,parsed,xml
0,6837383,"<us-patent-grant lang=""EN"" dtd-version=""v40 20..."
1,6837647,"<us-patent-grant lang=""EN"" dtd-version=""v40 20..."


In [6]:
import re
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
TAG_PAT = re.compile(r"<.*?>")
LB_PAT = re.compile(r'[\t\n\r\f\v][" "]*')

def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

def remove_linebreak_from_claim(claim):
    '''
    Remove line break symbol "\n" with space(s).
    '''
    return LB_PAT.sub('', claim)

In [7]:
training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)
testset_app_df["claim"] = testset_app_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)
grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim).map(remove_linebreak_from_claim)

Check duplication.

In [8]:
len(training_app_df.drop_duplicates(keep='first'))

1000

Use BERT for feature extractions.

In [9]:
import sys
sys.path.append("../notebook/bert")

In [10]:
import os
import re
import csv
import time
import codecs
import collections
import tempfile

import modeling
import optimization
import tokenization
import tensorflow as tf

In [11]:
tf.__version__

'1.12.0'

In [12]:
from extract_features import convert_examples_to_features
from extract_features import InputExample
from extract_features import read_examples
from extract_features import _truncate_seq_pair
from extract_features import InputFeatures
from extract_features import input_fn_builder
from extract_features import model_fn_builder

In [13]:
def create_tempfile_of_claim(df):
    tmpf = tempfile.NamedTemporaryFile(mode='r+')
    df['claim'].to_csv(tmpf.name, header=None, index=None)
    # Remove double quotations
    !sed -i -e 's/\"//g' {tmpf.name}
    
    return tmpf

In [14]:
train_tmpf = create_tempfile_of_claim(training_app_df)
test_tmpf = create_tempfile_of_claim(testset_app_df)
grants_tmpf = create_tempfile_of_claim(grants_target_df)

In [15]:
class FLAGS(object):
    '''Parameters.'''
    def __init__(self):
        self.vocab_file = "./bert/model/uncased_L-12_H-768_A-12/vocab.txt"
        self.do_lower_case = True
        self.use_tpu = False
        self.layers = "-1"
        self.bert_config_file = "./bert/model/uncased_L-12_H-768_A-12/bert_config.json"
        self.max_seq_length = 512
        self.init_checkpoint = "./bert/model/uncased_L-12_H-768_A-12/bert_model.ckpt"
        self.use_one_hot_embeddings = False
        self.batch_size = 16
        
        # The following parameters are not used in predictions.
        # Just use to create RunConfig.
        self.master = None
        self.save_checkpoints_steps = 1
        self.iterations_per_loop = 1
        self.num_tpu_cores = 1
        self.learning_rate = 0
        self.num_warmup_steps = 0
        self.num_train_steps = 0
        self.train_batch_size = 0
        self.eval_batch_size = 0

FLAGS = FLAGS()

In [16]:
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

tokenizer = tokenization.FullTokenizer(
  vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  master=FLAGS.master,
  tpu_config=tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=is_per_host))

In [17]:
model_fn = model_fn_builder(
  bert_config=bert_config,
  init_checkpoint=FLAGS.init_checkpoint,
  layer_indexes=layer_indexes,
  use_tpu=FLAGS.use_tpu,
  use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

In [18]:
estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=FLAGS.use_tpu,
  model_fn=model_fn,
  config=run_config,
  predict_batch_size=FLAGS.batch_size)

INFO:tensorflow:Using config: {'_service': None, '_master': '', '_device_fn': None, '_model_dir': '/tmp/tmpq1_gmf5h', '_task_type': 'worker', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f551c731a58>, '_protocol': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_eval_distribute': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_num_worker_replicas': 1, '_num_ps_replicas': 0, '_task_id': 0, '_experimental_distribute': None, '_cluster': None, '_keep_checkpoint_max': 5, '_train_distribute': None, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_log_step_count_steps': None, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_evaluation_master': '', '_global_id_i

In [19]:
def extract_features(fpath):
    examples = read_examples(fpath)
    features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

    results =  np.empty((0,768), float)

    for result in estimator.predict(input_fn, yield_single_examples=True):
        results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )
    
    return results

In [27]:
training_app_feature_df = pd.DataFrame({
    'app_id' : training_app_df['app_id'],
    'feature' : [elem for elem in extract_features(train_tmpf.name)]
})

training_app_feature_df.to_pickle("../data/training_app_feature_1000.pkl")

In [26]:
testset_app_feature_df = pd.DataFrame({
    'app_id' : testset_app_df['app_id'],
    'feature' : [elem for elem in extract_features(test_tmpf.name)]
})

testset_app_feature_df.to_pickle("../data/testset_app_feature_1000.pkl")

In [25]:
grants_app_feature_df = pd.DataFrame({
    'parsed' : grants_target_df['parsed'],
    'feature' : [elem for elem in extract_features(grants_tmpf.name)]
})

grants_app_feature_df.to_pickle("../data/grants_feature_2000.pkl")

## Prepare data for circulum tripletloss learning

You need execute the following cells everytime.

In [1]:
import random
import pandas as pd
import numpy as np
import pickle
from collections import OrderedDict

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
train_app_feature_1000 = pd.read_pickle("../data/training_app_feature_1000.pkl")
test_app_feature_1000 = pd.read_pickle("../data/testset_app_feature_1000.pkl")
grants_feature_2000 = pd.read_pickle("../data/grants_feature_2000.pkl")

In [3]:
citations_info_target.head(2)

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
0,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7391316,7391316,H20LX5QGPXXIFW4,103.0,a,1,0,1,...,1,0,1,0,0,0,0,1,2,0
1,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,102.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0


In [4]:
train_app_feature_1000.head(2)

Unnamed: 0,app_id,feature
0,14222691,"[-0.8012277483940125, 0.3682347536087036, -0.4..."
1,12515852,"[-1.1272308826446533, -0.17207229137420654, -0..."


In [5]:
grants_feature_2000.head(2)

Unnamed: 0,parsed,feature
0,6837383,"[-0.9908803701400757, 0.2743351459503174, -0.0..."
1,6837647,"[-0.6510910391807556, 0.13378533720970154, -0...."


In [6]:
train_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(train_app_feature_1000['app_id'], train_app_feature_1000['feature'])
}

In [7]:
grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_feature_2000['parsed'], grants_feature_2000['feature'])
}

In [8]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(train_normalized_feature_dict_1000[app_id]*grants_normalized_feature_dict_2000[parsed])
            for parsed 
            in grants_feature_2000['parsed'] } 
    for app_id 
    in train_app_feature_1000['app_id']
}

CPU times: user 17.6 s, sys: 261 ms, total: 17.8 s
Wall time: 17.8 s


In [9]:
list(sim_dict.keys())[0]

12411468

In [10]:
sorted( sim_dict[12411468].values() )[:5]

[0.5762169750690092,
 0.6097554784789876,
 0.6223759222249304,
 0.6421321976972119,
 0.6643671272890019]

In [11]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]

In [12]:
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])

In [13]:
def make_uncited_grants_for_app_id(sim_dict, citations_info_target, app_id, sidx, eidx, num, shuffle=True):
    '''
    input:
        sim_dict: 
        citations_info_target:
        app_id: target application id
        sidx: start index to slice the sorted (parsed, sim) list
        eidx: end index to slice the sorted (parsed, sim) list
        num: number of grants that will be returned
    return:
        [parsed_1, parsed_2, ..., parsed_num] that are NOT cited to reject app_id
    '''
    sorted_grants_list = sort_similarity_by_value(sim_dict, app_id)
    sorted_grants_list = sorted_grants_list[sidx:eidx]
    if shuffle:
        random.shuffle(sorted_grants_list)
    
    cited_grants = get_cited_grants(citations_info_target, app_id)
    uncited_grants = []
    
    idx = 0
    while len(uncited_grants) != num:
        grant_id, _ = sorted_grants_list[idx]
        if not grant_id in cited_grants:
            uncited_grants.append(grant_id)
        idx += 1
    
    return uncited_grants

To return different uncited grants each call, change random seed as below.

In [14]:
random.seed(0)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7307125, 7375078, 7528825, 7122517]

In [15]:
random.seed(1)
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 0, 100, 4)

[7300765, 7132574, 7337285, 7168415]

In [16]:
def create_triplet_pairs(sidx, eidx):
    all_elems = []
    
    for app_id in train_app_feature_1000['app_id']:
        cited_grants = get_cited_grants(citations_info_target, app_id)
        num_cited_grants = len(cited_grants)
        uncited_grants = make_uncited_grants_for_app_id(
            sim_dict, citations_info_target, app_id, sidx, eidx, num_cited_grants)
        
        for idx, cited in enumerate(cited_grants):
            all_elems.append([app_id, cited, uncited_grants[idx]])
    
    result_df = pd.DataFrame(all_elems)
    result_df.columns = ['app_id', 'cited_grants', 'uncited_grants']
    
    return result_df

In [17]:
%%time

random.seed(0)
test = create_triplet_pairs(0, 100)

CPU times: user 3.88 s, sys: 88.3 ms, total: 3.96 s
Wall time: 3.81 s


In [18]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,7419945
1,14222691,8177561,7263952


In [19]:
len(test)

1282

In [20]:
random.seed(1)
test = create_triplet_pairs(0, 100)

In [21]:
test.head(2)

Unnamed: 0,app_id,cited_grants,uncited_grants
0,14222691,8206188,6872766
1,14222691,8177561,7213468


## Train model

In [22]:
import os
import tensorflow as tf

tf.enable_eager_execution()
tfe = tf.contrib.eager

In [23]:
class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tfe.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tfe.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [24]:
def tripletloss(anchor_out, positive_out, negative_out, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1)
    
    loss = tf.maximum(0.0, margin + d_pos - d_neg)
    
    return tf.reduce_mean(loss)

In [25]:
def create_training_input_np(sidx, eidx):
    anchor_list = []
    positive_list = []
    negative_list = []
    
    triplet_pairs = create_triplet_pairs(sidx, eidx)
    
    for row in triplet_pairs.itertuples():
        anchor_list.append(train_normalized_feature_dict_1000[row.app_id])
        positive_list.append(grants_normalized_feature_dict_2000[row.cited_grants])
        negative_list.append(grants_normalized_feature_dict_2000[row.uncited_grants])
    
    return np.array([np.array(anchor_list), np.array(positive_list), np.array(negative_list)])

In [26]:
def train_with_changing_negative_pair(sidx, eidx, batch_size, epochs):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    
    seed = 0
    for i in range(epochs):
        seed += 1
        random.seed(seed)
        
        input_data_np = create_training_input_np(sidx, eidx)
        data_num = int(input_data_np.shape[1])
        rand_idx = np.random.permutation(data_num)
        index_data_np = np.array([
            input_data_np[0][rand_idx], 
            input_data_np[1][rand_idx], 
            input_data_np[2][rand_idx]])

        input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
        anchor_data, positive_data, negative_data = input_data

        for iter_id in range(data_num // batch_size):        
            with tf.GradientTape() as tape:
                anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
                positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
                negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
                curr_loss = tripletloss(anchor_out, positive_out, negative_out)
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

        if i % 10 == 0:
            print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

In [27]:
# start_end_index_pairs = (
#     (0, 100),
#     (100, 200),
#     (200, 300),
#     (300, 400),
#     (400, 500),
#     (500, 600),
#     (600, 700),
#     (700, 800),
#     (800, 900),
#     (900, 1000),
#     (1000, 1100),
#     (1100, 1200),
#     (1200, 1300),
#     (1300, 1400),
#     (1400, 1500),
#     (1500, 1600),
#     (1600, 1700),
#     (1700, 1800),
#     (1800, 1900),
#     (1900, 2000)
# )

# start_end_index_pairs = (
#     (0, 400),
#     (400, 800),
#     (800, 1200),
#     (1200, 1600),
#     (1600, 2000)
# )

start_end_index_pairs = (
    (0, 100),
    (900, 1000),
    (1900, 2000)
)

In [28]:
model = Model(input_shape=768, output_shape=100)

In [29]:
%%time

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=31)

# for sidx, eidx in start_end_index_pairs:
#     print("   start index: {}, end index: {}".format(sidx,eidx))
#     train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=51)

for sidx, eidx in start_end_index_pairs:
    print("   start index: {}, end index: {}".format(sidx,eidx))
    train_with_changing_negative_pair(sidx, eidx, batch_size=10, epochs=21)

   start index: 0, end index: 100
Loss at step 0: 0.13761
Loss at step 10: 0.12413
Loss at step 20: 0.07982
   start index: 900, end index: 1000
Loss at step 0: 0.16748
Loss at step 10: 0.16927
Loss at step 20: 0.14770
   start index: 1900, end index: 2000
Loss at step 0: 0.19712
Loss at step 10: 0.18856
Loss at step 20: 0.19363
CPU times: user 6min, sys: 11.8 s, total: 6min 11s
Wall time: 5min 43s


In [33]:
os.makedirs('../trained_model/tripletloss_circulum', exist_ok=True)
saver = tfe.Saver(model.variables)
saver.save("../trained_model/tripletloss_circulum/ckpt")

## Inferece with trained model

In [36]:
import pandas as pd
import numpy as np
import pickle

In [37]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
test_app_feature_1000 = pd.read_pickle("../data/testset_app_feature_1000.pkl")
grants_feature_2000 = pd.read_pickle("../data/grants_feature_2000.pkl")

In [38]:
test_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(test_app_feature_1000['app_id'], test_app_feature_1000['feature'])
}

grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_feature_2000['parsed'], grants_feature_2000['feature'])
}

In [39]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]

In [40]:
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])

In [41]:
import os
import tensorflow as tf

tf.enable_eager_execution()
tfe = tf.contrib.eager

class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tfe.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tfe.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [2]:
model = Model(input_shape=768, output_shape=100)
tfe.Saver((model.variables)).restore("../trained_model/tripletloss_circulum/ckpt")

In [42]:
sorted_keys = sorted(test_normalized_feature_dict_1000.keys())

test_feature_tensors = tf.convert_to_tensor(
    np.array([ test_normalized_feature_dict_1000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [43]:
test_extracted_features = model.frwrd_pass(test_feature_tensors).numpy()

In [44]:
test_extracted_features.shape

(1000, 100)

In [45]:
test_extracted_features_df = pd.DataFrame({ 
    'app_id':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in test_extracted_features ]
})

In [46]:
test_extracted_features_df.head(2)

Unnamed: 0,app_id,extracted_feature
0,12000862,"[0.07616987, 0.0624041, -0.10485105, 0.0196705..."
1,12003258,"[0.1295955, 0.03865939, -0.06351629, 0.1217989..."


In [47]:
sorted_keys = sorted(grants_normalized_feature_dict_2000.keys())

grants_feature_tensors = tf.convert_to_tensor(
    np.array([ grants_normalized_feature_dict_2000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [48]:
grants_extracted_features = model.frwrd_pass(grants_feature_tensors).numpy()

In [49]:
grants_extracted_features.shape

(2524, 100)

In [50]:
grants_extracted_features_df = pd.DataFrame({ 
    'parsed':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in grants_extracted_features ]
})

In [51]:
grants_extracted_features_df.head(2)

Unnamed: 0,extracted_feature,parsed
0,"[0.05692685, 0.033602506, -0.16931649, 0.07056...",6837383
1,"[0.10059198, 0.011689524, -0.08415088, 0.09282...",6837647


In [52]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_extracted_features_df['parsed'], grants_extracted_features_df['extracted_feature']) } 
    for app_id, test_f 
    in zip(test_extracted_features_df['app_id'], test_extracted_features_df['extracted_feature'])
}

CPU times: user 14 s, sys: 103 ms, total: 14.1 s
Wall time: 14.1 s


In [53]:
%%time

all_ranks = []

for app_id in test_extracted_features_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 3.56 s, sys: 78.6 ms, total: 3.64 s
Wall time: 3.48 s


In [49]:
# import collections
# counter = collections.Counter(all_ranks)
# print(counter)

Counter({1: 61, 5: 32, 2: 22, 3: 20, 8: 17, 4: 15, 6: 15, 7: 15, 17: 12, 10: 11, 11: 11, 9: 10, 16: 10, 12: 8, 52: 8, 35: 7, 43: 7, 48: 7, 55: 7, 69: 7, 13: 6, 14: 6, 18: 6, 19: 6, 22: 6, 23: 6, 27: 6, 29: 6, 36: 6, 37: 6, 62: 6, 81: 6, 88: 6, 99: 6, 108: 6, 15: 5, 20: 5, 21: 5, 26: 5, 28: 5, 41: 5, 42: 5, 51: 5, 63: 5, 66: 5, 71: 5, 83: 5, 92: 5, 118: 5, 125: 5, 133: 5, 24: 4, 32: 4, 33: 4, 34: 4, 39: 4, 44: 4, 47: 4, 56: 4, 79: 4, 97: 4, 107: 4, 114: 4, 131: 4, 134: 4, 154: 4, 155: 4, 160: 4, 189: 4, 216: 4, 221: 4, 25: 3, 30: 3, 38: 3, 40: 3, 45: 3, 75: 3, 76: 3, 77: 3, 87: 3, 90: 3, 96: 3, 101: 3, 103: 3, 104: 3, 115: 3, 122: 3, 126: 3, 135: 3, 140: 3, 150: 3, 163: 3, 179: 3, 186: 3, 31: 3, 200: 3, 211: 3, 249: 3, 172: 3, 267: 3, 271: 3, 293: 3, 319: 3, 322: 3, 373: 3, 378: 3, 406: 3, 418: 3, 430: 3, 436: 3, 452: 3, 475: 3, 121: 3, 136: 3, 49: 2, 50: 2, 53: 2, 57: 2, 59: 2, 60: 2, 61: 2, 64: 2, 68: 2, 72: 2, 1036: 2, 78: 2, 85: 2, 89: 2, 93: 2, 94: 2, 100: 2, 105: 2, 110: 2, 117: 2

In [63]:
# import collections
# counter = collections.Counter(all_ranks)
# print(counter)

Counter({1: 48, 2: 33, 3: 21, 4: 18, 8: 13, 6: 12, 11: 12, 5: 10, 7: 10, 9: 10, 10: 10, 24: 10, 13: 9, 17: 9, 27: 9, 16: 8, 23: 8, 34: 8, 12: 7, 21: 7, 28: 7, 47: 7, 14: 6, 15: 6, 18: 6, 40: 6, 45: 6, 57: 6, 84: 6, 105: 6, 139: 6, 144: 6, 20: 5, 36: 5, 41: 5, 53: 5, 69: 5, 71: 5, 88: 5, 89: 5, 91: 5, 22: 4, 31: 4, 33: 4, 37: 4, 39: 4, 42: 4, 43: 4, 54: 4, 55: 4, 59: 4, 65: 4, 96: 4, 115: 4, 123: 4, 149: 4, 163: 4, 342: 4, 375: 4, 85: 4, 19: 3, 26: 3, 29: 3, 30: 3, 32: 3, 44: 3, 48: 3, 49: 3, 50: 3, 52: 3, 56: 3, 62: 3, 63: 3, 67: 3, 73: 3, 74: 3, 76: 3, 78: 3, 80: 3, 81: 3, 83: 3, 90: 3, 92: 3, 108: 3, 114: 3, 120: 3, 122: 3, 130: 3, 145: 3, 153: 3, 160: 3, 161: 3, 166: 3, 182: 3, 187: 3, 194: 3, 195: 3, 215: 3, 269: 3, 287: 3, 308: 3, 400: 3, 66: 3, 449: 3, 496: 3, 102: 3, 661: 3, 127: 3, 773: 3, 35: 2, 61: 2, 64: 2, 77: 2, 1109: 2, 86: 2, 94: 2, 95: 2, 99: 2, 100: 2, 101: 2, 104: 2, 107: 2, 109: 2, 112: 2, 116: 2, 121: 2, 131: 2, 135: 2, 136: 2, 141: 2, 143: 2, 146: 2, 148: 2, 152: 2

In [54]:
import collections
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 61, 2: 28, 3: 18, 4: 16, 6: 14, 5: 12, 10: 12, 8: 10, 13: 9, 17: 9, 7: 9, 16: 9, 12: 6, 14: 6, 25: 6, 26: 6, 29: 6, 32: 6, 38: 6, 42: 6, 48: 6, 51: 6, 68: 6, 71: 6, 76: 6, 11: 5, 19: 5, 20: 5, 21: 5, 23: 5, 24: 5, 28: 5, 31: 5, 35: 5, 36: 5, 39: 5, 50: 5, 54: 5, 62: 5, 86: 5, 87: 5, 43: 5, 59: 5, 158: 5, 18: 4, 22: 4, 27: 4, 30: 4, 41: 4, 44: 4, 45: 4, 49: 4, 56: 4, 60: 4, 77: 4, 90: 4, 92: 4, 108: 4, 145: 4, 148: 4, 152: 4, 207: 4, 225: 4, 279: 4, 441: 4, 467: 4, 15: 3, 34: 3, 37: 3, 46: 3, 53: 3, 55: 3, 63: 3, 64: 3, 67: 3, 72: 3, 78: 3, 79: 3, 82: 3, 88: 3, 94: 3, 103: 3, 109: 3, 118: 3, 120: 3, 125: 3, 132: 3, 133: 3, 146: 3, 173: 3, 185: 3, 199: 3, 218: 3, 222: 3, 240: 3, 296: 3, 308: 3, 334: 3, 347: 3, 395: 3, 1502: 3, 593: 3, 83: 3, 188: 3, 615: 3, 638: 3, 675: 3, 808: 3, 151: 3, 910: 3, 153: 3, 9: 2, 33: 2, 40: 2, 80: 2, 81: 2, 84: 2, 85: 2, 93: 2, 97: 2, 98: 2, 102: 2, 105: 2, 107: 2, 111: 2, 113: 2, 117: 2, 2068: 2, 127: 2, 128: 2, 129: 2, 137: 2, 138: 2, 139: 2, 

# ===== Trial and error =====

In [49]:
sim_dict.keys()

dict_keys([12087297, 12670979, 12486657, 12201992, 13316105, 13502478, 14227109, 14219282, 14264323, 13541397, 12592473, 12662714, 13314075, 14090586, 12597279, 14350339, 13647905, 12206114, 13574179, 12816421, 12269607, 12069212, 13606954, 12560430, 12004701, 14483504, 14053428, 14030174, 13512760, 13983801, 13809722, 12134069, 12859456, 12601696, 12613701, 14155849, 14750269, 14314509, 12724304, 13635939, 12417108, 14205013, 12609623, 12204121, 12087388, 14327904, 14529894, 14409833, 13828202, 13670507, 14368999, 13181037, 12824687, 12255344, 13902501, 12216434, 12107892, 12433086, 13309289, 13668475, 13709436, 13259797, 13934720, 13584513, 13707396, 13137006, 13457773, 13144209, 12843154, 14245571, 12347542, 14404633, 13588632, 12349593, 13517811, 14186742, 12712098, 13652131, 14330851, 12204201, 13922475, 13277357, 13951151, 13217968, 13774195, 13303989, 14002889, 12867769, 13729978, 12945601, 12482756, 12417222, 12491809, 13703372, 12742690, 12531919, 14250192, 12544209, 12359890,

In [53]:
sort_similarity_by_value( sim_dict, 12087297 )

[(6845583, -0.331146),
 (7040489, -0.32731208),
 (7640556, -0.2882068),
 (7003994, -0.26676342),
 (7501056, -0.2597713),
 (7673934, -0.25661272),
 (7861458, -0.2540089),
 (7249572, -0.24756268),
 (7484979, -0.24713153),
 (7967867, -0.2459397),
 (7239031, -0.23855984),
 (6837383, -0.23687981),
 (7886370, -0.23618148),
 (8016792, -0.23477368),
 (6981455, -0.23439565),
 (7150440, -0.23147926),
 (6991262, -0.2303494),
 (7136267, -0.22872514),
 (7373709, -0.22250536),
 (6945263, -0.22194904),
 (7394021, -0.21785505),
 (7633779, -0.21664618),
 (7373031, -0.21622634),
 (6877929, -0.21458593),
 (7900011, -0.21453314),
 (7917396, -0.21443552),
 (8066742, -0.21353787),
 (6959993, -0.21318282),
 (7121956, -0.2124558),
 (6959931, -0.20990063),
 (6843081, -0.2086722),
 (7381917, -0.20806542),
 (7831644, -0.20751971),
 (7409963, -0.20607749),
 (7015791, -0.20604499),
 (7279031, -0.20601332),
 (7438346, -0.20504527),
 (7976156, -0.20435822),
 (7261455, -0.2014785),
 (6928662, -0.20146552),
 (7864538,

In [26]:
model = Model(input_shape=768, output_shape=100)

In [131]:
# %%time

# ### EASY examples
# train(test, 10, 50)

Loss at step 0: 0.09596
Loss at step 10: 0.04093
Loss at step 20: 0.00000
Loss at step 30: 0.00000
Loss at step 40: 0.00000
CPU times: user 1min 30s, sys: 5.2 s, total: 1min 35s
Wall time: 1min 19s


In [27]:
# %%time

# ### HARD examples
# train(test, 10, 50)

Loss at step 0: 0.19671
Loss at step 10: 0.19589
Loss at step 20: 0.19483
Loss at step 30: 0.19342
Loss at step 40: 0.19149
CPU times: user 1min 32s, sys: 5.38 s, total: 1min 38s
Wall time: 1min 20s


In [52]:
s = sort_similarity_by_value(sim_dict, 12411468)

In [53]:
s[0:5]

[(7214786, 0.5762169750690092),
 (7355042, 0.6097554784789876),
 (7129390, 0.6223759222249304),
 (7384920, 0.6421321976972119),
 (7007804, 0.6643671272890019)]

In [59]:
test = [(idx,parsed) for idx,(parsed,sim) in enumerate(s) if parsed == 7061154]

In [60]:
test

[(1733, 7061154)]

In [61]:
make_uncited_grants_for_app_id(sim_dict, citations_info_target, 12411468, 1730, 1750, 10)

[7326126,
 6980185,
 7722528,
 7404613,
 7578602,
 7715103,
 7347024,
 7484979,
 7248463]

In [34]:
citations_info_target[citations_info_target['app_id'] == 12411468]

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
4089,12411468,/work/data/apps/2010/ipa100930/F_2468.xml,7061154,7061154,GJZUK1ELPPOPPY5,102.0,b,1,0,1,...,1,0,0,0,0,1,0,1,2,3
4090,12411468,/work/data/apps/2010/ipa100930/F_2468.xml,7061154,7061154,GJZUK1ELPPOPPY5,103.0,a,1,0,1,...,1,0,0,0,0,1,0,1,2,3


In [37]:
set(citations_info_target[citations_info_target['app_id'] == 12411468]['parsed'])

{7061154}

In [39]:
get_cited_grants(citations_info_target, 12411468)

{7061154}