# Similarity computations using w2v and d2v

Compute similarities among patents using word2vec and doc2vec models.  
Use the same datasets as `tfidf_nearest.ipynb`.

In [1]:
import h5py
import pandas as pd
import numpy as np
import pickle

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
import re
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
TAG_PAT = re.compile(r"<.*?>")
def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim)

In [4]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_info_target[citations_info_target.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)
    
def create_label_df():
    label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

label_df = create_label_df()

In [5]:
def predict_test_set(predict_func):
    """
    predict_func(claims) return NxM of boolean. N is len(claims). M is rownum of grants_target_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    return predictdf

In [6]:
def calc_TPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_FPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_TNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_FNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_TFPNs(preddf, labeldf):
    return calc_TPs(preddf, labeldf), calc_FPs(preddf, labeldf), calc_TNs(preddf, labeldf), calc_FNs(preddf, labeldf)

In [7]:
def calc_summary_TFPNs(TP, FP, TN, FN):
    "return acc, prec, recall, f1."
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[[(TP+TN)/(TP+FP+TN+FN), TP/(TP+FP), TP/(TP+FN), 2*TP/(2*TP+FP+FN)]])
    
def calc_summary(preddf, labeldf):
    TP, FP, TN, FN = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(TP, FP, TN, FN)

# Word2Vec model

In [8]:
from gensim.models import Word2Vec
from scipy import spatial

import multiprocessing
CPUNUM = multiprocessing.cpu_count()

In [9]:
grants_target_claims = grants_target_df['claim'].map(lambda x:x.split()).tolist()

In [10]:
%%time

w2v = Word2Vec(grants_target_claims, size=100, window=5, min_count=5, workers=CPUNUM, iter=10, hs=1)

CPU times: user 2min 33s, sys: 503 ms, total: 2min 33s
Wall time: 41.5 s


In [11]:
w2v.most_similar('machine')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('machine.', 0.5728158950805664),
 ('machines', 0.521472692489624),
 ('sensor', 0.4440922737121582),
 ('medium', 0.4248672127723694),
 ('system', 0.42129936814308167),
 ('resource', 0.421298086643219),
 ('engine', 0.41965097188949585),
 ('head', 0.41125422716140747),
 ('machine,', 0.3992379903793335),
 ('machines.', 0.39488452672958374)]

In [12]:
grants_ids = columns=grants_target_df.parsed.values

In [13]:
np.average(w2v[['machine', 'water']], axis=0)

  """Entry point for launching an IPython kernel.


array([ 1.22583437e+00,  2.94116902e+00, -7.02696562e-01, -1.63346851e+00,
       -6.11121655e-01, -3.79381090e-01, -4.10424769e-01, -1.05069315e+00,
        7.15401590e-01,  1.04958832e+00,  4.27327931e-01,  2.72838235e-01,
       -1.11034185e-01,  1.55474141e-01, -2.44855344e-01, -2.97003776e-01,
        9.55128372e-02,  2.02698398e+00,  1.53319991e+00,  1.67445585e-01,
        7.58966804e-01, -7.08682299e-01,  9.35697317e-01,  3.93907398e-01,
       -8.91628742e-01,  2.13811350e+00, -4.44612712e-01,  6.26344860e-01,
        4.05613840e-01, -4.01553422e-01,  1.79918563e+00, -9.92178321e-02,
        3.92593646e+00,  3.39302957e-01,  8.49752426e-01,  7.92777061e-01,
        1.16824222e+00,  3.13373089e-01, -1.73695636e+00, -1.29960227e+00,
       -3.47704053e-01,  8.32710981e-01,  1.72713184e+00, -1.08539021e+00,
        1.06838310e+00,  6.88125789e-02, -5.47692776e-01,  3.93617094e-01,
        5.41976094e-02,  5.57674468e-01,  4.18339908e-01,  1.03454053e-01,
       -1.65046543e-01, -

In [14]:
def text_to_vec(text):
    words = text.split()
    filtered_words = []
    for word in words:
        if word in w2v.wv.vocab:
            filtered_words.append(word)
    vec = np.average(w2v[filtered_words], axis=0)
    return vec

In [15]:
grants_w2v_vectors = []
for i, id in enumerate(grants_ids):
    vec = text_to_vec(grants_target_df.loc[i]['claim'])
    grants_w2v_vectors.append(vec)
grants_w2v_vectors = np.array(grants_w2v_vectors)

  import sys


In [16]:
np.argsort([6,1,2,4,5])[::-1][0:2]

array([0, 4])

In [17]:
def find_most_similar(vec):
    topN = 5
    similarities = []
    for grants_vec in grants_w2v_vectors:
        sim = 1 - spatial.distance.cosine(vec, grants_vec)
        similarities.append(sim)
    return [np.argmax(similarities)]
#     return np.argsort(similarities)[::-1][0:topN]

In [18]:
find_most_similar(grants_w2v_vectors[2,:])

[2]

In [19]:
def predict_test_set():
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for idx, appid in enumerate(testset_app_df.app_id):
        text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
        vec = text_to_vec(text)
        similar_grant_indexes = find_most_similar(vec)
        predictdf.loc[appid] = False
        for similar_grant_index in similar_grant_indexes:
            similar_grant_id = grants_ids[similar_grant_index]
            predictdf.loc[appid, similar_grant_id] = True
        if idx%100 == 0:
            print(idx)
    return predictdf

In [20]:
%%time

pred_df = predict_test_set()

  import sys


0
100
200
300
400
500
600
700
800
900
CPU times: user 2min 42s, sys: 8.16 s, total: 2min 50s
Wall time: 2min 50s


In [21]:
pred_df.head()

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13137006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12741959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12643447,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14200253,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# top1
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.999214,0.133,0.106315,0.11817


In [23]:
# # top5
# calc_summary(pred_df, label_df)

## Example

In [24]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [25]:
idx = 0
appid = 14307191
text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index]['claim'])


 
  1 . A method to aggregate, filter, and share energy data for analysis, the method comprising:
 receiving first data associated with a first electrical circuit, the first data having a first protocol; 
 sampling the first data at a first sampling rate to generate first digital data, wherein the first sampling rate is substantially continuous; and 
 transmitting reporting digital data over a network having a network protocol different from the first protocol, the reporting digital data comprising at least the first digital data, wherein the reporting digital data is transmitted at a reporting rate that is decoupled from the first sampling rate. 
 
 
 
  2 . The method of  claim 1  further comprising:
 receiving second data associated with a second electrical circuit, the second data having a second protocol different from the first protocol; and 
 sampling the second data at a second sampling rate to generate second digital data, wherein the second sampling rate is substantially con

  import sys


In [26]:
idx = 1
text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index]['claim'])


 
  1 . A display apparatus, comprising:
 a position sensor to sense an eye position of a user; 
 a controller to set a virtual viewing window corresponding to the sensed eye position of the user and to provide a control signal to generate a directional light toward the virtual viewing window; and 
 a light generator to generate a directional light based on the control signal. 
 
 
 
  2 . The display apparatus of  claim 1 , further comprising:
 a light modulator to modulate an intensity of the directional light based on the control signal. 
 
 
 
  3 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to photograph the eye position of the user. 
 
 
  4 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to identify the eye position of the user by photographing an identifier fixed around the eye position of the user. 
 
 
  5 . The display apparatus of  claim 1 , wherein the light generator compri

  import sys


## Trial and error

In [26]:
predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)

In [27]:
predictdf.loc[14307191] = False

In [28]:
predictdf.loc[14307191, 6837383] = True

In [29]:
predictdf

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
predictdf.loc[14307191, 6837383]

True

# Doc2vec model

In [27]:
grants_target_df['claim'][0]

'\n \n 1. A tool organizer for mounting to and adapted for use with a vehicle lift rack having support arms, said tool organizer comprising:\n four upstanding walls depending from a base and forming a storage volume for storing a plurality of tools and parts;  \n a selectively articulating lid enclosing said storage volume;  \n adjustable attachment means depending from said storage volume, said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement;  \n a U-shaped ring affixed to an external portion of at least one of said walls, said ring for supporting a hand tool;  \n a support plate projected from a front of said tool organizer and co-extensive with said base;  \n a plurality of intermediate apertures formed in said support plate, said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools;  \n a plurality of small apertures formed in said support plate, said plurality of smal

### Do simple text preprocessing and create datasets for doc2vec model.

In [28]:
preprocessed_grant_data = grants_target_df['claim'].str\
    .replace("\n", "", regex=False).str\
    .replace("[0-9]*\.", "", regex=True).str\
    .replace("[,:;]", "", regex=True).str\
    .replace("a |A |the |The ", "", regex=True)

In [29]:
preprocessed_grant_data[0]

'   tool organizer for mounting to and adapted for use with vehicle lift rack having support arms said tool organizer comprising four upstanding walls depending from base and forming storage volume for storing plurality of tools and parts   selectively articulating lid enclosing said storage volume   adjustable attachment means depending from said storage volume said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement   U-shaped ring affixed to an external portion of at least one of said walls said ring for supporting hand tool   support plate projected from front of said tool organizer and co-extensive with said base   plurality of intermediate apertures formed in said support plate said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools   plurality of small apertures formed in said support plate said plurality of small apertures adapted to accommodate small hand tools   at

In [30]:
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

In [31]:
d2v_grant_data = list()

for idx, elem in enumerate(preprocessed_grant_data):
    d2v_grant_data.append(
        doc2vec.TaggedDocument(simple_preprocess(elem), [str(grants_target_df.parsed[idx])])
    )

In [32]:
d2v_grant_data[0]

TaggedDocument(words=['tool', 'organizer', 'for', 'mounting', 'to', 'and', 'adapted', 'for', 'use', 'with', 'vehicle', 'lift', 'rack', 'having', 'support', 'arms', 'said', 'tool', 'organizer', 'comprising', 'four', 'upstanding', 'walls', 'depending', 'from', 'base', 'and', 'forming', 'storage', 'volume', 'for', 'storing', 'plurality', 'of', 'tools', 'and', 'parts', 'selectively', 'articulating', 'lid', 'enclosing', 'said', 'storage', 'volume', 'adjustable', 'attachment', 'means', 'depending', 'from', 'said', 'storage', 'volume', 'said', 'attachment', 'means', 'removably', 'attachable', 'to', 'said', 'support', 'arms', 'for', 'supporting', 'said', 'tool', 'organizer', 'by', 'gravity', 'impingement', 'shaped', 'ring', 'affixed', 'to', 'an', 'external', 'portion', 'of', 'at', 'least', 'one', 'of', 'said', 'walls', 'said', 'ring', 'for', 'supporting', 'hand', 'tool', 'support', 'plate', 'projected', 'from', 'front', 'of', 'said', 'tool', 'organizer', 'and', 'co', 'extensive', 'with', 'said

### doc2vec model training.

model parameters: https://radimrehurek.com/gensim/models/doc2vec.html

In [33]:
import multiprocessing
CPUNUM = multiprocessing.cpu_count()

In [34]:
%%time

model = doc2vec.Doc2Vec(
    documents=d2v_grant_data
    , dm = 1
    , epochs = 50
    , vector_size=350
    , window=5
    , min_count=5
    , workers=CPUNUM
    , seed=23)

CPU times: user 9min 30s, sys: 4.62 s, total: 9min 35s
Wall time: 2min 36s


Check word similarities.

In [35]:
model.most_similar("machine")

  """Entry point for launching an IPython kernel.


[('computer', 0.2695177495479584),
 ('writable', 0.21770721673965454),
 ('framework', 0.21354065835475922),
 ('dedicated', 0.20993411540985107),
 ('mediaccording', 0.20617230236530304),
 ('piezoelectric', 0.194503515958786),
 ('computing', 0.19321362674236298),
 ('faucet', 0.19124718010425568),
 ('pawl', 0.19111208617687225),
 ('processor', 0.18989260494709015)]

It looks nice.  
Next check document similarities.

In [36]:
grants_target_df.head(2)

Unnamed: 0,parsed,xml,claim
0,6837383,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A tool organizer for mounting to and ...
1,6837647,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A modular crowd and traffic control b...


In [37]:
idx = 6837383

In [38]:
model.docvecs.most_similar(str(idx))

[('7242108', 0.422192245721817),
 ('7258401', 0.42090556025505066),
 ('6860386', 0.41485100984573364),
 ('7383954', 0.3827245533466339),
 ('6991262', 0.3511444926261902),
 ('8332466', 0.3463149666786194),
 ('8291649', 0.34570398926734924),
 ('7384920', 0.3456510901451111),
 ('7213816', 0.34189537167549133),
 ('7604048', 0.34058135747909546)]

In [39]:
most_similar_grant_idx = int(model.docvecs.most_similar(str(idx))[0][0])

In [40]:
most_similar_grant_idx

7242108

Check similar documents.

In [41]:
grants_target_df[grants_target_df.parsed == idx]['claim'].values

array(['\n \n 1. A tool organizer for mounting to and adapted for use with a vehicle lift rack having support arms, said tool organizer comprising:\n four upstanding walls depending from a base and forming a storage volume for storing a plurality of tools and parts;  \n a selectively articulating lid enclosing said storage volume;  \n adjustable attachment means depending from said storage volume, said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement;  \n a U-shaped ring affixed to an external portion of at least one of said walls, said ring for supporting a hand tool;  \n a support plate projected from a front of said tool organizer and co-extensive with said base;  \n a plurality of intermediate apertures formed in said support plate, said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools;  \n a plurality of small apertures formed in said support plate, said plurality 

In [42]:
grants_target_df[grants_target_df.parsed == most_similar_grant_idx]['claim'].values

array(['\n \n 1. A wind-actuated electric power alternator, comprising:\n an impeller enclosure having a generally triangular cross-sectional configuration; \n an impeller rotatably mounted in said impeller enclosure; and \n a alternator coupled to said impeller. \n \n \n \n 2. The wind-powered electric power alternator of  claim 1  wherein said impeller enclosure has a generally equilateral triangular configuration. \n \n \n 3. The wind-powered electric power alternator of  claim 1  further comprising a base and wherein said impeller enclosure is carried by said base. \n \n \n 4. The wind-powered electric power alternator of  claim 1  wherein said impeller comprises an impeller shaft coupled to said alternator, an impeller body carried by said impeller shaft and a plurality of impeller blades and a base plate carried by said impeller body. \n \n \n 5. The wind-powered electric power alternator of  claim 4  wherein said plurality of impeller blades each has a generally curved cross-sec

Are these similar? It's not easy to judge, but these are both physical devices?

Let's continue to check model predictions

### Model prediction for test app data

Create the same type data of testset_app_df as we did for grant_target_df.

In [43]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [44]:
testset_app_df['claim'] = testset_app_df["xml"].map(whole_xml_to_claim)

In [45]:
testset_app_df.head()

Unnamed: 0,app_id,xml,claim
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A method to aggregate, filter, and ..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A display apparatus, comprising:\n ..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 - 33 . (canceled) \n \n \n 34 . A co...
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A terminal fitting formed by bendin...
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A printer for printing a three-dime...


In [46]:
preprocessed_testset_app_data = testset_app_df['claim'].str\
    .replace("\n", "", regex=False).str\
    .replace("[0-9]*\.", "", regex=True).str\
    .replace("[,:;]", "", regex=True).str\
    .replace("a |A |the |The ", "", regex=True)

In [47]:
preprocessed_testset_app_data = testset_app_df['claim']

In [48]:
vec = model.infer_vector( simple_preprocess(preprocessed_testset_app_data[0]) )

In [49]:
vec

array([ 2.6444149e+00, -3.6477394e+00, -3.7531040e+00, -2.5432411e-01,
       -1.8252881e+00,  5.6969414e+00, -4.0051522e+00,  2.0534508e+00,
       -9.0166771e-01, -2.3761058e+00,  4.5725179e+00,  3.4028869e+00,
       -9.2733127e-01, -3.1435108e-01,  2.5389445e+00, -2.1533847e-01,
        1.9849111e+00, -3.9761400e-01,  2.5782940e+00,  1.0789891e+00,
       -1.3914300e+00,  3.1551330e+00, -3.2262039e+00,  1.2582591e+00,
        1.0275270e+00,  2.4785342e+00,  1.6779217e+00, -1.1197709e+00,
        5.8774769e-01,  1.5935812e+00,  6.8448937e-01, -8.2386208e-01,
       -1.1830628e+00, -4.4023366e+00, -1.7895831e+00, -3.5797238e+00,
       -3.7738645e-01, -2.0456574e+00,  2.2015042e+00,  5.9964943e+00,
       -1.4091744e-01, -3.3812180e+00, -1.4377692e+00, -2.5217264e+00,
       -4.6142802e+00,  2.5616918e+00,  3.7756991e-02,  3.5073392e+00,
       -1.0132335e+00,  1.2888126e-01, -1.2981948e+00,  1.3106352e+00,
       -9.3405135e-03, -3.4578588e+00,  1.3138810e+00, -1.6047982e+00,
      

In [50]:
model.docvecs.most_similar([vec])

[('7057392', 0.25322583317756653),
 ('8104041', 0.24960508942604065),
 ('8214829', 0.2410489022731781),
 ('7250464', 0.2402666211128235),
 ('7898807', 0.2377089560031891),
 ('6852787', 0.23684214055538177),
 ('6914242', 0.23216034471988678),
 ('7252786', 0.2257317304611206),
 ('7453937', 0.22069600224494934),
 ('7620127', 0.21937915682792664)]

In [51]:
[int(grantid) for grantid,similarity in model.docvecs.most_similar([vec], topn=1)]

[7057392]

In [52]:
def predict_test_set():
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for idx, appid in enumerate(testset_app_df.app_id):
        text = preprocessed_testset_app_data[idx]
        vec = model.infer_vector(simple_preprocess(text))
        
        similar_grant_indexes = [int(grantid) 
                                 for grantid, similarity 
                                 in model.docvecs.most_similar([vec], topn=1)]

        predictdf.loc[appid] = False
        for similar_grant_index in similar_grant_indexes:
            predictdf.loc[appid, similar_grant_index] = True
        if idx%100 == 0:
            print(idx)
    
    return predictdf

In [53]:
%%time

pred_df = predict_test_set()

0
100
200
300
400
500
600
700
800
900
CPU times: user 4min 20s, sys: 4min 17s, total: 8min 38s
Wall time: 2min 51s


In [54]:
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.999247,0.175,0.139888,0.155486


### Evaluation like `tripletloss_loss_model_with_circulum_learning`.

In [71]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]

In [72]:
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])

In [58]:
testset_app_df.head(2)

Unnamed: 0,app_id,xml,claim
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A method to aggregate, filter, and ..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A display apparatus, comprising:\n ..."


In [78]:
# simple_preprocess(preprocessed_testset_app_data[0])

In [79]:
test_extracted_features_df = pd.DataFrame({ 
    'app_id':testset_app_df['app_id'], 
    'extracted_feature':[ model.infer_vector(simple_preprocess(v))/np.linalg.norm(model.infer_vector(simple_preprocess(v))) for v in preprocessed_testset_app_data ]
})

In [80]:
test_extracted_features_df.head(2)

Unnamed: 0,app_id,extracted_feature
0,14307191,"[0.060070653, -0.07921251, -0.07815885, -0.000..."
1,13137006,"[-0.030348752, -0.02185373, -0.020556105, 0.02..."


In [81]:
grants_extracted_features_df = pd.DataFrame({ 
    'parsed':grants_target_df['parsed'], 
    'extracted_feature':[ model.infer_vector(simple_preprocess(v))/np.linalg.norm(model.infer_vector(simple_preprocess(v))) for v in preprocessed_grant_data ]
})

In [82]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_extracted_features_df['parsed'], grants_extracted_features_df['extracted_feature']) } 
    for app_id, test_f 
    in zip(test_extracted_features_df['app_id'], test_extracted_features_df['extracted_feature'])
}

CPU times: user 13.9 s, sys: 0 ns, total: 13.9 s
Wall time: 13.9 s


In [83]:
%%time

all_ranks = []

for app_id in test_extracted_features_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 3.53 s, sys: 0 ns, total: 3.53 s
Wall time: 3.43 s


In [84]:
import collections
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 198, 2: 76, 3: 54, 5: 41, 7: 35, 4: 27, 6: 27, 9: 21, 8: 18, 12: 18, 13: 18, 10: 15, 11: 13, 14: 11, 21: 11, 16: 10, 20: 10, 15: 9, 18: 9, 56: 9, 17: 8, 25: 8, 34: 8, 19: 7, 24: 7, 26: 7, 32: 7, 68: 7, 23: 6, 29: 6, 35: 6, 40: 6, 46: 6, 53: 6, 77: 6, 37: 5, 43: 5, 50: 5, 52: 5, 57: 5, 172: 5, 22: 4, 27: 4, 28: 4, 33: 4, 44: 4, 45: 4, 60: 4, 62: 4, 73: 4, 80: 4, 92: 4, 93: 4, 103: 4, 129: 4, 145: 4, 154: 4, 85: 4, 30: 3, 31: 3, 38: 3, 41: 3, 42: 3, 47: 3, 51: 3, 54: 3, 55: 3, 64: 3, 65: 3, 71: 3, 79: 3, 81: 3, 82: 3, 90: 3, 101: 3, 107: 3, 120: 3, 131: 3, 135: 3, 149: 3, 302: 3, 350: 3, 388: 3, 36: 2, 39: 2, 49: 2, 58: 2, 61: 2, 63: 2, 66: 2, 67: 2, 70: 2, 75: 2, 76: 2, 78: 2, 83: 2, 86: 2, 88: 2, 91: 2, 98: 2, 99: 2, 108: 2, 109: 2, 111: 2, 112: 2, 117: 2, 128: 2, 132: 2, 133: 2, 137: 2, 142: 2, 150: 2, 153: 2, 156: 2, 159: 2, 162: 2, 163: 2, 166: 2, 173: 2, 189: 2, 194: 2, 196: 2, 198: 2, 211: 2, 213: 2, 233: 2, 238: 2, 248: 2, 278: 2, 288: 2, 291: 2, 296: 2, 332: 2, 339: 