# Similarity computations using w2v and d2v

Compute similarities among patents using word2vec and doc2vec models.  
Use the same datasets as `tfidf_nearest.ipynb`.

In [3]:
!pip3 install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/03/0a/02c7ac51565a0a5b05a07936e5559a635d5d2e8cf19801e0f00204df5ece/gensim-3.6.0-cp35-cp35m-manylinux1_x86_64.whl (23.6MB)
[K    100% |################################| 23.6MB 59kB/s  eta 0:00:01
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |################################| 1.4MB 1.1MB/s eta 0:00:01
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting requests (from smart-open>=1.2.1->gensim)
  

In [1]:
import h5py
import pandas as pd
import numpy as np
import pickle

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
import re
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
TAG_PAT = re.compile(r"<.*?>")
def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim)

In [4]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_info_target[citations_info_target.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)
    
def create_label_df():
    label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

label_df = create_label_df()

In [5]:
def predict_test_set(predict_func):
    """
    predict_func(claims) return NxM of boolean. N is len(claims). M is rownum of grants_target_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    return predictdf

In [6]:
def calc_TPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_FPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_TNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_FNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_TFPNs(preddf, labeldf):
    return calc_TPs(preddf, labeldf), calc_FPs(preddf, labeldf), calc_TNs(preddf, labeldf), calc_FNs(preddf, labeldf)

In [7]:
def calc_summary_TFPNs(TP, FP, TN, FN):
    "return acc, prec, recall, f1."
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[[(TP+TN)/(TP+FP+TN+FN), TP/(TP+FP), TP/(TP+FN), 2*TP/(2*TP+FP+FN)]])
    
def calc_summary(preddf, labeldf):
    TP, FP, TN, FN = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(TP, FP, TN, FN)

# Word2Vec model

In [8]:
from gensim.models import Word2Vec
from scipy import spatial

import multiprocessing
CPUNUM = multiprocessing.cpu_count()

In [9]:
grants_target_claims = grants_target_df['claim'].map(lambda x:x.split()).tolist()

In [10]:
%%time

w2v = Word2Vec(grants_target_claims, size=100, window=5, min_count=5, workers=CPUNUM, iter=10, hs=1)

CPU times: user 2min 21s, sys: 340 ms, total: 2min 22s
Wall time: 38.1 s


In [11]:
w2v.most_similar('machine')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('machine.', 0.5617180466651917),
 ('machine,', 0.5128198862075806),
 ('system', 0.46534883975982666),
 ('machine;', 0.44383180141448975),
 ('engine', 0.4297574758529663),
 ('machines', 0.42752546072006226),
 ('machines.', 0.4204997420310974),
 ('broadcaster', 0.40508541464805603),
 ('medium', 0.40062206983566284),
 ('refrigerator', 0.39362210035324097)]

In [12]:
grants_ids = columns=grants_target_df.parsed.values

In [13]:
np.average(w2v[['machine', 'water']], axis=0)

  """Entry point for launching an IPython kernel.


array([ 0.5916048 , -1.7218616 , -0.59667057,  0.5818731 ,  0.52734256,
       -0.512131  , -1.4294631 , -3.4880166 ,  0.3735    , -0.21845096,
       -0.72319996,  1.9481097 ,  0.3326751 , -2.0113938 , -0.4199664 ,
       -0.4454579 , -2.2155037 ,  0.44235536,  1.5916381 , -1.2711016 ,
        0.3362299 ,  0.4483676 , -2.1437216 , -0.7228693 ,  1.1525648 ,
        0.01781464, -0.90633017, -0.6028116 , -0.09912238, -1.3534193 ,
        0.3907733 , -1.2966522 , -2.0048268 , -0.26193586,  0.02206579,
        0.26357436,  1.5647645 ,  1.2191278 , -0.67076594, -0.53152335,
       -1.5127419 ,  0.3441564 ,  0.34324253, -0.07784438,  1.5538245 ,
        0.57257354,  0.2855848 ,  0.01412794,  0.843     ,  0.5111835 ,
       -0.06183404, -0.07303996,  0.272247  , -0.79330105,  1.7968342 ,
       -1.1751816 ,  0.16157393, -0.7912256 , -0.35905218, -0.24691026,
       -0.82257247, -0.95601   , -1.0435181 , -0.07435986,  0.1962175 ,
        0.336578  , -0.6567713 , -0.7824007 ,  1.2987434 ,  1.01

In [14]:
def text_to_vec(text):
    words = text.split()
    filtered_words = []
    for word in words:
        if word in w2v.wv.vocab:
            filtered_words.append(word)
    vec = np.average(w2v[filtered_words], axis=0)
    return vec

In [15]:
grants_w2v_vectors = []
for i, id in enumerate(grants_ids):
    vec = text_to_vec(grants_target_df.loc[i]['claim'])
    grants_w2v_vectors.append(vec)
grants_w2v_vectors = np.array(grants_w2v_vectors)

  import sys


In [16]:
np.argsort([6,1,2,4,5])[::-1][0:2]

array([0, 4])

In [17]:
def find_most_similar(vec):
    topN = 10
    similarities = []
    for grants_vec in grants_w2v_vectors:
        sim = 1 - spatial.distance.cosine(vec, grants_vec)
        similarities.append(sim)
    return [np.argmax(similarities)]
#     return np.argsort(similarities)[::-1][0:topN]

In [18]:
find_most_similar(grants_w2v_vectors[2,:])

[2]

In [19]:
def predict_test_set():
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for idx, appid in enumerate(testset_app_df.app_id):
        text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
        vec = text_to_vec(text)
        similar_grant_indexes = find_most_similar(vec)
        predictdf.loc[appid] = False
        for similar_grant_index in similar_grant_indexes:
            similar_grant_id = grants_ids[similar_grant_index]
            predictdf.loc[appid, similar_grant_id] = True
        if idx%100 == 0:
            print(idx)
    return predictdf

In [20]:
%%time

pred_df = predict_test_set()

  import sys


0
100
200
300
400
500
600
700
800
900
CPU times: user 2min 22s, sys: 7.47 s, total: 2min 29s
Wall time: 2min 29s


In [21]:
pred_df.head()

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13137006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12741959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12643447,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14200253,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# top1
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.999207,0.125,0.09992,0.111062


In [114]:
# # top5
# calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.997756,0.0588,0.235012,0.094065


## Example

In [23]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [24]:
idx = 0
appid = 14307191
text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index]['claim'])


 
  1 . A method to aggregate, filter, and share energy data for analysis, the method comprising:
 receiving first data associated with a first electrical circuit, the first data having a first protocol; 
 sampling the first data at a first sampling rate to generate first digital data, wherein the first sampling rate is substantially continuous; and 
 transmitting reporting digital data over a network having a network protocol different from the first protocol, the reporting digital data comprising at least the first digital data, wherein the reporting digital data is transmitted at a reporting rate that is decoupled from the first sampling rate. 
 
 
 
  2 . The method of  claim 1  further comprising:
 receiving second data associated with a second electrical circuit, the second data having a second protocol different from the first protocol; and 
 sampling the second data at a second sampling rate to generate second digital data, wherein the second sampling rate is substantially con

  import sys


In [25]:
idx = 1
text = whole_xml_to_claim(testset_app_df.loc[idx]['xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index]['claim'])


 
  1 . A display apparatus, comprising:
 a position sensor to sense an eye position of a user; 
 a controller to set a virtual viewing window corresponding to the sensed eye position of the user and to provide a control signal to generate a directional light toward the virtual viewing window; and 
 a light generator to generate a directional light based on the control signal. 
 
 
 
  2 . The display apparatus of  claim 1 , further comprising:
 a light modulator to modulate an intensity of the directional light based on the control signal. 
 
 
 
  3 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to photograph the eye position of the user. 
 
 
  4 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to identify the eye position of the user by photographing an identifier fixed around the eye position of the user. 
 
 
  5 . The display apparatus of  claim 1 , wherein the light generator compri

  import sys


## Trial and error

In [26]:
predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)

In [27]:
predictdf.loc[14307191] = False

In [28]:
predictdf.loc[14307191, 6837383] = True

In [29]:
predictdf

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
predictdf.loc[14307191, 6837383]

True

# Doc2vec model

In [8]:
grants_target_df['claim'][0]

'\n \n 1. A tool organizer for mounting to and adapted for use with a vehicle lift rack having support arms, said tool organizer comprising:\n four upstanding walls depending from a base and forming a storage volume for storing a plurality of tools and parts;  \n a selectively articulating lid enclosing said storage volume;  \n adjustable attachment means depending from said storage volume, said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement;  \n a U-shaped ring affixed to an external portion of at least one of said walls, said ring for supporting a hand tool;  \n a support plate projected from a front of said tool organizer and co-extensive with said base;  \n a plurality of intermediate apertures formed in said support plate, said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools;  \n a plurality of small apertures formed in said support plate, said plurality of smal

### Do simple text preprocessing and create datasets for doc2vec model.

In [9]:
preprocessed_grant_data = grants_target_df['claim'].str\
    .replace("\n", "", regex=False).str\
    .replace("[0-9]*\.", "", regex=True).str\
    .replace("[,:;]", "", regex=True).str\
    .replace("a |A |the |The ", "", regex=True)

In [10]:
preprocessed_grant_data[0]

'   tool organizer for mounting to and adapted for use with vehicle lift rack having support arms said tool organizer comprising four upstanding walls depending from base and forming storage volume for storing plurality of tools and parts   selectively articulating lid enclosing said storage volume   adjustable attachment means depending from said storage volume said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement   U-shaped ring affixed to an external portion of at least one of said walls said ring for supporting hand tool   support plate projected from front of said tool organizer and co-extensive with said base   plurality of intermediate apertures formed in said support plate said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools   plurality of small apertures formed in said support plate said plurality of small apertures adapted to accommodate small hand tools   at

In [11]:
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

In [12]:
d2v_grant_data = list()

for idx, elem in enumerate(preprocessed_grant_data):
    d2v_grant_data.append(
        doc2vec.TaggedDocument(simple_preprocess(elem), [str(grants_target_df.parsed[idx])])
    )

In [13]:
d2v_grant_data[0]

TaggedDocument(words=['tool', 'organizer', 'for', 'mounting', 'to', 'and', 'adapted', 'for', 'use', 'with', 'vehicle', 'lift', 'rack', 'having', 'support', 'arms', 'said', 'tool', 'organizer', 'comprising', 'four', 'upstanding', 'walls', 'depending', 'from', 'base', 'and', 'forming', 'storage', 'volume', 'for', 'storing', 'plurality', 'of', 'tools', 'and', 'parts', 'selectively', 'articulating', 'lid', 'enclosing', 'said', 'storage', 'volume', 'adjustable', 'attachment', 'means', 'depending', 'from', 'said', 'storage', 'volume', 'said', 'attachment', 'means', 'removably', 'attachable', 'to', 'said', 'support', 'arms', 'for', 'supporting', 'said', 'tool', 'organizer', 'by', 'gravity', 'impingement', 'shaped', 'ring', 'affixed', 'to', 'an', 'external', 'portion', 'of', 'at', 'least', 'one', 'of', 'said', 'walls', 'said', 'ring', 'for', 'supporting', 'hand', 'tool', 'support', 'plate', 'projected', 'from', 'front', 'of', 'said', 'tool', 'organizer', 'and', 'co', 'extensive', 'with', 'said

### doc2vec model training.

model parameters: https://radimrehurek.com/gensim/models/doc2vec.html

In [14]:
import multiprocessing
CPUNUM = multiprocessing.cpu_count()

In [15]:
%%time

model = doc2vec.Doc2Vec(
    documents=d2v_grant_data
    , dm = 1
    , epochs = 50
    , vector_size=350
    , window=5
    , min_count=5
    , workers=CPUNUM
    , seed=23)

CPU times: user 8min 55s, sys: 3.37 s, total: 8min 58s
Wall time: 2min 24s


Check word similarities.

In [16]:
model.most_similar("machine")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('computer', 0.3288521468639374),
 ('computing', 0.22295017540454865),
 ('mediaccording', 0.21654629707336426),
 ('carrier', 0.21311798691749573),
 ('processor', 0.2070944905281067),
 ('solid', 0.19546379148960114),
 ('abradable', 0.19497519731521606),
 ('writable', 0.19063375890254974),
 ('watercraft', 0.18996915221214294),
 ('browser', 0.1790914684534073)]

It looks nice.  
Next check document similarities.

In [17]:
grants_target_df.head(2)

Unnamed: 0,parsed,xml,claim
0,6837383,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A tool organizer for mounting to and ...
1,6837647,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A modular crowd and traffic control b...


In [18]:
idx = 6837383

In [19]:
model.docvecs.most_similar(str(idx))

  if np.issubdtype(vec.dtype, np.int):


[('7258401', 0.44296786189079285),
 ('7242108', 0.4299090802669525),
 ('7383954', 0.42481231689453125),
 ('6860386', 0.395189493894577),
 ('7954481', 0.37927863001823425),
 ('7384025', 0.35905197262763977),
 ('6908418', 0.3479470908641815),
 ('6991262', 0.34716877341270447),
 ('7845510', 0.34139519929885864),
 ('7253127', 0.34035345911979675)]

In [20]:
most_similar_grant_idx = int(model.docvecs.most_similar(str(idx))[0][0])

  if np.issubdtype(vec.dtype, np.int):


In [21]:
most_similar_grant_idx

7258401

Check similar documents.

In [22]:
grants_target_df[grants_target_df.parsed == idx]['claim'].values

array(['\n \n 1. A tool organizer for mounting to and adapted for use with a vehicle lift rack having support arms, said tool organizer comprising:\n four upstanding walls depending from a base and forming a storage volume for storing a plurality of tools and parts;  \n a selectively articulating lid enclosing said storage volume;  \n adjustable attachment means depending from said storage volume, said attachment means removably attachable to said support arms for supporting said tool organizer by gravity impingement;  \n a U-shaped ring affixed to an external portion of at least one of said walls, said ring for supporting a hand tool;  \n a support plate projected from a front of said tool organizer and co-extensive with said base;  \n a plurality of intermediate apertures formed in said support plate, said plurality of intermediate apertures adapted sized to accommodate intermediate and large hand tools;  \n a plurality of small apertures formed in said support plate, said plurality 

In [23]:
grants_target_df[grants_target_df.parsed == most_similar_grant_idx]['claim'].values

array(['\n \n 1. A method of sitting, wherein said method comprises the steps of:\n a. securing a portable chair to an upright support structure; \n b. extending a support rod proximate a base of the support structure; \n c. wherein said chair comprises fabric material, folding said fabric material into fan-folds; and \n d. securing said fan-folds together via a grommet. \n \n \n'],
      dtype=object)

Are these similar? It's not easy to judge, but these are both physical devices?

Let's continue to check model predictions

### Model prediction for test app data

Create the same type data of testset_app_df as we did for grant_target_df.

In [24]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [25]:
testset_app_df['claim'] = testset_app_df["xml"].map(whole_xml_to_claim)

In [26]:
testset_app_df.head()

Unnamed: 0,app_id,xml,claim
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A method to aggregate, filter, and ..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A display apparatus, comprising:\n ..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 - 33 . (canceled) \n \n \n 34 . A co...
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A terminal fitting formed by bendin...
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A printer for printing a three-dime...


In [27]:
preprocessed_testset_app_data = testset_app_df['claim'].str\
    .replace("\n", "", regex=False).str\
    .replace("[0-9]*\.", "", regex=True).str\
    .replace("[,:;]", "", regex=True).str\
    .replace("a |A |the |The ", "", regex=True)

In [28]:
preprocessed_testset_app_data = testset_app_df['claim']

In [29]:
vec = model.infer_vector( simple_preprocess(preprocessed_testset_app_data[0]) )

In [30]:
vec

array([ 1.4193389e+00,  4.9787712e-01,  1.1977789e+00, -2.3458123e+00,
       -2.5139294e+00,  1.3345280e+00, -1.3750316e+00, -1.1853079e+00,
       -1.0399116e+00, -3.0595894e+00,  1.2986062e+00, -4.6496108e-01,
       -1.0849931e+00,  5.7383204e-01, -8.3700776e-01, -2.4948289e+00,
        3.6967799e-01,  6.3146768e+00,  2.0904517e+00,  3.2004350e-01,
       -3.9391303e+00, -2.3947566e+00,  3.3929551e+00,  4.3394918e+00,
       -3.6151779e+00, -7.7018893e-01,  1.9713643e+00, -1.3699533e+00,
        2.7648537e+00, -6.2558722e-01, -5.1446646e-01,  2.8439567e+00,
        3.8421149e+00,  1.9443833e+00,  2.6537316e+00, -6.0759254e+00,
       -2.2043023e+00,  1.9617833e+00, -3.3114133e+00, -2.5946164e+00,
        2.1793137e-01,  4.1760902e+00, -1.3126152e+00,  1.7040106e+00,
        1.5103260e+00, -1.1026165e+00,  1.6513299e+00,  4.2930894e+00,
        1.1730182e+00, -1.4524143e+00, -1.4717603e+00, -1.2430451e+00,
        6.8538934e-01, -2.4203212e+00,  3.6162165e-01,  1.1418265e+00,
      

In [31]:
model.docvecs.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('8104041', 0.2958594858646393),
 ('7719957', 0.2648068964481354),
 ('8214829', 0.24946874380111694),
 ('7453937', 0.2493516206741333),
 ('6914242', 0.2491285502910614),
 ('7620127', 0.2432052046060562),
 ('7304681', 0.23867309093475342),
 ('8035392', 0.23826931416988373),
 ('7057392', 0.23457486927509308),
 ('7252786', 0.23387867212295532)]

In [32]:
[int(grantid) for grantid,similarity in model.docvecs.most_similar([vec], topn=1)]

  if np.issubdtype(vec.dtype, np.int):


[8104041]

In [33]:
def predict_test_set():
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for idx, appid in enumerate(testset_app_df.app_id):
        text = preprocessed_testset_app_data[idx]
        vec = model.infer_vector(simple_preprocess(text))
        
        similar_grant_indexes = [int(grantid) 
                                 for grantid, similarity 
                                 in model.docvecs.most_similar([vec], topn=1)]

        predictdf.loc[appid] = False
        for similar_grant_index in similar_grant_indexes:
            predictdf.loc[appid, similar_grant_index] = True
        if idx%100 == 0:
            print(idx)
    
    return predictdf

In [34]:
%%time

pred_df = predict_test_set()

  if np.issubdtype(vec.dtype, np.int):


0
100
200
300
400
500
600
700
800
900
CPU times: user 3min 55s, sys: 4min 32s, total: 8min 27s
Wall time: 2min 44s


In [35]:
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.999263,0.195,0.155875,0.173256
