In [25]:
import h5py
import pandas as pd
import numpy as np
import pickle
from gensim.models import Word2Vec

In [3]:
# Load the pre-built, gzip-pickled DataFrames this notebook works on.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load files from a trusted source.
# Columns read later in this notebook:
#   citations_info_target -> app_id, parsed
#   testset_app_df        -> app_id, xml
#   grants_target_df      -> parsed, xml
# training_app_df is not referenced in the visible part of this notebook.
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [12]:
import re

# Captures everything between the first `<claims ...>` opening tag and the last
# `</claims>` closing tag (the group is greedy; DOTALL lets it span newlines).
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
# Matches one markup tag at a time (non-greedy).
TAG_PAT = re.compile(r"<.*?>")


def whole_xml_to_claim_xml(whole):
    """Return the inner XML of the ``<claims>`` element of a patent document.

    Parameters
    ----------
    whole : str
        Full XML text of one patent grant or application.

    Returns
    -------
    str
        The markup found between ``<claims ...>`` and ``</claims>``.

    Raises
    ------
    ValueError
        If the document contains no ``<claims>...</claims>`` element.
        (Previously this surfaced as an opaque ``AttributeError`` on ``None``.)
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        raise ValueError("no <claims>...</claims> element found in document")
    return mat.group(1)


def whole_xml_to_claim(whole):
    """Return the claims of a patent document as text, with every tag replaced by a space.

    Raises ``ValueError`` when the document has no ``<claims>`` element.
    """
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

# Derive a plain-text "claim" column from each grant's raw XML.
grants_target_df["claim"] = grants_target_df["xml"].apply(whole_xml_to_claim)

In [5]:
def set_one_answer_appid(labeldf, oneappid):
    """Write the ground-truth row for one application into ``labeldf`` (in place).

    The row labelled ``oneappid`` is set to a boolean array that is True for
    every column of ``labeldf`` appearing in the ``parsed`` values of
    ``citations_info_target`` rows whose ``app_id`` equals ``oneappid``.
    """
    rows_for_app = citations_info_target.app_id == oneappid
    cited = citations_info_target.loc[rows_for_app, "parsed"]
    labeldf.loc[oneappid] = labeldf.columns.isin(cited)
    
def create_label_df():
    """Build the ground-truth citation matrix for the test set.

    Returns
    -------
    pandas.DataFrame
        One row per ``testset_app_df.app_id`` and one column per
        ``grants_target_df.parsed`` value; a cell is True when the
        application cites that grant according to ``citations_info_target``.
    """
    # `np.bool` was only an alias of the builtin `bool`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

# Ground-truth matrix (test applications x target grants) used by the calc_* metric helpers.
label_df = create_label_df()

In [6]:
def predict_test_set(predict_func):
    """Run ``predict_func`` over the test-set claims and return the prediction matrix.

    Parameters
    ----------
    predict_func : callable
        Called once with the Series of claim texts extracted from
        ``testset_app_df["xml"]``. It must return an N x M boolean array,
        where N is the number of claims and M is the number of rows of
        ``grants_target_df``; element (n, m) says whether claim n is
        predicted to cite the grant in row m of ``grants_target_df``.

    Returns
    -------
    pandas.DataFrame
        One row per ``testset_app_df.app_id``, one column per
        ``grants_target_df.parsed`` value.

    Notes
    -----
    A later cell in this notebook defines a zero-argument function with the
    same name, which replaces this one once that cell has been executed.
    """
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    return predictdf

In [7]:
def _count_cells(preddf, labeldf, predicted, actual, app_ids=None):
    """Count cells whose prediction equals ``predicted`` and whose label equals ``actual``.

    Only the rows listed in ``app_ids`` are visited; when ``app_ids`` is None
    the rows of ``testset_app_df.app_id`` are used.
    """
    if app_ids is None:
        app_ids = testset_app_df.app_id
    total = 0
    for one_appid in app_ids:
        label_row = labeldf.loc[one_appid]
        # Keep only the columns whose ground-truth value equals `actual`.
        selected = preddf.loc[one_appid][label_row if actual else ~label_row]
        # Vectorised count instead of a Python-level sum over the Series.
        total += (selected == predicted).sum()
    return total


def calc_TPs(preddf, labeldf, app_ids=None):
    """Number of true positives: predicted True where the label is True."""
    return _count_cells(preddf, labeldf, True, True, app_ids)


def calc_FPs(preddf, labeldf, app_ids=None):
    """Number of false positives: predicted True where the label is False."""
    return _count_cells(preddf, labeldf, True, False, app_ids)


def calc_TNs(preddf, labeldf, app_ids=None):
    """Number of true negatives: predicted False where the label is False."""
    return _count_cells(preddf, labeldf, False, False, app_ids)


def calc_FNs(preddf, labeldf, app_ids=None):
    """Number of false negatives: predicted False where the label is True."""
    return _count_cells(preddf, labeldf, False, True, app_ids)


def calc_TFPNs(preddf, labeldf, app_ids=None):
    """Return the confusion counts as a ``(TP, FP, TN, FN)`` tuple.

    ``preddf`` and ``labeldf`` are boolean frames indexed by application id
    with one column per grant. ``app_ids`` selects the rows to score and
    defaults to ``testset_app_df.app_id``.
    """
    return (
        calc_TPs(preddf, labeldf, app_ids),
        calc_FPs(preddf, labeldf, app_ids),
        calc_TNs(preddf, labeldf, app_ids),
        calc_FNs(preddf, labeldf, app_ids),
    )

In [8]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def calc_summary_TFPNs(TP, FP, TN, FN):
    """Summarise confusion counts as accuracy, precision, recall and F1.

    Parameters
    ----------
    TP, FP, TN, FN : int
        True-positive, false-positive, true-negative and false-negative counts.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``acc``, ``prec``, ``recall``, ``f1``.
        A metric whose denominator is zero is reported as NaN instead of
        raising ``ZeroDivisionError`` for plain Python integers.
    """
    row = [
        _safe_ratio(TP + TN, TP + FP + TN + FN),
        _safe_ratio(TP, TP + FP),
        _safe_ratio(TP, TP + FN),
        _safe_ratio(2 * TP, 2 * TP + FP + FN),
    ]
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[row])
    
def calc_summary(preddf, labeldf):
    """Score ``preddf`` against ``labeldf`` and return the one-row acc/prec/recall/f1 frame."""
    confusion_counts = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(*confusion_counts)

In [39]:
# Whitespace-tokenise every grant claim: one list of tokens per grant.
grants_target_claims = [claim_text.split() for claim_text in grants_target_df['claim']]

In [40]:
# Train a Word2Vec model on the tokenised grant claims.
# NOTE(review): `size` and `iter` are the gensim < 4.0 keyword names; gensim 4 renamed
# them to `vector_size` and `epochs` — confirm the installed version before upgrading.
# NOTE(review): no `seed` is passed and several workers are used, so the embeddings
# (and every score derived from them) are presumably not reproducible run to run.
w2v = Word2Vec(grants_target_claims, size=100, window=5, min_count=5, workers=10, iter=10, hs=1)

In [30]:
# Sanity check of the embedding: nearest neighbours of "machine".
# Query through `w2v.wv`; calling `most_similar` on the model itself is deprecated in gensim.
w2v.wv.most_similar('machine')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('machine.', 0.5443912148475647),
 ('machines', 0.5442870855331421),
 ('microcontroller', 0.46353399753570557),
 ('engine', 0.462362676858902),
 ('motor', 0.44722825288772583),
 ('machine,', 0.4360313415527344),
 ('resource', 0.43575650453567505),
 ('broadcaster', 0.4032304883003235),
 ('sub-system', 0.3947978913784027),
 ('medium', 0.39188137650489807)]

In [45]:
# Grant ids in grants_target_df row order; position i here corresponds to row i of grants_w2v_vectors.
# (The original chained assignment also created an unused global named `columns`.)
grants_ids = grants_target_df.parsed.values

In [56]:
# Sanity check: the element-wise mean of two word vectors.
# Index through `w2v.wv`; indexing the model object directly is deprecated in gensim.
np.average(w2v.wv[['machine', 'water']], axis=0)

  """Entry point for launching an IPython kernel.


array([ 1.3742788 , -0.45130956, -0.5538383 ,  0.31462574, -0.6941546 ,
        0.7091661 , -0.11016345,  0.13990042, -0.51719546, -0.9281438 ,
       -0.605462  , -0.00856757,  2.095352  ,  1.3660074 ,  0.4991513 ,
       -0.47060782, -1.3841174 ,  0.23318565,  0.16070549,  0.09736204,
        1.4471285 ,  0.9738065 , -0.176575  , -0.45280743,  1.4742624 ,
        0.46031287,  0.556444  ,  0.08926451, -1.7839073 , -1.8831397 ,
       -0.4073463 ,  1.2866449 ,  0.34124067,  2.2468126 , -0.31271386,
       -0.27899843, -0.68909115,  0.438477  ,  1.1818535 , -0.1537615 ,
        1.3633217 ,  0.11307028, -0.06431782, -0.5783814 ,  2.303399  ,
       -0.2484436 , -0.500126  , -0.2754866 , -0.7855971 , -0.10270226,
        1.9819223 , -0.19825703, -0.47991472, -2.4269872 ,  1.2823287 ,
       -1.5794102 ,  1.1003997 ,  0.34543473, -0.90850174,  1.8117652 ,
        1.12618   ,  0.5546973 ,  1.0521578 , -0.46503502,  0.7912778 ,
       -1.6723936 , -0.8172772 , -0.44266707,  0.19513679,  0.17

In [62]:
def text_to_vec(text, keyed_vectors=None):
    """Embed a text as the mean of the word vectors of its in-vocabulary tokens.

    Parameters
    ----------
    text : str
        Text to embed; it is split on whitespace.
    keyed_vectors : optional
        Word-vector lookup supporting ``word in kv``, ``kv[list_of_words]``
        and ``kv.vector_size``. Defaults to ``w2v.wv``.

    Returns
    -------
    numpy.ndarray
        1-D mean vector. When no token of ``text`` is in the vocabulary a
        zero vector of length ``vector_size`` is returned instead of failing.
    """
    # Query the keyed vectors rather than the model object: indexing the model
    # directly (`w2v[...]`) is deprecated in gensim.
    kv = w2v.wv if keyed_vectors is None else keyed_vectors
    known_words = [word for word in text.split() if word in kv]
    if not known_words:
        # Nothing to average; a zero vector marks "no information".
        return np.zeros(kv.vector_size, dtype=np.float32)
    return np.average(kv[known_words], axis=0)

In [63]:

# One embedding per grant, in grants_target_df row order.
# Iterating the column directly avoids using a positional counter as a `.loc`
# label and no longer shadows the builtin `id`.
grants_w2v_vectors = np.array([text_to_vec(claim) for claim in grants_target_df['claim']])

  import sys


In [110]:
# Scratch check of the top-N idiom: argsort ascending, reverse, keep the first two
# -> positions of the two largest values.
np.argsort([6,1,2,4,5])[::-1][0:2]

array([0, 4])

In [121]:
def find_most_similar(vec, topN=None, vectors=None):
    """Find the stored vector(s) most cosine-similar to ``vec``.

    Parameters
    ----------
    vec : array-like
        1-D query vector.
    topN : int, optional
        When None (default) the position of the single best match is
        returned. Otherwise the positions of the ``topN`` best matches are
        returned, best first.
    vectors : array-like, optional
        2-D array with one candidate vector per row. Defaults to
        ``grants_w2v_vectors``.

    Returns
    -------
    numpy integer or numpy.ndarray
        Row position(s) into ``vectors``.
    """
    if vectors is None:
        vectors = grants_w2v_vectors
    vectors = np.asarray(vectors, dtype=float)
    vec = np.asarray(vec, dtype=float)
    # Cosine similarity of the query against every row at once; this replaces
    # the per-row loop, which relied on a `spatial` name never imported here.
    with np.errstate(divide="ignore", invalid="ignore"):
        norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(vec)
        similarities = vectors.dot(vec) / norms
    # A zero-length vector yields NaN; rank such rows last instead of first.
    similarities = np.where(np.isnan(similarities), -np.inf, similarities)
    if topN is None:
        return np.argmax(similarities)
    return np.argsort(similarities)[::-1][0:topN]

In [116]:
# Sanity check: query with the stored vector at row 2.
# NOTE(review): the recorded output is an index array, so it presumably came from an
# earlier argsort-based version of find_most_similar (execution counts are out of order).
find_most_similar(grants_w2v_vectors[2,:])

array([   2,  534,  644, 1679, 1620,  998, 1637, 2172, 1810, 1834, 1002,
       1356, 1560,  944, 1200, 2484, 1506, 2432, 1106, 2159,  894, 1945,
       1933, 1257, 2522,  426, 1340,  529,  635,  268,  259, 2074, 1900,
       2178, 1619, 2114, 1795,  578,  889, 1890, 1565, 1183, 1499, 1330,
        287, 2174, 1432,  527, 2402, 1182, 2479, 2152, 1215,  436, 1715,
       1129,  910, 2180,  381, 2208, 2093, 1454,  687, 1246, 1754, 1757,
       1401, 1467,   64,  930,  590,  566,  982, 2378, 1921,  921, 1800,
         44, 2233,  890, 2417, 2328, 1626,  884, 1305,  255, 1612,  267,
       1629,   77,  486, 1765,  740,  829, 1535, 2507,  298, 2264,  326,
       1536, 1132,  192,  468,  528,  923, 2460,  262, 2323, 1742, 2278,
       1808,  518, 2424, 1980,  981,   90,  226,  610,  171,   41, 2194,
       1472,  218, 1137,  525, 2189, 1480, 1845, 1541,  958,  530,  588,
        263, 1605,  537, 2474, 2387, 1865,  934,  313,  575,  412, 1603,
        814, 1515, 1281, 1424, 1447, 2405, 1268,  6

In [117]:
def predict_test_set():
    """Predict cited grants for every test application via embedding similarity.

    Each application's claim text is embedded with ``text_to_vec`` and the
    grant position(s) returned by ``find_most_similar`` are marked True.

    Returns
    -------
    pandas.DataFrame
        One row per ``testset_app_df.app_id`` and one column per
        ``grants_target_df.parsed`` value.

    Notes
    -----
    This definition replaces the earlier ``predict_test_set(predict_func)``
    of the same name once this cell has been executed.
    """
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=bool)
    # Iterate the columns positionally instead of using `idx` as a `.loc` label.
    pairs = zip(testset_app_df.app_id, testset_app_df["xml"])
    for idx, (appid, xml) in enumerate(pairs):
        vec = text_to_vec(whole_xml_to_claim(xml))
        # find_most_similar may return one index or an array of indexes;
        # normalise to 1-D so both forms are handled.
        similar_grant_indexes = np.atleast_1d(find_most_similar(vec))
        # Write the whole boolean row at once rather than cell by cell.
        predictdf.loc[appid] = predictdf.columns.isin(grants_ids[similar_grant_indexes])
        if idx%100 == 0:
            print(idx)
    return predictdf

In [120]:
# Run the similarity-based prediction over the whole test set (progress is printed every 100 rows).
# The recorded run of this cell was interrupted (KeyboardInterrupt) after index 200.
pred_df = predict_test_set()

  import sys


0
100
200


KeyboardInterrupt: 

In [101]:
# Display the prediction matrix (rows: application ids, columns: grant ids).
pred_df

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13137006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12741959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12643447,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14200253,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14745354,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12592473,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13518392,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12742690,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12612289,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [102]:
# top1
# NOTE(review): presumably scores a pred_df built with only the single best match per
# application; pred_df must be regenerated with that setting before re-running this cell.
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.999221,0.143,0.114309,0.127055


In [114]:
# top5
# NOTE(review): same call as the top1 cell; the different result presumably comes from a
# pred_df rebuilt with the five best matches per application — confirm before re-running.
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.997756,0.0588,0.235012,0.094065


## Example

In [103]:
# Display the test applications (columns: app_id, xml).
testset_app_df

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."
5,14745354,"<us-patent-application lang=""EN"" dtd-version=""..."
6,12592473,"<us-patent-application lang=""EN"" dtd-version=""..."
7,13518392,"<us-patent-application lang=""EN"" dtd-version=""..."
8,12742690,"<us-patent-application lang=""EN"" dtd-version=""..."
9,12612289,"<us-patent-application lang=""EN"" dtd-version=""..."


In [106]:
# Worked example: print the claims of test application 0 next to the claims of its nearest grant.
idx = 0
appid = 14307191
text = whole_xml_to_claim(testset_app_df.loc[idx, 'xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index, 'claim'])


 
  1 . A method to aggregate, filter, and share energy data for analysis, the method comprising:
 receiving first data associated with a first electrical circuit, the first data having a first protocol; 
 sampling the first data at a first sampling rate to generate first digital data, wherein the first sampling rate is substantially continuous; and 
 transmitting reporting digital data over a network having a network protocol different from the first protocol, the reporting digital data comprising at least the first digital data, wherein the reporting digital data is transmitted at a reporting rate that is decoupled from the first sampling rate. 
 
 
 
  2 . The method of  claim 1  further comprising:
 receiving second data associated with a second electrical circuit, the second data having a second protocol different from the first protocol; and 
 sampling the second data at a second sampling rate to generate second digital data, wherein the second sampling rate is substantially con

  import sys


In [122]:
# Worked example: print the claims of test application 1 next to the claims of its nearest grant.
idx = 1
text = whole_xml_to_claim(testset_app_df.loc[idx, 'xml'])
vec = text_to_vec(text)
similar_grant_index = find_most_similar(vec)
similar_grant_id = grants_ids[similar_grant_index]
print(text)
print(grants_target_df.loc[similar_grant_index, 'claim'])


 
  1 . A display apparatus, comprising:
 a position sensor to sense an eye position of a user; 
 a controller to set a virtual viewing window corresponding to the sensed eye position of the user and to provide a control signal to generate a directional light toward the virtual viewing window; and 
 a light generator to generate a directional light based on the control signal. 
 
 
 
  2 . The display apparatus of  claim 1 , further comprising:
 a light modulator to modulate an intensity of the directional light based on the control signal. 
 
 
 
  3 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to photograph the eye position of the user. 
 
 
  4 . The display apparatus of  claim 1 , wherein the position sensor comprises at least one camera to identify the eye position of the user by photographing an identifier fixed around the eye position of the user. 
 
 
  5 . The display apparatus of  claim 1 , wherein the light generator compri

  import sys


In [43]:
# Scratch: empty prediction frame with one column per target grant.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=bool)

In [65]:
# Scratch: assign False to the row labelled with application id 14307191.
predictdf.loc[14307191] = False

In [67]:
# Scratch: mark a single (application id, grant id) cell as a predicted citation.
predictdf.loc[14307191, 6837383] = True