# Baseline model

Just predict true if tf-idf cosin distance is closer than threshold.

This is intended for simplest end-to-end solution.

# Training-Test set setup

Dataset setup is common for all model.
Currently just put in ipynb. Please dup! (maybe factor out to .py file in the future).

In [1]:
import pandas as pd

In [2]:
grants_all_df = pd.read_pickle("../data/grants_2012_from2017_xmldf.dat")

In [3]:
app_all_df = pd.read_pickle("../data/app_2017_by2012_xmldf.dat")

### Split training-test set setup

If app_id is uniq, we can just use dataframe.sample.
But somemodel might want to use multiple xml for the same app_id.

So keep all app_id in split phase.
Also, app_id order would be the same order as apply (maybe not, please confirm somebody!).
So keep order when split, Then shuffle so that every one reproduce split even though they change mind to use multiple xml of each app_id.

In [4]:
all_appid = set(app_all_df['app_id'])

In [5]:
import random

In [17]:
random.seed(1234)

In [18]:
training_id = set(random.sample(all_appid, int(len(all_appid)*0.9)))

In [19]:
testset_id = all_appid - training_id

In [20]:
len(training_id), len(testset_id)

(2769, 308)

In [21]:
training_app_df = app_all_df[app_all_df.app_id.isin(training_id)]

In [22]:
testset_app_df = app_all_df[~app_all_df.app_id.isin(training_id)]

In [24]:
app_all_df.shape, training_app_df.shape, testset_app_df.shape

((3083, 2), (2775, 2), (308, 2))

In [26]:
training_app_df.head().app_id

0    14742496
1    14348426
2    14613336
3    14053984
4    14590141
Name: app_id, dtype: int64

In [32]:
testset_app_df.iloc[1]

app_id                                             15289343
xml       <?xml version="1.0" encoding="UTF-8"?>\n<!DOCT...
Name: 15, dtype: object

In [35]:
def filter_uniq_appid(df):
    ids = set()
    filtermask = []
    for i in range(len(df)):
        app_id = df.iloc[i].app_id
        filtermask.append(app_id not in ids)
        ids.add(app_id)
    return filtermask


### Keep only first app_id

You can use multiple application xml if you want (in this case, skip filter_uniq_appid for training set).
I keep only first app_id in dataframe. I guess it in order of date, but may be not. Please confirm somebody!

In [42]:
training_app_df = training_app_df[filter_uniq_appid(training_app_df)]
testset_app_df = testset_app_df[filter_uniq_appid(testset_app_df)]

In [43]:
training_app_df.shape, testset_app_df.shape

((2769, 2), (308, 2))

### Shuffle

In [44]:
# set seed again for easier interactive shift-enter
random.seed(456)

In [45]:
training_app_df = training_app_df.sample(frac=1).reset_index(drop=True)
testset_app_df = testset_app_df.sample(frac=1).reset_index(drop=True)

### Reset index (may be you don't want, then skip here)

In [127]:
training_app_df = training_app_df.reset_index(drop=True)
testset_app_df = testset_app_df.reset_index(drop=True)

### Retrieve just claim. Remove all tags.

This utility function might necessary for any mode.

In [46]:
import re

In [47]:
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)

In [48]:
TAG_PAT = re.compile(r"<.*?>")

In [264]:
def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)

In [265]:
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

# Model evaluation

In [114]:
# this is created in data_collection.ipynb
citations_2012_2017 = pd.read_pickle("../data/citations_2012_2017_merged.dat")

In [256]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_2012_2017[citations_2012_2017.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)

In [258]:
def create_label_df():
    label_df = pd.DataFrame(columns=grants_all_df.parsed.sort_values().values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

In [259]:
label_df = create_label_df()

In [261]:
label_df.shape

(308, 5424)

In [263]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,15238559,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
1,15069179,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
2,14813856,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
3,14261651,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
4,15187748,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."


In [299]:
def predict_training_set(predict_func):
    """
    predict_func(claims) return NxM of boolean. N is len(claims). M is rownum of grants_all_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    predictdf = pd.DataFrame(columns=grants_all_df.parsed.sort_values().values, dtype=np.bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    """
        one_res = predict_func(whole_xml_to_claim(testset_app_df[testset_app_df.app_id == appid]["xml"].iloc[0]))
        predictdf.loc[appid] = one_res
    """
    return predictdf

In [343]:
def calc_TPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_FPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_TNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_FNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_TFPNs(preddf, labeldf):
    return calc_TPs(preddf, labeldf), calc_FPs(preddf, labeldf), calc_TNs(preddf, labeldf), calc_FNs(preddf, labeldf)

In [352]:
def calc_summary_TFPNs(TP, FP, TN, FN):
    "return acc, prec, recall, f1."
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[[(TP+TN)/(TP+FP+TN+FN), TP/(TP+FP), TP/(TP+FN), 2*TP/(2*TP+FP+FN)]])
    
def calc_summary(preddf, labeldf):
    TP, FP, TN, FN = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(TP, FP, TN, FN)

### Sample evaluation code for baseline model

predict_tfidf_model is defined below

In [300]:
pred_df = predict_training_set(predict_tfidf_model)

In [353]:
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.906136,0.000653,0.126866,0.001299


# Start baseline model dependent code from here

Now common part is done.
Start model specific cells.

In [50]:
grants_all_df.head()["xml"].map(whole_xml_to_claim)

0    \n \n 1. A pacifier clip, comprising:\n a base...
1    \n \n 1. A supporting clasp which supports a s...
2    \n \n 1. A clip of molded plastics material fo...
3    \n \n 1. A tire inflation system comprising:\n...
4    \n \n 1. A cooling system for a heat-generatin...
Name: xml, dtype: object

In [51]:
grants_all_df["claim"] = grants_all_df["xml"].map(whole_xml_to_claim)

In [52]:
grants_all_df.head()

Unnamed: 0,parsed,xml,claim
0,8245358,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...","\n \n 1. A pacifier clip, comprising:\n a base..."
1,8245460,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A supporting clasp which supports a s...
2,8245733,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A clip of molded plastics material fo...
3,8245746,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A tire inflation system comprising:\n...
4,8245764,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A cooling system for a heat-generatin...


# Convert to feature vectors and retrieve vocabulary

Doing similar things to scikit learn example  
http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

Also, this document is helpful.  
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
random.seed(1234)

In [70]:
vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.5)

In [71]:
grants_features = vectorizer.fit_transform(grants_all_df["claim"])

In [72]:
grants_features.shape

(5424, 28988)

In [73]:
vocab = vectorizer.vocabulary_

In [74]:
idfvec = vectorizer.idf_

In [75]:
len(vocab.keys())

28988

In [76]:
list(vocab.items())[0:5]

[('subsection', 24920),
 ('grommets', 11928),
 ('demagnetization', 7459),
 ('geometrically', 11566),
 ('syndromes', 25376)]

In [77]:
len(idfvec)

28988

In [78]:
idfvec[0:5]

array([ 7.51933164,  5.26803984,  8.905626  ,  8.905626  ,  8.905626  ])

### Save features, vocabulary, idf vector

In [79]:
import pickle

In [80]:
with open("../data/grants2012_tfidf_features.dat", "wb") as f:
    pickle.dump(grants_features, f)

In [81]:
with open("../data/grants2012_vocab_idf_dict.dat", "wb") as f:
    pickle.dump({"vocabulary": vocab, "idf": idfvec}, f)

### Load code

In [None]:
with open("../data/grants2012_tfidf_features.dat", 'rb') as f:
    grants_features = pickle.load(f)

In [None]:
with open("../data/grants2012_vocab_idf_dict.dat", 'rb') as f:
    dic = pickle.load(f)
    vocab, idfvec = dic["vocabulary"], dic["idf"]

### Calculate tf-idf manually using vocabulary and idf vector, and check whether it's coinside.

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

In [83]:
one_claim = grants_all_df.iloc[0]["claim"]

In [84]:
count_vec = CountVectorizer(vocabulary=vocab, stop_words="english", max_df = 0.5)

In [85]:
res = count_vec.fit_transform([one_claim])

In [86]:
res_arr = res.toarray()

In [87]:
res.shape

(1, 28988)

In [88]:
tf = res_arr[0]

In [89]:
answer = grants_features[0, :].toarray()

In [90]:
answer = answer[0]

In [91]:
def print_nonzero_index(arr, maxcount):
    count = 0

    for i, v in enumerate(arr):
        if v != 0:
            count+=1
            print(i)
            if count > maxcount:
                break

In [92]:
print_nonzero_index(answer, 5)

1025
1072
1073
1117
1120
1373


In [93]:
answer[1025]

0.015374346416530774

In [94]:
print_nonzero_index(tf, 5)

1025
1072
1073
1117
1120
1373


In [95]:
sumtf = sum(tf)

In [96]:
unnormalized = [tf[i]*idfvec[i]/sumtf for i, _ in enumerate(tf)]

In [97]:
import numpy as np

In [98]:
unnormalized[1025]/np.linalg.norm(unnormalized)

0.015374346416530772

Try two claim for generarization

In [99]:
tfcsr = count_vec.fit_transform(grants_all_df.iloc[0:2]["claim"])

In [100]:
tf = tfcsr.toarray()

In [101]:
tf.shape

(2, 28988)

In [102]:
unnormalized = np.multiply(tf, idfvec)

In [103]:
lpnorms = np.linalg.norm(unnormalized, axis=1)

In [104]:
manual_tfidf = unnormalized/lpnorms[:, np.newaxis]

In [105]:
manual_tfidf[0, 1025]

0.015374346416530776

In [106]:
all(abs(manual_tfidf[0, :] - grants_features[0].toarray()[0]) < 0.00001)

True

In [107]:
all(abs(manual_tfidf[1] - grants_features[1].toarray()[0]) < 0.00001)

True

Now make calculate tf-idf function

In [108]:
def claims_to_tfidfs(claimarr, count_vec, idfvec):
    tfcsr = count_vec.fit_transform(claimarr)
    tf = tfcsr.toarray()
    unnormalized = np.multiply(tf, idfvec)
    lpnorms = np.linalg.norm(unnormalized, axis=1)
    return unnormalized/lpnorms[:, np.newaxis]

In [109]:
manu3 = claims_to_tfidfs(grants_all_df.iloc[0:2]["claim"], count_vec, idfvec)

In [110]:
all(manu3[0] == manual_tfidf[0]), all(manu3[1] == manual_tfidf[1])

(True, True)

### It's time to calculate tfidf for training set.

In [112]:
training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim)

In [113]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

Calculate one cosine distance

In [116]:
one_appid = training_app_df.iloc[0].app_id

In [118]:
citations_2012_2017[citations_2012_2017.app_id == one_appid]

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
3934,15239553,8139098,8139098,,,,0,1,0


In [125]:
answer_patids = set(citations_2012_2017[citations_2012_2017.app_id == one_appid].parsed.astype(int))

In [126]:
answer_patids

{8139098}

In [128]:
type(grants_all_df.iloc[0].parsed)

numpy.int64

In [137]:
answer_idxs = grants_all_df[grants_all_df.parsed.isin(answer_patids)].index

In [142]:
answer_idxs[0]

3721

In [145]:
grants_all_df.iloc[3721].parsed

8139098

In [138]:
answer_patent_features = grants_features[answer_idxs[0], :].toarray()[0]

In [155]:
import scipy

In [163]:
scipy.spatial.distance.cdist(training_features[0, :][np.newaxis, :], grants_features[answer_idxs[0], :].toarray(), 'cosine')

array([[ 0.70069898]])

### Calculate 20 cosine distance

In [165]:
training_app_df.head()

Unnamed: 0,app_id,xml,claim
0,15239553,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A method for monitoring an entrance...
1,14794901,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...","\n \n 1 . A method comprising:\n determining,..."
2,14789694,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . An apparatus for performing autofoc...
3,14575586,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 - 30 . (canceled) \n \n \n 31 . A me...
4,14792908,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A lead frame comprising:\n a die pa...


In [168]:
training_app_df[training_app_df.app_id == 14575586].index[0]

3

In [180]:
def calc_cosin_for_one_app(appid):
    answer_patids = set(citations_2012_2017[citations_2012_2017.app_id == appid].parsed.astype(int))
    answer_idxs = grants_all_df[grants_all_df.parsed.isin(answer_patids)].index
    answer_patent_features = grants_features[answer_idxs, :].toarray()
    training_features_idx = training_app_df[training_app_df.app_id == appid].index[0]
    return scipy.spatial.distance.cdist(training_features[training_features_idx, :][np.newaxis, :], answer_patent_features, 'cosine')[0]


In [181]:
calc_cosin_for_one_app(14575586)

array([ 0.93692449])

In [182]:
calc_cosin_for_one_app(15239553)

array([ 0.70069898])

In [184]:
[calc_cosin_for_one_app(appid) for appid in training_app_df[0:20].app_id]

[array([ 0.70069898]),
 array([ 0.52318658,  0.6161884 ]),
 array([ 0.76994287]),
 array([ 0.93692449]),
 array([ 0.6903398 ,  0.73322171,  0.51425338]),
 array([ 0.94981988,  0.96189893,  0.90963075]),
 array([ 0.84986862,  0.63468337,  0.66806683,  0.71296975,  0.60790852]),
 array([ 0.60105534]),
 array([ 0.65477236]),
 array([ 0.99285149,  0.64298946]),
 array([ 0.93462749]),
 array([ 0.81688252]),
 array([ 0.83651815,  0.91682111]),
 array([ 0.9270821]),
 array([ 0.96752639,  0.85042014,  0.9967196 ]),
 array([ 0.74109417]),
 array([ 0.79338583,  0.71243438,  0.68631328,  0.83968923,  0.66094055,
         0.93768214,  0.85308823]),
 array([ 0.90485808]),
 array([ 0.6755992]),
 array([ 0.63787746])]

In [189]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id)

array([ 0.94981988,  0.96189893,  0.90963075])

In [190]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id).mean() < 0.95

0.94044985415441162

### Compare with random pair cosdistance

In [188]:
scipy.spatial.distance.cdist(training_features[0:5, :], grants_features[0:5, :].toarray(), 'cosine')

array([[ 0.99938301,  0.9997547 ,  0.99980016,  0.99910508,  0.99695949],
       [ 0.99858834,  1.        ,  1.        ,  0.997192  ,  0.99853724],
       [ 0.8702664 ,  0.99197115,  0.98392742,  0.92579017,  0.99666809],
       [ 0.98215487,  0.98651508,  0.99884208,  0.99715734,  0.99742717],
       [ 0.99636054,  0.99422001,  0.99546208,  0.99457865,  0.99258553]])

In [None]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

In [197]:
grants_features_arr = grants_features.toarray()

In [205]:
TFIDF_MODEL_THRESHOLD=0.95

def predict_tfidf_model(claims):
    """
    return: NxM of boolean. N is len(claims). M is rownum of grants_all_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    features = claims_to_tfidfs(claims, count_vec, idfvec)
    dists = scipy.spatial.distance.cdist(features, grants_features_arr, 'cosine')
    return dists < TFIDF_MODEL_THRESHOLD


In [206]:
res = predict_tfidf_model(training_app_df[0:5]["claim"])

In [209]:
res.shape

(5, 5424)

In [208]:
res[0, 0:5]

array([False, False, False, False, False], dtype=bool)

In [202]:
tmp = claims_to_tfidfs(training_app_df[0:5]["claim"], count_vec, idfvec)

In [203]:
dists = scipy.spatial.distance.cdist(tmp, grants_features_arr, 'cosine')

In [204]:
dists.shape

(5, 5424)

In [200]:
res.shape

(5,)

ValueError: Scalar operands are not allowed, use '*' instead