# Baseline model

Just predict true if tf-idf cosin distance is closer than threshold.

This is intended for simplest end-to-end solution.

In [1]:
import pandas as pd

In [2]:
grants_all_df = pd.read_pickle("../data/grants_2012_from2017_xmldf.dat")

In [178]:
app_all_df = pd.read_pickle("../data/app_2017_by2012_xmldf.dat")

### Split training set and test set

In [179]:
# NYI

### Retrieve just claim. Remove all tags.

In [8]:
import re

In [41]:
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)

In [47]:
TAG_PAT = re.compile(r"<.*?>")

In [51]:
def whole_xml_to_claim(whole):
    mat = CLAIM_PAT.search(whole)
    claim_with_tag = mat.group(1)
    return TAG_PAT.sub(' ', claim_with_tag)

In [53]:
grants_all_df.head()["xml"].map(whole_xml_to_claim)

0    \n \n 1. A pacifier clip, comprising:\n a base...
1    \n \n 1. A supporting clasp which supports a s...
2    \n \n 1. A clip of molded plastics material fo...
3    \n \n 1. A tire inflation system comprising:\n...
4    \n \n 1. A cooling system for a heat-generatin...
Name: xml, dtype: object

In [54]:
grants_all_df["claim"] = grants_all_df["xml"].map(whole_xml_to_claim)

In [55]:
grants_all_df.head()

Unnamed: 0,parsed,xml,claim
0,8245358,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...","\n \n 1. A pacifier clip, comprising:\n a base..."
1,8245460,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A supporting clasp which supports a s...
2,8245733,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A clip of molded plastics material fo...
3,8245746,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A tire inflation system comprising:\n...
4,8245764,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A cooling system for a heat-generatin...


# Convert to feature vectors and retrieve vocabulary

Doing similar things to scikit learn example  
http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

Also, this document is helpful.  
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.5)

In [58]:
features = vectorizer.fit_transform(grants_all_df["claim"])

In [59]:
features.shape

(5424, 28988)

In [61]:
type(features)

scipy.sparse.csr.csr_matrix

In [63]:
vocab = vectorizer.vocabulary_

In [64]:
idfvec = vectorizer.idf_

In [65]:
len(vocab.keys())

28988

In [66]:
list(vocab.items())[0:5]

[('90', 910),
 ('interdigitale', 13898),
 ('epg', 9696),
 ('shaped', 23417),
 ('chondrus', 5144)]

In [67]:
len(idfvec)

28988

In [68]:
idfvec[0:5]

array([ 7.51933164,  5.26803984,  8.905626  ,  8.905626  ,  8.905626  ])

### Save features, vocabulary, idf vector

In [69]:
import pickle

In [70]:
with open("../data/grants2012_tfidf_features.dat", "wb") as f:
    pickle.dump(features, f)

In [72]:
with open("../data/grants2012_vocab_idf_dict.dat", "wb") as f:
    pickle.dump({"vocabulary": vocab, "idf": idfvec}, f)

### Load code

In [None]:
with open("../data/grants2012_tfidf_features.dat", 'rb') as f:
    features = pickle.load(f)

In [None]:
with open("../data/grants2012_vocab_idf_dict.dat", 'rb') as f:
    dic = pickle.load(f)
    vocab, idfvec = dic["vocabulary"], dic["idf"]

### Calculate tf-idf manually using vocabulary and idf vector, and check whether it's coinside.

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
one_claim = grants_all_df.iloc[0]["claim"]

In [113]:
count_vec = CountVectorizer(vocabulary=vocab, stop_words="english", max_df = 0.5)

In [114]:
res = count_vec.fit_transform([one_claim])

In [115]:
res_arr = res.toarray()

In [116]:
res.shape

(1, 28988)

In [117]:
tf = res_arr[0]

In [91]:
answer = features[0, :].toarray()

In [97]:
answer = answer[0]

In [103]:
def print_nonzero_index(arr, maxcount):
    count = 0

    for i, v in enumerate(arr):
        if v != 0:
            count+=1
            print(i)
            if count > maxcount:
                break

In [104]:
print_nonzero_index(answer, 5)

1025
1072
1073
1117
1120
1373


In [102]:
answer[1025]

0.015374346416530774

In [105]:
print_nonzero_index(tf, 5)

1025
1072
1073
1117
1120
1373


In [121]:
sumtf = sum(tf)

In [123]:
unnormalized = [tf[i]*idfvec[i]/sumtf for i, _ in enumerate(tf)]

In [128]:
import numpy as np

In [129]:
unnormalized[1025]/np.linalg.norm(unnormalized)

0.015374346416530772

Try two claim for generarization

In [131]:
tfcsr = count_vec.fit_transform(grants_all_df.iloc[0:2]["claim"])

In [132]:
tf = tfcsr.toarray()

In [133]:
tf.shape

(2, 28988)

In [141]:
unnormalized = np.multiply(tf, idfvec)

In [145]:
lpnorms = np.linalg.norm(unnormalized, axis=1)

In [147]:
manual_tfidf = unnormalized/lpnorms[:, np.newaxis]

In [148]:
manual_tfidf[0, 1025]

0.015374346416530776

In [167]:
all(abs(manual_tfidf[0, :] - features[0].toarray()[0]) < 0.00001)

True

In [166]:
all(abs(manual_tfidf[1] - features[1].toarray()[0]) < 0.00001)

True

Now make calculate tf-idf function

In [171]:
def claims_to_tfidfs(claimarr, count_vec, idfvec):
    tfcsr = count_vec.fit_transform(claimarr)
    tf = tfcsr.toarray()
    unnormalized = np.multiply(tf, idfvec)
    lpnorms = np.linalg.norm(unnormalized, axis=1)
    return unnormalized/lpnorms[:, np.newaxis]

In [172]:
manu3 = claims_to_tfidfs(grants_all_df.iloc[0:2]["claim"], count_vec, idfvec)

In [177]:
all(manu3[0] == manual_tfidf[0]), all(manu3[1] == manual_tfidf[1])

(True, True)