In [1]:
import pandas as pd
import numpy as np

In [24]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
import re

CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
TAG_PAT = re.compile(r"<.*?>")

def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)

def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

In [4]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_info_target[citations_info_target.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)
    
def create_label_df():
    label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

label_df = create_label_df()
label_df.head()

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13137006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12741959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12643447,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14200253,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.5)
grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim)
grants_features = vectorizer.fit_transform(grants_target_df["claim"])

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

def claims_to_tfidfs(claimarr, count_vec, idfvec):
    tfcsr = count_vec.fit_transform(claimarr)
    tf = tfcsr.toarray()
    unnormalized = np.multiply(tf, idfvec)
    lpnorms = np.linalg.norm(unnormalized, axis=1)
    return unnormalized/lpnorms[:, np.newaxis]

vocab = vectorizer.vocabulary_
idfvec = vectorizer.idf_
count_vec = CountVectorizer(vocabulary=vocab, stop_words="english", max_df = 0.5)

training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim)
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

In [14]:
grants_features.shape

(2524, 20124)

In [15]:
training_features.shape

(1000, 20124)

In [18]:
del label_df

In [19]:
def create_training_label_df():
    training_label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for appid in training_app_df.app_id:
        set_one_answer_appid(training_label_df, appid)
    return training_label_df

training_label_df = create_training_label_df()
training_label_df.head()

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14222691,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12515852,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12033424,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12402344,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12155425,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
citations_info_target.head()

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
0,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7391316,7391316,H20LX5QGPXXIFW4,103.0,a,1,0,1,...,1,0,1,0,0,0,0,1,2,0
1,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,102.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
2,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,103.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
3,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7774833,7774833,H20LX5QGPXXIFW4,103.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
4,12282000,/work/data/apps/2009/ipa090312/F_1385.xml,7411209,7411209,G9LENRJ8PPOPPY5,102.0,a,0,1,1,...,1,0,0,0,0,1,0,1,1,3


In [26]:
del citations_info_target
del training_app_df
del testset_app_df
del grants_target_df

del training_features
del grants_features

In [25]:
app_df = pd.DataFrame(training_features, index=training_app_df['app_id'])
grant_df = pd.DataFrame(grants_features.toarray(), index=grants_target_df['parsed'])

In [29]:
training_dataset = pd.DataFrame(columns=[])

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.loc[training_label_df.loc[idx].idxmax()]], ignore_index=True)
    training_dataset = training_dataset.append(vec, ignore_index=True)
    print(idx)

14222691
12515852
12033424
12402344
12155425
12372374
12434628
13702090
13127020
12077273
12136649
12069047
13656744
14611833
14420299
12650581
12404893
12798692
12505441
12062945
14910352
14635626
13186290
12285822
14447850
12305599
12395363
12203472
13276111
12278002
13436278
14579449
13705491
13261192
13183439
12871059
14325988
12086364
15048742
12316080
12026183
13592298
12631542
14505528
13824932
12731647
12240161
12340208
14777567
12718716
12336625
13032451
12633590
12121588
13657422
12490702
12621088
13450337
14332005
12175647
14059123
13307248
12178180
12774591
13284472
12679333
13422125
12679195
14431782
12181816
13871572
13674450
14757537
13633066
14466011
14532423
12200740
14488080
12743831
14215243
14205387
13396910
13298971
12547555
12811628
14059667
14193563
12528553
12457278
14621253
13821689
13212097
12835576
13652955
12479043
12689959
13275011
12680576
13540857
14266676
12316969
12349366
13924731
13028528
12210162
13349664
12169007
13050089
15005185
12390347
13505708
1

12205801
14554865
13536495
14247301
12447342
12242803
13175085
12138509
13006690
12400414
13944944
13937584
13763474
13574962
12421333
13286163
12857354
12616159
13670556
13748810
13316872
14596705
12180278
14450586
12182763
14119985
12121929
12854357
12492280
13408058
12784373
13248164
14148546
12700121
13371480
14705886
12313941
14920867
12343371
12853131
13636329
14348779
12424856
14396456
12500377
14661024
14797126
13900930
13328491
13371346
12328519
14080189
13073239
12153224
15177512
13461796
12667007
14792638
13225843
14803138
13546425
13153438
13177764
14262880
12456063
13947539
12521956
12369965
12883087
12268319
13878790
13444021
13167480
13938089
12805744
12829016
12849181
12595285
13173790
12544432
12060310
13431608
13305215
12893131
12572144
12271387
13396961
13889563
13496881


<bound method NDFrame.head of      0      1      2      3      4      5      6      7      8      9      \
0      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
5      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
6      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
7      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
8      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
9      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
10     0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
11     0.0    0.0    0.0    0.0    0.0    0.0 

In [30]:
training_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40238,40239,40240,40241,40242,40243,40244,40245,40246,40247
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
import pickle

with open("../data/training_dataset_labels_equal_1.dat", "wb") as f:
    pickle.dump(training_dataset , f)

In [32]:
del training_dataset

In [33]:
import random
random.seed(20181002)

random.random()

0.31472419896424686

In [34]:
grant_df.iloc[0]

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
5        0.0
6        0.0
7        0.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
13       0.0
14       0.0
15       0.0
16       0.0
17       0.0
18       0.0
19       0.0
20       0.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
        ... 
20094    0.0
20095    0.0
20096    0.0
20097    0.0
20098    0.0
20099    0.0
20100    0.0
20101    0.0
20102    0.0
20103    0.0
20104    0.0
20105    0.0
20106    0.0
20107    0.0
20108    0.0
20109    0.0
20110    0.0
20111    0.0
20112    0.0
20113    0.0
20114    0.0
20115    0.0
20116    0.0
20117    0.0
20118    0.0
20119    0.0
20120    0.0
20121    0.0
20122    0.0
20123    0.0
Name: 6837383, Length: 20124, dtype: float64

In [37]:
training_dataset_0 = pd.DataFrame(columns=[])

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.iloc[round(len(grant_df) * random.random())]], ignore_index=True)
    training_dataset_0 = training_dataset_0.append(vec, ignore_index=True)

In [38]:
training_dataset_0.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40238,40239,40240,40241,40242,40243,40244,40245,40246,40247
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
with open("../data/training_dataset_labels_equal_0_A.dat", "wb") as f:
    pickle.dump(training_dataset_0 , f)

In [44]:
del training_dataset_0

In [46]:
training_dataset_0 = pd.DataFrame(columns=[], dtype='float32')

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.iloc[round(len(grant_df) * random.random())]], ignore_index=True)
    training_dataset_0 = training_dataset_0.append(vec, ignore_index=True)

In [47]:
with open("../data/training_dataset_labels_equal_0_B.dat", "wb") as f:
    pickle.dump(training_dataset_0 , f)

In [48]:
del training_dataset_0

In [49]:
training_dataset_0 = pd.DataFrame(columns=[], dtype='float32')

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.iloc[round(len(grant_df) * random.random())]], ignore_index=True)
    training_dataset_0 = training_dataset_0.append(vec, ignore_index=True)

In [50]:
with open("../data/training_dataset_labels_equal_0_C.dat", "wb") as f:
    pickle.dump(training_dataset_0 , f)

In [51]:
del training_dataset_0

In [53]:
training_dataset_0 = pd.DataFrame(columns=[], dtype='float32')

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.iloc[round(len(grant_df) * random.random())]], ignore_index=True)
    training_dataset_0 = training_dataset_0.append(vec, ignore_index=True)

In [54]:
with open("../data/training_dataset_labels_equal_0_D.dat", "wb") as f:
    pickle.dump(training_dataset_0 , f)

In [58]:
del training_dataset_0

In [59]:
training_dataset_0 = pd.DataFrame(columns=[], dtype='float32')

for idx, row in app_df.iterrows():
    vec = pd.concat([row, grant_df.iloc[round(len(grant_df) * random.random())]], ignore_index=True)
    training_dataset_0 = training_dataset_0.append(vec, ignore_index=True)

In [60]:
with open("../data/training_dataset_labels_equal_0_E.dat", "wb") as f:
    pickle.dump(training_dataset_0 , f)