In [3]:
!pip install sentence_transformers



In [4]:
import json
import re
from spacy import load as spacyload
from sentence_transformers import SentenceTransformer


In [5]:
# Loaded spaCy pipelines keyed by model name. Loading a model is slow, so it
# is done once per kernel session instead of once per tokenize() call.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacyload(model_name)
    return _SPACY_PIPELINES[model_name]


def tokenize(doc, lemmatized=False, remove_stopword=False,
             remove_punct=True, pos_tag=False, remove_num=False):
    """Tokenize ``doc`` with spaCy's ``en_core_web_sm`` pipeline.

    The text is lower-cased and stripped of surrounding whitespace before it
    is parsed. Empty and whitespace-only tokens are always dropped.

    Args:
        doc: Text to tokenize.
        lemmatized: If True, emit each token's lemma instead of its text.
        remove_stopword: If True, drop tokens flagged as stop words.
        remove_punct: If True, drop tokens flagged as punctuation.
        pos_tag: If True, emit ``(token, pos)`` tuples instead of strings.
        remove_num: If True, drop tokens whose text satisfies
            ``str.isnumeric``.

    Returns:
        A list of strings, or a list of ``(str, str)`` tuples when
        ``pos_tag`` is True.
    """
    parsed = _get_pipeline()(doc.lower().strip())

    tokens = []
    for token in parsed:
        if (
            (remove_stopword and token.is_stop)
            or (remove_punct and token.is_punct)
            or token.text == ''
            or token.text.isspace()
            or (remove_num and token.text.isnumeric())
        ):
            continue

        item = token.lemma_ if lemmatized else token.text
        if pos_tag:
            item = (item, token.pos_)
        tokens.append(item)
    return tokens


In [8]:
# Load the patent record. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("./dataset/US-11604547-B2/US-11604547-B2.json", encoding="utf-8") as patent_fp:
    patent_file = json.load(patent_fp)

In [9]:
# To keep the claim numbers as separate list items, split with a capturing
# group instead: re.split(r"([0-9]+\.)", patent_file['claims2'])

# Split the claims text at every "<digits>." marker and flatten newlines.
# Segments that are empty or whitespace-only (e.g. whatever precedes the first
# marker) are skipped so they do not turn into empty "claims" downstream.
# NOTE(review): the pattern also matches any other number followed by a period
# inside a claim's text (such as a decimal) - confirm against the data.
claims_raw = [
    segment.replace("\n", " ")
    for segment in re.split(r"[0-9]+\.", patent_file['claims2'])
    if segment.strip()
]

# Normalise each claim: lemmatize, drop stop words, punctuation and numeric
# tokens, then re-join the surviving tokens with single spaces.
claims = [
    ' '.join(tokenize(doc=claim, lemmatized=True, remove_punct=True,
                      remove_stopword=True, remove_num=True))
    for claim in claims_raw
]


In [10]:
# Embed every preprocessed claim with the all-mpnet-base-v2 sentence encoder.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(
    claims,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Sanity check on the result; the recorded run printed (20, 768).
print(embeddings.shape)


Downloading (…)99753/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0cdb299753/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)db299753/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)753/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)99753/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)9753/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)0cdb299753/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b299753/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(20, 768)
