In [53]:
import json
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
from src.utils.UsefulPaths import Paths

In [54]:
class BertSimilarity:
    """Compare a patent abstract with a subsector description using BERT embeddings.

    Texts are embedded with the model's ``pooler_output``. The patent embedding
    is shifted by weighted embeddings of the subsector's keywords and
    non-keywords before the cosine similarity with the subsector definition
    is computed.
    """

    # Fallback token limit used when the loaded model does not expose
    # ``config.max_position_embeddings``. 512 is the limit of the standard
    # BERT checkpoints such as the default 'bert-base-uncased'.
    DEFAULT_MAX_LENGTH = 512

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and the frozen TF BERT model named ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = TFBertModel.from_pretrained(model_name)
        # Inference only: the weights are never updated.
        self.model.trainable = False

        # Weight applied to the keyword embedding
        self.keyword_weight = 1.0

        # Weight applied to the non-keyword embedding (negative, so it is
        # subtracted from the combined embedding)
        self.non_keyword_weight = -2

    def _max_sequence_length(self):
        """Return the longest token sequence the loaded model accepts.

        Reads ``max_position_embeddings`` from the model configuration and
        falls back to ``DEFAULT_MAX_LENGTH`` when it is missing or invalid.
        """
        config = getattr(self.model, 'config', None)
        limit = getattr(config, 'max_position_embeddings', None)
        if isinstance(limit, int) and limit > 0:
            return limit
        return self.DEFAULT_MAX_LENGTH

    def encode(self, text):
        """Tokenize ``text`` and return the model's ``pooler_output`` for it.

        The input is truncated to the model's position-embedding limit. The
        previous hard-coded ``max_length=8000`` let over-long texts through
        untruncated, which fails inside the model for inputs above that limit.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="tf",
            truncation=True,
            padding=True,
            max_length=self._max_sequence_length(),
        )
        outputs = self.model(inputs)
        return outputs['pooler_output']

    def encode_with_keywords(self, text, keywords, non_keywords):
        """Return the embedding of ``text`` shifted by weighted keyword embeddings.

        Computes ``encode(text) + keyword_weight * encode(keywords)
        + non_keyword_weight * encode(non_keywords)``.

        NOTE(review): each argument is assumed to be a single string; a list
        would be encoded as a batch and change the result shape. Confirm
        against the subsector JSON schema.
        """
        # Embedding of the main text
        emb_text = self.encode(text)

        # Embed keywords and non-keywords and apply their weights
        emb_keywords = self.encode(keywords) * self.keyword_weight
        emb_non_keywords = self.encode(non_keywords) * self.non_keyword_weight

        # Combine the three embeddings
        combined_emb = emb_text + emb_keywords + emb_non_keywords
        return combined_emb

    def compute_similarity(self, patent_abstract, subsetor_definiton, subsetor_keywords, subsetor_non_keywords):
        """Return the cosine similarity between a patent and a subsector.

        The patent side is ``encode_with_keywords(patent_abstract,
        subsetor_keywords, subsetor_non_keywords)``; the subsector side is
        ``encode(subsetor_definiton)``. Only the first entry of the
        similarity matrix is returned.
        """
        patent_embeddings = self.encode_with_keywords(patent_abstract, subsetor_keywords, subsetor_non_keywords)
        subsetor_embeddings = self.encode(subsetor_definiton)

        sim = cosine_similarity(patent_embeddings, subsetor_embeddings)

        return sim[0][0]

In [55]:
paths = Paths()

# Load the subsector descriptions. JSON is UTF-8 by specification, so decode
# it explicitly instead of relying on the platform's locale encoding.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [59]:
# Sample patent abstracts; the number above each one is presumably its
# publication number.

# 20080063564
patent_1 = 'A holographic optical accessing system includes a light source for emitting a light beam; an optical assembly module for receiving the light beam and generating a signal beam and a reference beam that are parallel to each other rather than overlap with each other, and have the same first polarization state; a lens module for focusing the signal beam and the reference beam on a focal point at the same time; and a storage medium for recording the focal point. The optical assembly module includes at least a data plane for displaying image information so that the signal beam contains the image information.'

# 20080015942
patent_2 = 'A product sampling and recommendation system uses customer profile data and/or real-time information from a point-of-sale system to tailor specific product recommendations to a customer using a sampling station.'

# 20080035176
patent_3 = 'The present invention relates to a mobile or stationary waste container cleaning system used for residential, commercial and industrial waste, garbage, trash, storage or operations containers or receptacles. Other applications include, but are not limited to cleaning of chemical drums, grease dumpsters (e.g. behind restaurants), rain barrels and non-uniform residential, commercial or industrial dumpsters or waste containers. The container cleaning system can alternatively be used for rural areas, farms or ranches.'

bert_sim = BertSimilarity()

# Abstract scored in this run; switch to patent_2 / patent_3 to try the others.
patent = patent_1

# Similarity of the chosen abstract to every subsector, keyed by subsector name.
sim_dict = {}
for key, value in subsectors.items():
    # Missing fields fall back to an empty string.
    subsector_definition = value.get('Definition', '')
    subsector_keywords = value.get('Keywords', '')
    subsector_does_not_include = value.get('Does not include', '')
    # Read for completeness; not part of the score below.
    subsector_does_include = value.get('Does include', '')

    sim_dict[key] = bert_sim.compute_similarity(
        patent,
        subsector_definition,
        subsector_keywords,
        subsector_does_not_include,
    )

# Rank the subsectors from most to least similar.
ranked_names = sorted(sim_dict, key=sim_dict.get, reverse=True)
sorted_data = {name: sim_dict[name] for name in ranked_names}

for key, value in sorted_data.items():
    print(f'{key}: {value:.4f}')



2023-08-28 22:14:48,066 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
2023-08-28 22:14:48,244 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/config.json HTTP/1.1" 200 0
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification 

Medtech / Medical devices: 0.9704
New Food: 0.9600
Adtech : 0.9503
Cleantech: 0.9239
Gaming (Digital Media Sub-Cluster): 0.8923
Artificial Intelligence, Big Data and Analytics: 0.8914
Blockchain: 0.8861
Biopharma / Biotech: 0.8838
Advanced Manufacturing and Robotics: 0.8763
AR / VR (Digital Media Sub-Cluster): 0.8689
Blue Economy: 0.8612
Cybersecurity: 0.8547
Industry 4.0 : 0.8425
Fintech: 0.7707
Digital Media: 0.7660
Agtech: 0.4277
Edtech: -0.0940
