In [1]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from src.utils.UsefulPaths import Paths

In [2]:
paths = Paths()

with open(paths.json_subsectors, 'r') as file:
    subsectors = json.load(file)

In [3]:
df_abstract = pd.read_csv(paths.raw_abstract)
df_abstract.drop_duplicates(inplace=True)

In [4]:
df_abstract.head(5)

Unnamed: 0,publication_number,abstract
0,20080063564,Embodiments of techniques for determining the ...
1,20080025285,A method for supporting frequency hopping of a...
2,20080056857,To correct any positional misalignment of a su...
3,20080031117,A holographic optical accessing system include...
4,20080056179,Transmitting an acknowledgement/negative ackno...


In [5]:
class PatentSimilarity:
    # bert-large-uncased-whole-word-masking
    # bert-base-multilingual-cased
    def __init__(self,
                 df_patents: pd.DataFrame = pd.DataFrame(),
                 model_name: str = 'bert-base-uncased',
                 definition_weight: int = 1,
                 keywords_weight: int = 1,
                 does_include: int = 1,
                 does_not_include: int = -1):

        self.df_patents = df_patents
        self.model = SentenceTransformer(model_name)

        # Peso para a definição
        self.definition_weight = definition_weight

        # Peso para as keywords
        self.keywords_weight = keywords_weight

        # Peso para as Does Include
        self.does_include = does_include

        # Peso para as Does Not Include
        self.does_not_include = does_not_include

        self.subsetors_embeddings = {}
        for subsetor_name, subsetor_values in subsectors.items():

            subsetor_definition = subsetor_values.get('Definition', '')
            subsetor_keywords = subsetor_values.get('Keywords', '')
            subsetor_does_include = subsetor_values.get('Does include', '')
            subsetor_does_not_include = subsetor_values.get('Does not include', '')

            subsetor_definition_emb = self.model.encode(subsetor_definition, convert_to_tensor=True) * self.definition_weight
            subsetor_keywords_emb = self.model.encode(subsetor_keywords, convert_to_tensor=True) * self.keywords_weight
            subsetor_does_include_emb = self.model.encode(subsetor_does_include, convert_to_tensor=True) * self.does_include
            subsetor_does_not_include_emb = self.model.encode(subsetor_does_not_include, convert_to_tensor=True) * self.does_not_include

            self.subsetors_embeddings[subsetor_name] = [subsetor_definition_emb, subsetor_keywords_emb, subsetor_does_include_emb, subsetor_does_not_include_emb]

    def similarity_between_one_patent_with_one_subsetor(self, patent_abstract, subsetor_name):

        # We pass the convert_to_tensor=True parameter to the encode function. This will return a pytorch tensor containing our embeddings. We can then call util.cos_sim(A, B) which computes the cosine similarity between all vectors in A and all vectors in B.
        patent_abstract_emb = self.model.encode(patent_abstract, convert_to_tensor=True)

        subsetor_definition_emb = self.subsetors_embeddings[subsetor_name][0]
        subsetor_keywords_emb = self.subsetors_embeddings[subsetor_name][1]
        subsetor_does_include_emb = self.subsetors_embeddings[subsetor_name][2]
        subsetor_does_not_include_emb = self.subsetors_embeddings[subsetor_name][3]

        # Combine os embeddings
        combined_emb = subsetor_definition_emb + subsetor_keywords_emb + subsetor_does_include_emb + subsetor_does_not_include_emb

        similarity = util.cos_sim(patent_abstract_emb, combined_emb).item()

        return similarity

    def similarity_between_one_patent_with_all_subsetors(self, publication_number, patent_abstract):
        col_names = ['publication_number', 'patent_abstract']
        patents_similarities = [publication_number, patent_abstract]
        for subsetor_name, embeddings in self.subsetors_embeddings.items():
            col_names.append(subsetor_name)
            specific_sim  = self.similarity_between_one_patent_with_one_subsetor(patent_abstract, subsetor_name)
            patents_similarities.append(specific_sim)

        df_one_patent = pd.DataFrame([patents_similarities], columns=col_names)
        return df_one_patent

    def similarity_between_all_patents_with_all_subsetors(self, break_index=-1):
        df_all = pd.DataFrame()
        for index, row in self.df_patents.iterrows():
            row_sim = self.similarity_between_one_patent_with_all_subsetors(row['publication_number'], row['abstract'])
            df_all = pd.concat([df_all, row_sim])
            if break_index == index:
                break

        return df_all

In [6]:
# 20080063564
patent_1 = 'A holographic optical accessing system includes a light source for emitting a light beam; an optical assembly module for receiving the light beam and generating a signal beam and a reference beam that are parallel to each other rather than overlap with each other, and have the same first polarization state; a lens module for focusing the signal beam and the reference beam on a focal point at the same time; and a storage medium for recording the focal point. The optical assembly module includes at least a data plane for displaying image information so that the signal beam contains the image information.'

# 20080015942
patent_2 = 'A product sampling and recommendation system uses customer profile data and/or real-time information from a point-of-sale system to tailor specific product recommendations to a customer using a sampling station.'

# 20080035176
patent_3 = 'The present invention relates to a mobile or stationary waste container cleaning system used for residential, commercial and industrial waste, garbage, trash, storage or operations containers or receptacles. Other applications include, but are not limited to cleaning of chemical drums, grease dumpsters (e.g. behind restaurants), rain barrels and non-uniform residential, commercial or industrial dumpsters or waste containers. The container cleaning system can alternatively be used for rural areas, farms or ranches.'

bert_sim = PatentSimilarity(df_patents=df_abstract, model_name='bert-base-uncased')
df_all_sim = bert_sim.similarity_between_all_patents_with_all_subsetors(break_index=3)

2023-08-30 15:40:29,314 - INFO - Load pretrained SentenceTransformer: bert-base-uncased
2023-08-30 15:40:29,317 - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2023-08-30 15:40:29,965 - DEBUG - https://huggingface.co:443 "GET /api/models/bert-base-uncased HTTP/1.1" 200 18010
2023-08-30 15:40:30,137 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/1dbc166cf8765166998eff31ade2eb64c8a40076/.gitattributes HTTP/1.1" 200 0
2023-08-30 15:40:30,303 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/1dbc166cf8765166998eff31ade2eb64c8a40076/LICENSE HTTP/1.1" 200 0
2023-08-30 15:40:30,478 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/1dbc166cf8765166998eff31ade2eb64c8a40076/README.md HTTP/1.1" 200 0
2023-08-30 15:40:30,635 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/1dbc166cf8765166998eff31ade2eb64c8a40076/config.json HTTP/1.1" 200 0
2023-08-30 15:40:30,800 - DEBUG - https://huggingface.co:44

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
df_all_sim.head()

Unnamed: 0,publication_number,patent_abstract,"Artificial Intelligence, Big Data and Analytics",Advanced Manufacturing and Robotics,Cleantech,Fintech,Blockchain,Cybersecurity,Agtech,New Food,Adtech,Blue Economy,Digital Media,Gaming (Digital Media Sub-Cluster),AR / VR (Digital Media Sub-Cluster),Edtech,Industry 4.0,Biopharma / Biotech,Medtech / Medical devices
0,20080063564,Embodiments of techniques for determining the ...,0.65833,0.590078,0.5248,0.555367,0.640767,0.473358,0.574478,0.569781,0.577763,0.632208,0.510824,0.486649,0.617445,0.642794,0.634167,0.585564,0.637038
0,20080025285,A method for supporting frequency hopping of a...,0.743863,0.653513,0.5727,0.566916,0.779217,0.620916,0.611177,0.539966,0.542358,0.669229,0.627278,0.524552,0.696973,0.682301,0.727113,0.59889,0.577785
0,20080056857,To correct any positional misalignment of a su...,0.704181,0.647438,0.540204,0.548495,0.704533,0.572491,0.60366,0.530414,0.50333,0.672701,0.623186,0.53136,0.672791,0.681367,0.690687,0.611993,0.573065
0,20080031117,A holographic optical accessing system include...,0.749519,0.674578,0.565756,0.575424,0.726539,0.56599,0.63279,0.569732,0.55398,0.667351,0.637344,0.514433,0.730669,0.672056,0.717934,0.620908,0.609627


In [20]:

df_all_sim['score_max'] = df_all_sim.iloc[: , 2:].max(axis=1)
df_all_sim['score_name'] = df_all_sim.iloc[: , 2:].idxmax(axis=1)


In [24]:
df_all_sim[['publication_number', 'patent_abstract', 'score_name', 'score_max']].head(10)

Unnamed: 0,publication_number,patent_abstract,score_name,score_max
0,20080063564,Embodiments of techniques for determining the ...,"Artificial Intelligence, Big Data and Analytics",0.65833
0,20080025285,A method for supporting frequency hopping of a...,Blockchain,0.779217
0,20080056857,To correct any positional misalignment of a su...,Blockchain,0.704533
0,20080031117,A holographic optical accessing system include...,"Artificial Intelligence, Big Data and Analytics",0.749519
