In [1]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from src.utils.UsefulPaths import Paths

In [2]:
# Project helper exposing the locations of the data files used by this notebook.
paths = Paths()

# Load the subsector descriptions. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so parsing does not depend on the platform's locale default.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [4]:
df_abstract = pd.read_parquet(paths.raw_parquet_abstract)

# Number of consecutive rows handed to each annotator.
offset_div = 50


def _annotator_sample(chunk_number):
    """Return the chunk_number-th block of ``offset_div`` rows of ``df_abstract``.

    The block is copied so it is independent of ``df_abstract`` and its index is
    renumbered from 0 (the old index is discarded, hence ``drop=True``).
    """
    start = offset_div * chunk_number
    return df_abstract.iloc[start:start + offset_div].copy().reset_index(drop=True)


df_joao = _annotator_sample(0)
df_leo = _annotator_sample(1)
df_maxm = _annotator_sample(2)
# NOTE(review): chunks 3 and 5 are not assigned to anyone; the original slices
# skipped them too. Confirm whether that gap is intentional.
df_rafael = _annotator_sample(4)
df_thiago = _annotator_sample(6)

In [37]:
class PatentSimilarity:
    """Score patent abstracts against subsector descriptions with sentence embeddings.

    For every subsector, four text fields ('Definition', 'Keywords', 'Does include',
    'Does not include') are encoded once at construction time and multiplied by their
    respective weights. A patent abstract is scored against a subsector by taking the
    cosine similarity between the abstract embedding and the sum of the four weighted
    subsector embeddings.
    """

    def __init__(self,
                 df_patents: pd.DataFrame = None,
                 model_name: str = 'bert-base-uncased',
                 definition_weight: float = 1,
                 keywords_weight: float = 1,
                 does_include: float = 1,
                 does_not_include: float = -1,
                 show_progress_bar=False,
                 subsectors_data: dict = None):
        """Load the embedding model and pre-compute the weighted subsector embeddings.

        Args:
            df_patents: Frame with 'publication_number' and 'abstract' columns.
                Defaults to a new empty frame.
            model_name: Name passed to ``SentenceTransformer``.
            definition_weight: Multiplier applied to the 'Definition' embedding.
            keywords_weight: Multiplier applied to the 'Keywords' embedding.
            does_include: Multiplier applied to the 'Does include' embedding.
            does_not_include: Multiplier applied to the 'Does not include' embedding.
            show_progress_bar: Forwarded to every ``encode`` call.
            subsectors_data: Mapping of subsector name -> dict of text fields.
                When omitted, the notebook-level ``subsectors`` mapping is used.
        """
        # A default of pd.DataFrame() in the signature would be created once and
        # shared by every instance; build a fresh one per instance instead.
        self.df_patents = pd.DataFrame() if df_patents is None else df_patents
        self.model = SentenceTransformer(model_name)

        # Weight for the definition
        self.definition_weight = definition_weight

        # Weight for the keywords
        self.keywords_weight = keywords_weight

        # Weight for the "Does include" text
        self.does_include = does_include

        # Weight for the "Does not include" text
        self.does_not_include = does_not_include

        self.show_progress_bar = show_progress_bar

        if subsectors_data is None:
            # Backward-compatible fallback to the mapping loaded earlier in the notebook.
            subsectors_data = subsectors

        # subsector name -> [definition, keywords, does include, does not include],
        # each entry already multiplied by its weight.
        # NOTE(review): a missing field falls back to '' and is still encoded, so it
        # contributes the embedding of empty text; confirm this is intended.
        self.subsetors_embeddings = {}
        for subsetor_name, subsetor_values in subsectors_data.items():
            self.subsetors_embeddings[subsetor_name] = [
                self._encode(subsetor_values.get('Definition', '')) * self.definition_weight,
                self._encode(subsetor_values.get('Keywords', '')) * self.keywords_weight,
                self._encode(subsetor_values.get('Does include', '')) * self.does_include,
                self._encode(subsetor_values.get('Does not include', '')) * self.does_not_include,
            ]

    def _encode(self, text):
        """Encode ``text`` with the model, requesting a tensor result."""
        return self.model.encode(text, convert_to_tensor=True, show_progress_bar=self.show_progress_bar)

    def similarity_between_one_patent_with_one_subsetor(self, patent_abstract, subsetor_name, patent_abstract_emb=None):
        """Return the cosine similarity between one abstract and one subsector.

        Args:
            patent_abstract: Abstract text; encoded here unless ``patent_abstract_emb``
                is supplied.
            subsetor_name: Key into ``self.subsetors_embeddings``.
            patent_abstract_emb: Optional pre-computed embedding of the abstract, so a
                caller scoring many subsectors can encode the abstract only once.

        Returns:
            The similarity as a Python scalar.

        Raises:
            KeyError: If ``subsetor_name`` is not a known subsector.
        """
        if patent_abstract_emb is None:
            patent_abstract_emb = self._encode(patent_abstract)

        definition_emb, keywords_emb, does_include_emb, does_not_include_emb = self.subsetors_embeddings[subsetor_name]

        # Combine the weighted embeddings into a single subsector vector.
        combined_emb = definition_emb + keywords_emb + does_include_emb + does_not_include_emb

        return util.cos_sim(patent_abstract_emb, combined_emb).item()

    def _score_row(self, publication_number, patent_abstract):
        """Return [publication_number, patent_abstract, score per subsector...] as a list."""
        # Encode the abstract once and reuse it for every subsector.
        patent_abstract_emb = self._encode(patent_abstract)
        scored = [publication_number, patent_abstract]
        for subsetor_name in self.subsetors_embeddings:
            scored.append(
                self.similarity_between_one_patent_with_one_subsetor(
                    patent_abstract, subsetor_name, patent_abstract_emb=patent_abstract_emb
                )
            )
        return scored

    def similarity_between_one_patent_with_all_subsetors(self, publication_number, patent_abstract):
        """Return a one-row frame with the patent's similarity to every subsector.

        Columns are 'publication_number', 'patent_abstract', then one column per
        subsector in the order of ``self.subsetors_embeddings``.
        """
        col_names = ['publication_number', 'patent_abstract', *self.subsetors_embeddings]
        return pd.DataFrame([self._score_row(publication_number, patent_abstract)], columns=col_names)

    def similarity_between_all_patents_with_all_subsetors(self, break_index=-1):
        """Score the rows of ``self.df_patents`` against every subsector.

        Args:
            break_index: Index label of the last row to process (inclusive). Iteration
                stops after the row whose index label equals this value; if no label
                matches, all rows are processed.

        Returns:
            A frame indexed 0..n-1 with 'publication_number', 'patent_abstract', one
            score column per subsector, 'score_max' (highest subsector score of the
            row) and 'score_name' (name of the subsector achieving it). With no rows
            or no subsectors, the two summary columns are present but empty/NaN.
        """
        subsetor_names = list(self.subsetors_embeddings)

        # Collect plain rows and build the frame once: concatenating inside the loop
        # is quadratic and leaves every row with index label 0.
        rows = []
        for index, row in self.df_patents.iterrows():
            rows.append(self._score_row(row['publication_number'], row['abstract']))
            if break_index == index:
                break

        df_all = pd.DataFrame(rows, columns=['publication_number', 'patent_abstract', *subsetor_names])

        if rows and subsetor_names:
            # Restrict both summaries to the subsector columns so 'score_max' itself
            # can never be a candidate for 'score_name'.
            scores = df_all[subsetor_names]
            df_all['score_max'] = scores.max(axis=1)
            df_all['score_name'] = scores.idxmax(axis=1)
        else:
            df_all['score_max'] = pd.Series(index=df_all.index, dtype='float64')
            df_all['score_name'] = pd.Series(index=df_all.index, dtype='object')

        return df_all

In [38]:
# Other checkpoints that can be swapped in for model_name:
#   bert-large-uncased-whole-word-masking
#   bert-base-multilingual-cased
bert_sim = PatentSimilarity(
    df_patents=df_thiago,
    model_name='bert-base-multilingual-cased',
    does_include=2,
    does_not_include=-2,
)

# break_index=2 stops the scoring loop after the row whose index label is 2.
df_all_sim = bert_sim.similarity_between_all_patents_with_all_subsetors(break_index=2)

2023-08-30 18:14:57,745 - INFO - Load pretrained SentenceTransformer: bert-base-multilingual-cased
2023-08-30 18:14:57,745 - DEBUG - Resetting dropped connection: huggingface.co
2023-08-30 18:14:58,104 - DEBUG - https://huggingface.co:443 "GET /api/models/bert-base-multilingual-cased HTTP/1.1" 200 4337
2023-08-30 18:14:58,268 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/fdfce55e83dbed325647a63e7e1f5de19f0382ba/.gitattributes HTTP/1.1" 200 0
2023-08-30 18:14:58,432 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/fdfce55e83dbed325647a63e7e1f5de19f0382ba/README.md HTTP/1.1" 200 0
2023-08-30 18:14:58,589 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json HTTP/1.1" 200 0
2023-08-30 18:14:58,762 - DEBUG - https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/fdfce55e83dbed325647a63e7e1f5de19f0382ba/model.safetensors HTTP/1.1"

In [39]:
# Show only the identifying columns and the winning subsector with its score.
summary_columns = ['publication_number', 'patent_abstract', 'score_name', 'score_max']
df_all_sim.loc[:, summary_columns].head(50)

Unnamed: 0,publication_number,patent_abstract,score_name,score_max
0,20080017043,A brew stick for brewing a measure of brewable...,Industry 4.0,0.650784
0,20080050603,Polylactide polymers are reacted with an epoxy...,Blockchain,0.612674
0,20080007823,An interferometer comprises a light source uni...,Industry 4.0,0.646756


In [40]:
# Print each abstract in full, followed by its best-matching subsector and score.
for abstract_text, best_name, best_score in zip(
    df_all_sim['patent_abstract'],
    df_all_sim['score_name'],
    df_all_sim['score_max'],
):
    print(f'{abstract_text}\n\n{best_name}\n{best_score}\n\n')

A brew stick for brewing a measure of brewable material. The brew stick may include a bag with the measure of brewable material positioned therein and a plate. The plate may include a pair of wings and a central tab. The bag is connected to the central tab such that the bag can be raised within the wings.

Industry 4.0 
0.6507836580276489


Polylactide polymers are reacted with an epoxy-functional acrylate polymer to introduce long-chain branching into the polymer. The acrylate polymer provides a flexible means for introducing a controllable amount of branching into the polylactide polymer, with little risk of forming gelled or highly crosslinked structures. The branched polylactide polymers have excellent melt rheological properties that make them more easily processable in various melt-processing applications.

Blockchain
0.6126735806465149


An interferometer comprises a light source unit, a first splitter, a reference beam unit and a detection unit. The light source unit provides a