In [1]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from src.utils.UsefulPaths import Paths

In [3]:
paths = Paths()

with open(paths.json_subsectors, 'r') as file:
    subsectors = json.load(file)

In [14]:
df_abstract = pd.read_parquet(paths.raw_parquet_abstract)

offset_div = 50
df_joao = df_abstract.iloc[:offset_div].copy().reset_index(drop='index')
df_leo = df_abstract.iloc[offset_div:offset_div*2].copy().reset_index(drop='index')
df_maxm = df_abstract.iloc[offset_div*2:offset_div*3].copy().reset_index(drop='index')
df_rafael = df_abstract.iloc[offset_div*4:offset_div*5].copy().reset_index(drop='index')
df_thiago = df_abstract.iloc[offset_div*6:offset_div*7].copy().reset_index(drop='index')

In [23]:
class PatentSimilarity:
    def __init__(self,
                 df_patents: pd.DataFrame = pd.DataFrame(),
                 model_name: str = 'bert-base-uncased',
                 definition_weight: float = 1,
                 keywords_weight: float = 1,
                 does_include: float = 1,
                 does_not_include: float = -1,
                 show_progress_bar=False,
                 top_scores=3):

        self.df_patents = df_patents
        self.model = SentenceTransformer(model_name)

        # Peso para a definição
        self.definition_weight = definition_weight

        # Peso para as keywords
        self.keywords_weight = keywords_weight

        # Peso para as Does Include
        self.does_include = does_include

        # Peso para as Does Not Include
        self.does_not_include = does_not_include

        self.show_progress_bar = show_progress_bar

        self.top_scores = top_scores

        self.subsetors_embeddings = {}
        self.subsetors_definitions = []
        for subsetor_name, subsetor_values in subsectors.items():

            subsetor_definition = subsetor_values.get('Definition', '')
            subsetor_keywords = subsetor_values.get('Keywords', '')
            subsetor_does_include = subsetor_values.get('Does include', '')
            subsetor_does_not_include = subsetor_values.get('Does not include', '')

            self.subsetors_definitions.append(f'{subsetor_keywords}')
            # self.subsetors_definitions.append(f'{subsetor_definition}. {subsetor_keywords}.')
            # self.subsetors_definitions.append(f'{subsetor_definition}. {subsetor_keywords}. {subsetor_does_include}')

            subsetor_definition_emb = self.model.encode(subsetor_definition, convert_to_tensor=True, show_progress_bar=self.show_progress_bar) * self.definition_weight
            subsetor_keywords_emb = self.model.encode(subsetor_keywords, convert_to_tensor=True, show_progress_bar=self.show_progress_bar) * self.keywords_weight
            subsetor_does_include_emb = self.model.encode(subsetor_does_include, convert_to_tensor=True, show_progress_bar=self.show_progress_bar) * self.does_include
            subsetor_does_not_include_emb = self.model.encode(subsetor_does_not_include, convert_to_tensor=True, show_progress_bar=self.show_progress_bar) * self.does_not_include

            self.subsetors_embeddings[subsetor_name] = [subsetor_definition_emb, subsetor_keywords_emb, subsetor_does_include_emb, subsetor_does_not_include_emb]

    def similarity_between_one_patent_with_one_subsetor(self, patent_abstract, subsetor_name):

        # We pass the convert_to_tensor=True parameter to the encode function. This will return a pytorch tensor containing our embeddings. We can then call util.cos_sim(A, B) which computes the cosine similarity between all vectors in A and all vectors in B.

        # query_embedding = f'In this patent abstract "{patent_abstract}", what is the subsetor definiton that best describes this patent?'
        query_embedding = f'In this patent abstract "{patent_abstract}", what are the keywords that best describes this patent?'
        patent_abstract_emb = self.model.encode(query_embedding, convert_to_tensor=True, show_progress_bar=self.show_progress_bar, normalize_embeddings=False)

        subsetor_definition_emb = self.subsetors_embeddings[subsetor_name][0]
        subsetor_keywords_emb = self.subsetors_embeddings[subsetor_name][1]
        subsetor_does_include_emb = self.subsetors_embeddings[subsetor_name][2]
        subsetor_does_not_include_emb = self.subsetors_embeddings[subsetor_name][3]

        # Combine os embeddings
        # combined_emb = subsetor_definition_emb + subsetor_keywords_emb + subsetor_does_include_emb - subsetor_does_not_include_emb

        similarity = util.cos_sim(patent_abstract_emb, subsetor_definition_emb).item()

    def similarity_between_one_patent_with_all_subsetors(self, publication_number, patent_abstract):
        col_names = ['publication_number', 'patent_abstract']
        patents_similarities = [publication_number, patent_abstract]
        for subsetor_name, embeddings in self.subsetors_embeddings.items():
            col_names.append(subsetor_name)
            specific_sim  = self.similarity_between_one_patent_with_one_subsetor(patent_abstract, subsetor_name)
            patents_similarities.append(specific_sim)

        df_one_patent = pd.DataFrame([patents_similarities], columns=col_names)
        return df_one_patent

    def similarity_between_all_patents_with_all_subsetors(self, break_index=-1):
        df_all = pd.DataFrame()
        for index, row in self.df_patents.iterrows():
            row_sim = self.similarity_between_one_patent_with_all_subsetors(row['publication_number'], row['abstract'])
            df_all = pd.concat([df_all, row_sim])
            if break_index == index:
                break

        df_all['score_max'] = df_all.iloc[: , 2:].max(axis=1)
        df_all['score_name'] = df_all.iloc[: , 2:].idxmax(axis=1)

        return df_all

    def similarity_between_one_patent_with_all_subsetors_query(self, publication_number, patent_abstract):
        col_names = ['publication_number', 'patent_abstract']
        patents_similarities = [publication_number, patent_abstract]

        query = f'In this patent abstract "{patent_abstract}", what is the subsetor definiton that best describes this patent?'
        query_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=self.show_progress_bar, normalize_embeddings=False)

        setor_definitions_emb = self.model.encode(self.subsetors_definitions, convert_to_tensor=True, show_progress_bar=self.show_progress_bar, normalize_embeddings=False)
        cos_scores = util.cos_sim(query_emb, setor_definitions_emb)

        i = 0
        for subsetor_name, subsetor_values in subsectors.items():
            col_names.append(subsetor_name)
            patents_similarities.append(cos_scores[0][i].item())
            i = i + 1

        df_one_patent = pd.DataFrame([patents_similarities], columns=col_names)
        df_one_patent_sorted = df_one_patent.iloc[0, 2:].to_frame().T.sort_values(by=0, axis=1, ascending=False)

        for i in range(0, self.top_scores):
            col = f'score_max_{i + 1}'
            max_col_name = df_one_patent_sorted.columns[i]
            max_value = df_one_patent_sorted.iloc[0, i]
            df_one_patent[col] = f'{max_col_name} - {max_value}'

        return df_one_patent

    def similarity_between_all_patents_with_all_subsetors_query(self, break_index=-1):
        df_all = pd.DataFrame()
        for index, row in self.df_patents.iterrows():
            row_sim = self.similarity_between_one_patent_with_all_subsetors_query(row['publication_number'], row['abstract'])
            df_all = pd.concat([df_all, row_sim])
            if break_index == index:
                break

        return df_all

# 'bert-base-uncased'
# bert-large-uncased-whole-word-masking
# bert-base-multilingual-cased
# 'all-mpnet-base-v2'
bert_sim = PatentSimilarity(df_patents=df_joao, model_name='roberta-base-squad2', does_include=1, does_not_include=-1)
df_all_sim = bert_sim.similarity_between_all_patents_with_all_subsetors_query(break_index=50)
print(df_all_sim[['publication_number', 'patent_abstract', 'score_max_1', 'score_max_2', 'score_max_3']].head(50))
df_all_sim[['publication_number', 'patent_abstract', 'score_max_1', 'score_max_2', 'score_max_3']].to_excel(f'C:\\Users\\thiag\\PycharmProjects\\genome\\data\\processed\\class_patent.xlsx', index=False, header=True)

2023-08-31 20:31:21,209 - INFO - Load pretrained SentenceTransformer: roberta-base-squad2
2023-08-31 20:31:21,383 - DEBUG - https://huggingface.co:443 "GET /api/models/sentence-transformers/roberta-base-squad2 HTTP/1.1" 401 41


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-64f122c7-39387d992bdcd9fe59ad28b3;fdf794c6-1ddc-4525-b64d-34706e6b010e)

Repository Not Found for url: https://huggingface.co/api/models/sentence-transformers/roberta-base-squad2.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Invalid username or password.

In [7]:
# bert-large-uncased-whole-word-masking
# bert-base-multilingual-cased
# 'all-mpnet-base-v2'
bert_sim = PatentSimilarity(df_patents=df_thiago, model_name='bert-large-uncased-whole-word-masking', does_include=1, does_not_include=-1)
df_all_sim = bert_sim.similarity_between_all_patents_with_all_subsetors(break_index=2)


2023-08-31 14:41:21,090 - INFO - Load pretrained SentenceTransformer: bert-large-uncased-whole-word-masking
2023-08-31 14:41:21,341 - DEBUG - https://huggingface.co:443 "GET /api/models/bert-large-uncased-whole-word-masking HTTP/1.1" 200 2227
2023-08-31 14:41:21,517 - DEBUG - https://huggingface.co:443 "HEAD /bert-large-uncased-whole-word-masking/resolve/8b35f05561d0d917087166b71d3c2c83a39104b1/.gitattributes HTTP/1.1" 200 0
2023-08-31 14:41:21,680 - DEBUG - https://huggingface.co:443 "HEAD /bert-large-uncased-whole-word-masking/resolve/8b35f05561d0d917087166b71d3c2c83a39104b1/README.md HTTP/1.1" 200 0
2023-08-31 14:41:21,844 - DEBUG - https://huggingface.co:443 "HEAD /bert-large-uncased-whole-word-masking/resolve/8b35f05561d0d917087166b71d3c2c83a39104b1/config.json HTTP/1.1" 200 0
2023-08-31 14:41:22,017 - DEBUG - https://huggingface.co:443 "HEAD /bert-large-uncased-whole-word-masking/resolve/8b35f05561d0d917087166b71d3c2c83a39104b1/model.safetensors HTTP/1.1" 302 0
2023-08-31 14:41:2

TypeError: new(): invalid data type 'str'

In [85]:
df_all_sim[['publication_number', 'patent_abstract', 'score_name', 'score_max']].head(50)

Unnamed: 0,publication_number,patent_abstract,score_name,score_max
0,20080017043,A brew stick for brewing a measure of brewable...,Agtech,0.780324
0,20080050603,Polylactide polymers are reacted with an epoxy...,Agtech,0.802485
0,20080007823,An interferometer comprises a light source uni...,Agtech,0.759884


In [86]:
for index, row in df_all_sim.iterrows():
    print(f'{row["patent_abstract"]}\n\n{row["score_name"]}\n{row["score_max"]}\n\n')

A brew stick for brewing a measure of brewable material. The brew stick may include a bag with the measure of brewable material positioned therein and a plate. The plate may include a pair of wings and a central tab. The bag is connected to the central tab such that the bag can be raised within the wings.

Agtech
0.7803242206573486


Polylactide polymers are reacted with an epoxy-functional acrylate polymer to introduce long-chain branching into the polymer. The acrylate polymer provides a flexible means for introducing a controllable amount of branching into the polylactide polymer, with little risk of forming gelled or highly crosslinked structures. The branched polylactide polymers have excellent melt rheological properties that make them more easily processable in various melt-processing applications.

Agtech
0.802484929561615


An interferometer comprises a light source unit, a first splitter, a reference beam unit and a detection unit. The light source unit provides a laser beam.