In [1]:
import json
import pandas as pd
from src.utils.UsefulPaths import Paths
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

2023-09-01 21:20:57,900 - DEBUG - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2023-09-01 21:20:57,962 - DEBUG - Creating converter from 7 to 5
2023-09-01 21:20:57,962 - DEBUG - Creating converter from 5 to 7
2023-09-01 21:20:57,962 - DEBUG - Creating converter from 7 to 5
2023-09-01 21:20:57,962 - DEBUG - Creating converter from 5 to 7


In [2]:
# Resolve project file locations through the project's Paths helper.
paths = Paths()

# Load the subsector mapping (name -> attributes) from JSON.
# JSON text is UTF-8 by specification, so the encoding is pinned rather than
# left to the platform default, which is not UTF-8 on every OS and can garble
# or reject non-ASCII characters.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [3]:
# Read the IPC spreadsheet into a DataFrame, then preview its first rows
# (head() defaults to 5 rows).
ipc_titles_path = paths.raw_ipc_titles
df_ipc = pd.read_excel(ipc_titles_path)
df_ipc.head()

Unnamed: 0,IPC,title
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...
1,F24J,HEATING; RANGES; VENTILATING
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P..."


In [4]:
# Split the subsector mapping into two parallel lists: the labels (dict keys,
# in insertion order) and their definitions (empty string when an entry has
# no 'Definition' key).
subsetor_names = list(subsectors.keys())
subsetor_definitions = [
    values.get('Definition', '') for values in subsectors.values()
]

# Append a catch-all entry to both lists so they stay index-aligned.
subsetor_names += ['Other']
subsetor_definitions += ['When there is no subsector that patent fits']

In [5]:
# Display the label list built above, including the appended 'Other' entry.
subsetor_names

['Artificial Intelligence, Big Data and Analytics',
 'Advanced Manufacturing and Robotics',
 'Cleantech',
 'Fintech',
 'Blockchain',
 'Cybersecurity',
 'Agtech',
 'New Food',
 'Adtech ',
 'Blue Economy',
 'Digital Media',
 'Gaming (Digital Media Sub-Cluster)',
 'AR / VR (Digital Media Sub-Cluster)',
 'Edtech',
 'Industry 4.0 ',
 'Biopharma / Biotech',
 'Medtech / Medical devices',
 'Other']

In [6]:
# Zero-shot classifier used to rank the subsector labels for each IPC title.
# Checkpoint names noted as alternatives for the `model` argument:
#   "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
#   "facebook/bart-large-mnli"
#   "sileod/deberta-v3-base-tasksource-nli"
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")

# Column names for a per-label score layout: 'IPC', 'title', 'score_1' .. 'score_N'.
# NOTE(review): nothing in the visible notebook reads col_names; it is kept
# only in case a consumer outside this view relies on it.
col_names = ['IPC', 'title'] + [f'score_{i}' for i in range(1, len(subsetor_names) + 1)]

# Classify each title on its own (same call as before, one row at a time) and
# store the labels, comma-joined, in the order the pipeline returns them
# (the pipeline documents `labels` as sorted by likelihood).
# The matching probabilities are available in output['scores'] if they are
# ever needed alongside the labels.
zero_shot_rankings = []
for title in df_ipc['title']:
    output = classifier(title, subsetor_names, multi_label=False)
    zero_shot_rankings.append(', '.join(output['labels']))

# Assign the whole column at once instead of seeding it with '' and writing
# cell by cell; the former per-row scores DataFrame was never read and is gone.
df_ipc['zero_shot_hypothesis'] = zero_shot_rankings

2023-09-01 21:21:06,038 - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2023-09-01 21:21:06,296 - DEBUG - https://huggingface.co:443 "HEAD /MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli/resolve/main/config.json HTTP/1.1" 200 0


In [7]:
# Preview the first rows, now including the 'zero_shot_hypothesis' column.
df_ipc.head()

Unnamed: 0,IPC,title,zero_shot_hypothesis
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...,"Industry 4.0 , Other, Fintech, Edtech, Artific..."
1,F24J,HEATING; RANGES; VENTILATING,"Medtech / Medical devices, Industry 4.0 , Biop..."
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...,"Other, Medtech / Medical devices, Biopharma / ..."
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA...","Biopharma / Biotech, Artificial Intelligence, ..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P...","Biopharma / Biotech, Other, Industry 4.0 , Med..."


In [9]:
# Sentence-embedding model used for the similarity-based ranking.
# Checkpoint names noted as alternatives:
#   'bert-base-uncased'
#   'bert-large-uncased-whole-word-masking'
#   'bert-base-multilingual-cased'
#   'all-mpnet-base-v2'
embedding_model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

2023-09-01 21:38:39,695 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-09-01 21:38:40,170 - INFO - Use pytorch device: cpu


In [10]:
# Embed every subsector definition once; the vectors are reused for all rows.
definitions_embbedings = model.encode(subsetor_definitions)

# For each IPC entry, embed a natural-language query built from its code and
# title, score it against every definition with cosine similarity, and store
# the subsector names ordered from most to least similar.
similarity_rankings = []
for ipc_code, title in zip(df_ipc['IPC'], df_ipc['title']):
    query = f'What subsetor best describes this International Patent Classification of number {ipc_code} and Title {title}?'
    query_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False, normalize_embeddings=False)

    # One query against all definitions: keep the single row of the score matrix.
    cos_scores = util.cos_sim(query_emb, definitions_embbedings)
    similarities = cos_scores.cpu()[0].numpy().tolist()

    # Order the labels by descending similarity with a plain stable sort,
    # instead of building and transposing a one-row DataFrame for every entry.
    ranked = sorted(zip(subsetor_names, similarities), key=lambda pair: pair[1], reverse=True)
    similarities_hypothesis = ', '.join(name for name, score in ranked)
    similarity_rankings.append(similarities_hypothesis)

# Assign the whole column at once rather than cell by cell.
df_ipc['similarities_hypothesis'] = similarity_rankings

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Preview the first rows, now including the 'similarities_hypothesis' column.
df_ipc.head()

Unnamed: 0,IPC,title,zero_shot_hypothesis,similarities_hypothesis
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...,"Industry 4.0 , Other, Fintech, Edtech, Artific...","Other, Industry 4.0 , Agtech, Artificial Intel..."
1,F24J,HEATING; RANGES; VENTILATING,"Medtech / Medical devices, Industry 4.0 , Biop...","Other, Medtech / Medical devices, Biopharma / ..."
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...,"Other, Medtech / Medical devices, Biopharma / ...","Other, Edtech, Cleantech, Medtech / Medical de..."
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA...","Biopharma / Biotech, Artificial Intelligence, ...","Other, Biopharma / Biotech, Agtech, Artificial..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P...","Biopharma / Biotech, Other, Industry 4.0 , Med...","Other, Agtech, Biopharma / Biotech, New Food, ..."


In [13]:
# Write the frame with both ranking columns to CSV; index=False omits the
# DataFrame index from the file.
df_ipc.to_csv(paths.ipc_subsetor_hypothesis, index=False)