In [12]:
import json
import pandas as pd
from src.utils.UsefulPaths import Paths
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

In [13]:
# Project path registry; its attributes are used below as file locations.
paths = Paths()

# Load the subsector catalogue. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so parsing does not depend on the platform locale.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [14]:
# Read the raw IPC code/title spreadsheet and preview its first five rows.
df_ipc = pd.read_excel(io=paths.raw_ipc_titles)
df_ipc.head(n=5)

Unnamed: 0,IPC,title
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...
1,F24J,HEATING; RANGES; VENTILATING
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P..."


In [15]:
# Collect the subsector labels and their textual definitions as two parallel
# lists; a subsector without a 'Definition' entry gets an empty string.
subsetor_names = list(subsectors)
subsetor_definitions = [
    details.get('Definition', '') for details in subsectors.values()
]

# Append a catch-all class as the final entry of both lists.
subsetor_names += ['Other']
subsetor_definitions += ['When there is no subsector that patent fits']

In [16]:
# Show the candidate labels.
# NOTE(review): some displayed labels are misspelled ('Vitual Reality',
# 'Biotechonology'). They are the keys of the subsector JSON and are passed
# verbatim to the zero-shot classifier as candidate labels, so consider
# correcting them at the source.
subsetor_names

['Artificial Intelligence; Big Data and Analytics',
 'Advanced Manufacturing and Robotics',
 'Clean Technology',
 'Financial Technology',
 'Blockchain',
 'Cybersecurity',
 'Agriculture Technology',
 'New Food',
 'Advertising Technology',
 'Blue Economy',
 'Digital Media',
 'Gaming',
 'Augmented Reality; Vitual Reality',
 'Educational Technology',
 'Industry 4.0',
 'Biopharmaceutical; Biotechonology',
 'Medical Technology; Medical devices',
 'Other']

In [20]:
# "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
# "facebook/bart-large-mnli"
# "sileod/deberta-v3-base-tasksource-nli"
# Zero-shot NLI pipeline used to rank the subsector labels for each IPC title.
classifier = pipeline("zero-shot-classification", model="sileod/deberta-v3-base-tasksource-nli")

# For every title, store all candidate labels as one comma-separated string,
# in the order the pipeline returns them (documented as sorted by likelihood).
# multi_label=False makes the labels compete with each other for one title.
#
# The column is built in a single assignment instead of writing cell by cell
# while iterating over the frame, and no throw-away per-row objects are created.
df_ipc['zero_shot_hypothesis'] = [
    ', '.join(classifier(title, subsetor_names, multi_label=False)['labels'])
    for title in df_ipc['title']
]

2023-09-02 15:21:57,049 - DEBUG - https://huggingface.co:443 "HEAD /sileod/deberta-v3-base-tasksource-nli/resolve/main/config.json HTTP/1.1" 200 0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [21]:
# Preview the new zero_shot_hypothesis column next to the IPC codes and titles.
df_ipc.head()

Unnamed: 0,IPC,title,zero_shot_hypothesis
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...,"Other, Industry 4.0, Digital Media, Medical Te..."
1,F24J,HEATING; RANGES; VENTILATING,"Medical Technology; Medical devices, Clean Tec..."
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...,"Other, Medical Technology; Medical devices, Bi..."
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA...","Other, Biopharmaceutical; Biotechonology, Indu..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P...","Medical Technology; Medical devices, Biopharma..."


In [22]:
# Sentence-embedding model used for the similarity-based ranking.
# Alternative checkpoints kept for reference:
#   'bert-base-uncased'
#   'bert-large-uncased-whole-word-masking'
#   'bert-base-multilingual-cased'
#   'all-mpnet-base-v2'  (the one loaded below)
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

2023-09-02 15:40:06,311 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-09-02 15:40:06,784 - INFO - Use pytorch device: cpu


In [23]:
# Rank the subsectors for every IPC entry by cosine similarity between a query
# built from the entry and each subsector definition.
#
# The definitions are encoded as a tensor, like the queries, so the comparison
# does not rely on util.cos_sim converting a NumPy array and both operands come
# from the same encode path.
definitions_embbedings = model.encode(subsetor_definitions, convert_to_tensor=True)

# One query per row; the wording is kept verbatim because it is part of the
# text being embedded.
# NOTE(review): in the preview displayed after this cell 'Other' ranks first
# for every row shown - possibly because this question-style query shares
# wording with the 'Other' definition. Worth checking before relying on it.
queries = [
    f'What subsetor best describes this International Patent Classification of number {ipc} and Title {title}?'
    for ipc, title in zip(df_ipc['IPC'], df_ipc['title'])
]

if queries:
    # Encode all queries in one call instead of one model call per row.
    query_embeddings = model.encode(queries, convert_to_tensor=True, show_progress_bar=False, normalize_embeddings=False)

    # cos_scores[i, j] is the similarity between query i and definition j.
    cos_scores = util.cos_sim(query_embeddings, definitions_embbedings)

    # Per query, definition positions ordered from most to least similar.
    ranked_positions = cos_scores.argsort(dim=1, descending=True).cpu().tolist()
else:
    # Nothing to rank for an empty table; still create the (empty) column.
    ranked_positions = []

# Store, for each row, every label as a comma-separated string, best match first.
df_ipc['similarities_hypothesis'] = [
    ', '.join(subsetor_names[position] for position in positions)
    for positions in ranked_positions
]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
# Preview both hypothesis columns side by side for the first rows.
df_ipc.head()

Unnamed: 0,IPC,title,zero_shot_hypothesis,similarities_hypothesis
0,G16Y,INFORMATION AND COMMUNICATION TECHNOLOGY SPECI...,"Other, Industry 4.0, Digital Media, Medical Te...","Other, Industry 4.0, Agriculture Technology, A..."
1,F24J,HEATING; RANGES; VENTILATING,"Medical Technology; Medical devices, Clean Tec...","Other, Medical Technology; Medical devices, Bi..."
2,F21H,INCANDESCENT MANTLES; OTHER INCANDESCENT BODIE...,"Other, Medical Technology; Medical devices, Bi...","Other, Educational Technology, Clean Technolog..."
3,G16B,"BIOINFORMATICS, i.e. INFORMATION AND COMMUNICA...","Other, Biopharmaceutical; Biotechonology, Indu...","Other, Biopharmaceutical; Biotechonology, Agri..."
4,A01P,"BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR P...","Medical Technology; Medical devices, Biopharma...","Other, Agriculture Technology, Biopharmaceutic..."


In [25]:
# Persist the table with both hypothesis columns; the index is not written.
df_ipc.to_csv(path_or_buf=paths.ipc_subsetor_hypothesis, index=False)