Import and set-up

In [1]:
import torch
import pandas as pd
from transformers import pipeline
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import DataCollatorWithPadding

tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-large", num_labels=5)

model.config.dropout = 0.5
model.classifier = torch.nn.Linear(in_features=model.config.hidden_size, out_features=5, bias=True)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 4.57kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.24MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 804kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 475/475 [00:00<00:00, 674kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.63G/1.63G [00:16<00:00, 102MB/s] 
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'pooler.dense.weight']
You should probabl

Dataset and Feature Engineering

In [2]:
from sklearn.model_selection import train_test_split
train_dataset = pd.DataFrame(pd.read_csv('train_patents.csv'))

In [3]:
score_mapping = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4
}

train_dataset['score'] = train_dataset['score'].map(score_mapping)

In [4]:
df_train_dataset, cross_verify_data = train_test_split(train_dataset, test_size=0.1)  # 10% for cross-verification

In [5]:
import nltk
from nltk.corpus import wordnet
from random import randint

nltk.download('wordnet')

def replace_synonym(sentence, num_replacements=1):
    words = sentence.split()
    new_sentence = sentence
    for _ in range(num_replacements):
        word_to_replace = words[randint(0, len(words)-1)]
        synonyms = [syn.name().split('.')[0] for syn in wordnet.synsets(word_to_replace) if syn.name().split('.')[0] != word_to_replace]
        if synonyms:
            new_sentence = new_sentence.replace(word_to_replace, synonyms[0], 1)
    return new_sentence

# Augmenting data
df_train_dataset['target'] = df_train_dataset['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
df_train_dataset['anchor'] = df_train_dataset['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['target'] = cross_verify_data['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['anchor'] = cross_verify_data['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cl502_20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data Preprocessing

In [6]:
score_actual = df_train_dataset['score']
score_list = score_actual[1:1001].tolist()
score_train = pd.DataFrame(score_actual[1:1001])
score_train.value_counts()

score
2        17
1        16
0        11
3         4
4         2
Name: count, dtype: int64

In [7]:
df_train_dataset['hypothesis'] = df_train_dataset['anchor']
inputs = df_train_dataset['hypothesis']
hypothesis_list = inputs[1:1001].tolist()
hypothesis_list

['acme position',
 'prolog',
 'encapsulate paint',
 'axile propagation',
 'carburization',
 'opc barrel',
 'wearability',
 'planar gearinginging set',
 'conductor atom',
 'acerb assimilation',
 'selectively predetermine',
 'wall military_military_military_post',
 'tax_tax_tax_return social_organization',
 'transport aside chopine',
 'push fall',
 'radio_receiver wave transmittance',
 'electric newcomer',
 'pneumatic logic',
 'relational recipe',
 'traffic_circle electric',
 'hinge mechanism',
 'angular liaison carriage',
 'run control_condition_condition valve',
 'ammonia_water convalescence',
 'combine with ocular component',
 'good heart',
 'unlike circumferential position',
 'goal of parallel_parallel_parallel_bars',
 'angular contact carriage',
 'boom hydraulic cylinder',
 'rotatable bedroom',
 'good heart',
 'determine substrate',
 'determine substrate',
 'propyl platitude',
 'fabric constitution',
 'chief pulsation laser',
 'important_person',
 'succession conservation',
 'hydroc

In [8]:
premise = df_train_dataset['target']
premise_list = premise[1:1001].tolist()
premise_for_testing = pd.DataFrame(premise_list)
premise_list

['acme plan acme',
 'warhead',
 'encapsulate component',
 'propagation corner',
 'hotness center infection',
 'actuator',
 'absorbent_material_material_material polymer',
 'coplanar stage_stage_set',
 'metallic_element decoration',
 'acidic submergence',
 'selective bias',
 'radio',
 'tax_tax_tax_return argument',
 'transport aside rake circulation',
 'push fall mechanism',
 'watery radio_receiver_receiver wave',
 'electric newcomer drive',
 'pneumatic control_condition_condition_condition device',
 'environment component data',
 'dynamo electric',
 'hinge mechanism',
 'curler carriage',
 'hydraulic aerodynamic_lift',
 'recovery from wound',
 'merge with ocular component',
 'syringe',
 'lapp intervals',
 'dwell of parallel_parallel_parallel_bars',
 'shape angular liaison bearing',
 'air press instrument',
 'chattel enclosure',
 'fracture with telescoping extremity',
 'layer',
 'phonograph_record determine substrate',
 'methyl iodide',
 'process',
 'chief laser',
 'significant fictional

In [9]:
import torch

input_ids = tokenizer(premise_list, hypothesis_list, truncation=True, padding=True, return_tensors="pt")
attention_masks = input_ids["attention_mask"]
labels = torch.tensor(score_list, dtype=torch.long)
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=5)
labels_one_hot = labels_one_hot.float()

Setting Up Data Loaders

In [10]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CustomDataset(input_ids, labels_one_hot)
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)

Training Loop

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./patents-output",
    per_device_train_batch_size = 5,
    num_train_epochs = 1,
    learning_rate = 1e-2,
    save_steps = 500,
    save_total_limit = 2,
    evaluation_strategy = "steps",
    eval_steps = 500,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)

trainer = Trainer(
    model = model, # type: ignore
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

trainer.train()

100%|██████████| 10/10 [00:07<00:00,  1.33it/s]

{'train_runtime': 7.5497, 'train_samples_per_second': 6.623, 'train_steps_per_second': 1.325, 'train_loss': 10.793844604492188, 'epoch': 1.0}





TrainOutput(global_step=10, training_loss=10.793844604492188, metrics={'train_runtime': 7.5497, 'train_samples_per_second': 6.623, 'train_steps_per_second': 1.325, 'train_loss': 10.793844604492188, 'epoch': 1.0})

In [12]:
cross_verify_hypothesis = cross_verify_data['anchor'].tolist()
cross_verify_premise = cross_verify_data['target'].tolist()
cross_verify_score = cross_verify_data['score'].tolist()

In [13]:
cross_verify_input_ids = tokenizer(cross_verify_premise, cross_verify_hypothesis, truncation=True, padding=True, return_tensors="pt")
cross_verify_score_tensor = torch.tensor(cross_verify_score, dtype=torch.long)
cross_verify_labels_one_hot = torch.nn.functional.one_hot(cross_verify_score_tensor, num_classes=5).float()
cross_verify_dataset = CustomDataset(cross_verify_input_ids, cross_verify_labels_one_hot)

In [14]:
import numpy as np

cross_verify_results = trainer.predict(cross_verify_dataset)
cross_verify_predictions = cross_verify_results.predictions
cross_verify_label_ids = cross_verify_results.label_ids

100%|██████████| 456/456 [00:40<00:00, 11.25it/s]


In [15]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore
print("Accuracy:", accuracy_score(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       767
           1       1.00      1.00      1.00      1152
           2       1.00      1.00      1.00      1191
           3       1.00      1.00      1.00       406
           4       1.00      1.00      1.00       132

   micro avg       1.00      1.00      1.00      3648
   macro avg       1.00      1.00      1.00      3648
weighted avg       1.00      1.00      1.00      3648
 samples avg       1.00      1.00      1.00      3648

Accuracy: 1.0


In [16]:
for input, true_label, predicted_label in zip(cross_verify_data[:10], cross_verify_labels_one_hot[:10], cross_verify_label_ids[:10]): # type: ignore
    print(f"Input: {input}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)

Input: id
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------
Input: anchor
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------
Input: target
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: context
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------
Input: score
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------


In [17]:
output_dir = "./patents-output/deberta"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./patents-output/deberta\\tokenizer_config.json',
 './patents-output/deberta\\special_tokens_map.json',
 './patents-output/deberta\\vocab.json',
 './patents-output/deberta\\merges.txt',
 './patents-output/deberta\\added_tokens.json')

In [18]:
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoConfig

config = AutoConfig.from_pretrained(output_dir)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(output_dir, from_pt=True, config=config)
tf_model.save_pretrained(output_dir, saved_model=True)
tf_model.save_weights(output_dir + '/tf_model.h5')

UnpicklingError: invalid load key, '\x18'.

In [None]:
#loaded_model = model.from_pretrained('./patents-output/bart')
#loaded_tokenizer = tokenizer.from_pretrained('./patents-output/bart')

In [None]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the current CUDA device index
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device index: {current_device}")

    # Get the name of the current CUDA device
    current_device_name = torch.cuda.get_device_name(current_device)
    print(f"Current CUDA device name: {current_device_name}")
else:
    print("CUDA is not available.")


Current CUDA device index: 0
Current CUDA device name: NVIDIA GeForce GTX 1660 Ti


In [None]:
from transformers import TFBartForSequenceClassification, BartTokenizer
import tensorflow as tf

# Assign the model and tokenizer
model = TFBartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.config.dropout = 0.5

from tensorflow.keras.layers import Dense

model.classification_head.dense = Dense(model.config.d_model, activation='linear', use_bias=True)
model.classification_head.out_proj = Dense(5, activation='linear', use_bias=True)

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli",  return_tensors="tf")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.decoder.version', 'model.encoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.


In [None]:
loaded_model = model.from_pretrained('C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart') # type: ignore
loaded_tokenizer = tokenizer.from_pretrained('C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart')

Some layers from the model checkpoint at C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart were not used when initializing TFBartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing TFBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBartForSequenceClassification were not initialized from the model checkpoint at C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart and are newly initialized: ['classification_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
premise = "ai powered multi stage adjustment"
hypothesis = "smart water filtration device"
input_ids = tokenizer(premise, hypothesis, truncation=True, padding=True, return_tensors="tf")

In [None]:
outputs = loaded_model(input_ids)
logits = outputs.logits

probabilities = tf.nn.softmax(logits, axis=-1)
predicted_class = tf.argmax(probabilities, axis=-1)
predicted_class

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([2], dtype=int64)>

In [None]:
scores = tf.linspace(1.0, 3.0, num=3)
expected_score = tf.reduce_sum(probabilities * scores, axis=-1)

In [None]:
max_score = 3.0
normalized_score = expected_score / max_score
rounded_score = tf.round(normalized_score * 4) / 4
clamped_score = tf.minimum(rounded_score, tf.constant(1.00))

formatted_output = clamped_score.numpy()
formatted_output_str = ["{:.2f}".format(float(score)) for score in formatted_output]

In [None]:
score_to_label_mapping = {
    0.00: "Very close match",
    0.25: "Close synonym",
    0.50: "Synonyms which don’t have the same meaning (same function, same properties)",
    0.75: "Somewhat related",
    1.00: "Unrelated"
}

# Make sure to convert the numpy array to a float
rounded_score_value = float(rounded_score.numpy()[0])
print(float(normalized_score.numpy()[0]))

# You don't need to format it as a string, use the float value directly for lookup
label = score_to_label_mapping.get(rounded_score_value, "Label not found")
print(label)

0.7048661708831787
Somewhat related
