Import and set-up

In [1]:
import torch
import pandas as pd
from transformers import pipeline
from transformers import BartForSequenceClassification
from transformers import BartTokenizer
from transformers import DataCollatorWithPadding

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.config.dropout = 0.5
model.classification_head.dense = torch.nn.Linear(in_features=model.config.d_model, out_features=model.config.d_model, bias=True)
model.classification_head.out_proj = torch.nn.Linear(in_features=model.config.d_model, out_features=5, bias=True)

Dataset and Feature Engineering

In [2]:
from sklearn.model_selection import train_test_split
train_dataset = pd.DataFrame(pd.read_csv('/Users/aryan/Actual-Coding/CDAC/us-patent-phrase-to-phrase-matching/train.csv'))

In [3]:
score_mapping = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4
}

train_dataset['score'] = train_dataset['score'].map(score_mapping)

In [4]:
df_train_dataset, cross_verify_data = train_test_split(train_dataset, test_size=0.1)  # 10% for cross-verification

In [5]:
import nltk
from nltk.corpus import wordnet
from random import randint

nltk.download('wordnet')

def replace_synonym(sentence, num_replacements=1):
    words = sentence.split()
    new_sentence = sentence
    for _ in range(num_replacements):
        word_to_replace = words[randint(0, len(words)-1)]
        synonyms = [syn.name().split('.')[0] for syn in wordnet.synsets(word_to_replace) if syn.name().split('.')[0] != word_to_replace]
        if synonyms:
            new_sentence = new_sentence.replace(word_to_replace, synonyms[0], 1)
    return new_sentence

# Augmenting data
df_train_dataset['target'] = df_train_dataset['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
df_train_dataset['anchor'] = df_train_dataset['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['target'] = cross_verify_data['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['anchor'] = cross_verify_data['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
score_actual = df_train_dataset['score']
score_list = score_actual[1:51].tolist()
score_train = pd.DataFrame(score_actual[1:51])

In [7]:
score_train.value_counts()

score
2        21
1        17
0         8
3         3
4         1
dtype: int64

In [8]:
df_train_dataset.head(10)

Unnamed: 0,id,anchor,target,context,score
15255,3792820488d01427,learn care parameter,learn music,H04,0
17272,aebda813e7f88214,complect lever,employment aircraft_aircraft_aircraft_carrier,F16,1
30425,e01731d390bb9846,transformation connection,temperature transformation,A47,0
11162,781b2819ec953e95,acme position,acme design acme,B66,2
15580,007d8b9272c77ad3,senior_senior_high_school_school gradient char...,toe centrifuge,B03,0
268,7c0ead66438745af,acerb assimilation,acerb,B01,2
13579,95adeb55a1ca9876,magnetic_field governor,aide control_condition_condition cringle,H01,2
16909,d558f6327da12c15,inorganic loanblend,luminosity emit,B32,1
29355,26c3c6dc6174b589,sealing_waxing_waxing_wax dentition,dentition whiten,F01,0
20647,36b3f277c9861282,motion to scope,pry,F15,2


Data Preprocessing

In [9]:
df_train_dataset['hypothesis'] = df_train_dataset['anchor']
inputs = df_train_dataset['hypothesis']
hypothesis_list = inputs[1:51].tolist()

In [10]:
premise = df_train_dataset['target']
premise_list = premise[1:51].tolist()
premise_for_testing = pd.DataFrame(premise_list)

In [11]:
import torch

input_ids = tokenizer(premise_list, hypothesis_list, truncation=True, padding=True, return_tensors="pt")
attention_masks = input_ids["attention_mask"]
labels = torch.tensor(score_list, dtype=torch.long)
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=5)
labels_one_hot = labels_one_hot.float()

Setting Up Data Loaders

In [12]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CustomDataset(input_ids, labels_one_hot)
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)

Training Loop

In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./patents-output",
    per_device_train_batch_size = 5,
    num_train_epochs = 1,
    learning_rate = 1e-2,
    save_steps = 500,
    save_total_limit = 2,
    evaluation_strategy = "steps",
    eval_steps = 500,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)

trainer = Trainer(
    model = model, # type: ignore
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

trainer.train()



  0%|          | 0/10 [00:00<?, ?it/s]

{'train_runtime': 26.1537, 'train_samples_per_second': 1.912, 'train_steps_per_second': 0.382, 'train_loss': 0.9305903434753418, 'epoch': 1.0}


TrainOutput(global_step=10, training_loss=0.9305903434753418, metrics={'train_runtime': 26.1537, 'train_samples_per_second': 1.912, 'train_steps_per_second': 0.382, 'train_loss': 0.9305903434753418, 'epoch': 1.0})

In [14]:
cross_verify_hypothesis = cross_verify_data['anchor'].tolist()
cross_verify_premise = cross_verify_data['target'].tolist()
cross_verify_score = cross_verify_data['score'].tolist()

In [15]:
cross_verify_input_ids = tokenizer(cross_verify_premise, cross_verify_hypothesis, truncation=True, padding=True, return_tensors="pt")
cross_verify_score_tensor = torch.tensor(cross_verify_score, dtype=torch.long)
cross_verify_labels_one_hot = torch.nn.functional.one_hot(cross_verify_score_tensor, num_classes=5).float()
cross_verify_dataset = CustomDataset(cross_verify_input_ids, cross_verify_labels_one_hot)

In [16]:
import numpy as np

cross_verify_results = trainer.predict(cross_verify_dataset)
cross_verify_predictions = cross_verify_results.predictions
cross_verify_label_ids = cross_verify_results.label_ids

  0%|          | 0/456 [00:00<?, ?it/s]

In [17]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore
print("Accuracy:", accuracy_score(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       755
           1       1.00      1.00      1.00      1127
           2       1.00      1.00      1.00      1273
           3       1.00      1.00      1.00       382
           4       1.00      1.00      1.00       111

   micro avg       1.00      1.00      1.00      3648
   macro avg       1.00      1.00      1.00      3648
weighted avg       1.00      1.00      1.00      3648
 samples avg       1.00      1.00      1.00      3648

Accuracy: 1.0


In [None]:
for input, true_label, predicted_label in zip(cross_verify_data[:10], cross_verify_labels_one_hot[:10], cross_verify_label_ids[:10]): # type: ignore
    print(f"Input: {input}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)

Input: id
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------
Input: anchor
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: target
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: context
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: score
True Label: tensor([0., 0., 0., 0., 1.])
Predicted Label: [0. 0. 0. 0. 1.]
--------------------------------------------------


In [22]:
output_dir = "./patents-output/bart"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

RuntimeError: [enforce fail at alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 205885440 bytes.

: 

In [None]:
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoConfig

config = AutoConfig.from_pretrained(output_dir)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(output_dir, from_pt=True, config=config)
tf_model.save_pretrained(output_dir, saved_model=True)
tf_model.save_weights(output_dir + '/tf_model.h5')

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\aryan\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\aryan\AppData\Local\Temp\ipykernel_12212\4188270300.py", line 5, in <module>
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(output_dir, from_pt=True, config=config)
  File "c:\Users\aryan\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\auto\auto_factory.py", line 493, in from_pretrained
    return model_class.from_pretrained(
  File "c:\Users\aryan\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 2880, in from_pretrained
    return load_pytorch_checkpoint_in_tf2_model(
  File "c:\Users\aryan\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_pytorch_utils.py", line 185, in load_pytorch_checkpoint_in_tf2_model
    pt_st

In [None]:
#loaded_model = model.from_pretrained('./patents-output/deberta')
#loaded_tokenizer = tokenizer.from_pretrained('./patents-output/deberta')

In [None]:
model.save(output_dir)

import json
tokenizer_json = tokenizer.to_json()
with open(output_dir + "/tokenizer.json", 'w') as json_file:
    json.dump(tokenizer_json, json_file)

AttributeError: 'BartForSequenceClassification' object has no attribute 'save'

In [None]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the current CUDA device index
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device index: {current_device}")

    # Get the name of the current CUDA device
    current_device_name = torch.cuda.get_device_name(current_device)
    print(f"Current CUDA device name: {current_device_name}")
else:
    print("CUDA is not available.")


Current CUDA device index: 0
Current CUDA device name: NVIDIA GeForce GTX 1660 Ti


In [None]:
from transformers import TFBartForSequenceClassification, BartTokenizer
import tensorflow as tf

# Assign the model and tokenizer
model = TFBartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.config.dropout = 0.5

from tensorflow.keras.layers import Dense

model.classification_head.dense = Dense(model.config.d_model, activation='linear', use_bias=True)
model.classification_head.out_proj = Dense(5, activation='linear', use_bias=True)

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli",  return_tensors="tf")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.decoder.version', 'model.encoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.


In [None]:
loaded_model = model.from_pretrained('C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart') # type: ignore
loaded_tokenizer = tokenizer.from_pretrained('C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart')

Some layers from the model checkpoint at C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart were not used when initializing TFBartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing TFBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBartForSequenceClassification were not initialized from the model checkpoint at C:/Users/aryan/Actual-Coding/CDAC/patents-output/bart and are newly initialized: ['classification_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
premise = "ai powered multi stage adjustment"
hypothesis = "smart water filtration device"
input_ids = tokenizer(premise, hypothesis, truncation=True, padding=True, return_tensors="tf")

In [None]:
outputs = loaded_model(input_ids)
logits = outputs.logits

probabilities = tf.nn.softmax(logits, axis=-1)
predicted_class = tf.argmax(probabilities, axis=-1)
predicted_class

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([2], dtype=int64)>

In [None]:
scores = tf.linspace(1.0, 3.0, num=3)
expected_score = tf.reduce_sum(probabilities * scores, axis=-1)

In [None]:
max_score = 3.0
normalized_score = expected_score / max_score
rounded_score = tf.round(normalized_score * 4) / 4
clamped_score = tf.minimum(rounded_score, tf.constant(1.00))

formatted_output = clamped_score.numpy()
formatted_output_str = ["{:.2f}".format(float(score)) for score in formatted_output]

In [None]:
score_to_label_mapping = {
    0.00: "Very close match",
    0.25: "Close synonym",
    0.50: "Synonyms which don’t have the same meaning (same function, same properties)",
    0.75: "Somewhat related",
    1.00: "Unrelated"
}

# Make sure to convert the numpy array to a float
rounded_score_value = float(rounded_score.numpy()[0])
print(float(normalized_score.numpy()[0]))

# You don't need to format it as a string, use the float value directly for lookup
label = score_to_label_mapping.get(rounded_score_value, "Label not found")
print(label)

0.9221207499504089
Unrelated


: 