In [1]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the current CUDA device index
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device index: {current_device}")

    # Get the name of the current CUDA device
    current_device_name = torch.cuda.get_device_name(current_device)
    print(f"Current CUDA device name: {current_device_name}")
else:
    print("CUDA is not available.")

Current CUDA device index: 0
Current CUDA device name: NVIDIA RTX A4000


Import and set-up

In [2]:
import tensorflow as tf
print(tf.__version__)
print(tf.data)  # This should not raise any error if TensorFlow is correctly installed

2.10.1
<module 'tensorflow._api.v2.data' from 'c:\\Users\\cl502_20\\Downloads\\Vishrut Aryan\\tf-venv\\lib\\site-packages\\tensorflow\\_api\\v2\\data\\__init__.py'>


In [3]:
import torch
import pandas as pd
from transformers import pipeline
from transformers import BartForSequenceClassification
from transformers import BartTokenizer
from transformers import DataCollatorWithPadding

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.config.dropout = 0.5
model.classification_head.dense = torch.nn.Linear(in_features=model.config.d_model, out_features=model.config.d_model, bias=True)
model.classification_head.out_proj = torch.nn.Linear(in_features=model.config.d_model, out_features=5, bias=True)

Dataset and Feature Engineering

In [4]:
from sklearn.model_selection import train_test_split
train_dataset = pd.DataFrame(pd.read_csv('train_patents.csv'))

In [5]:
score_mapping = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4
}

train_dataset['score'] = train_dataset['score'].map(score_mapping)

In [6]:
df_train_dataset, cross_verify_data = train_test_split(train_dataset, test_size=0.4)  # 10% for cross-verification

In [7]:
import nltk
from nltk.corpus import wordnet
from random import randint

nltk.download('wordnet')

def replace_synonym(sentence, num_replacements=1):
    words = sentence.split()
    new_sentence = sentence
    for _ in range(num_replacements):
        word_to_replace = words[randint(0, len(words)-1)]
        synonyms = [syn.name().split('.')[0] for syn in wordnet.synsets(word_to_replace) if syn.name().split('.')[0] != word_to_replace]
        if synonyms:
            new_sentence = new_sentence.replace(word_to_replace, synonyms[0], 1)
    return new_sentence

# Augmenting data
df_train_dataset['target'] = df_train_dataset['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
df_train_dataset['anchor'] = df_train_dataset['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['target'] = cross_verify_data['target'].apply(lambda x: replace_synonym(x, num_replacements=5))
cross_verify_data['anchor'] = cross_verify_data['anchor'].apply(lambda x: replace_synonym(x, num_replacements=5))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cl502_20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
score_actual = df_train_dataset['score']
score_list = score_actual[1:10001].tolist()
eval_score_list = score_actual[10001:12001].tolist()
score_train = pd.DataFrame(score_actual[1:10001])

In [9]:
score_train.value_counts()

score
2        3284
1        3226
0        2038
3        1134
4         318
Name: count, dtype: int64

In [10]:
df_train_dataset.head(10)

Unnamed: 0,id,anchor,target,context,score
15837,058ede162eeabb86,loanblend arrangement,security arrangement,C07,0
3115,075aa525d34a1e42,barrage cellular_telephoneular_telephoneular_t...,electromechanical apparatus,F28,2
9242,748a3d3c9a7fae14,dimensional placement,software mise_en_scene,E02,0
19425,2573613d8f2dca98,fabric constitution,component constitution,A21,3
35050,6af7ba75cc53d0e4,vibratory actuator,vibratory stadium,A61,0
24457,93f73962920dc83b,organ_pipe corner,hosiery organ_pipe covering,C21,2
12637,e41129c3173b73e0,run control_condition valve,stream complete valve,F25,2
32303,6f743cd5e277e359,acme,crest extremum,B65,3
21344,c1945e591ab7177e,nvm range,memory range,G11,2
32893,0d18b7867c0e8de0,terephthalate polyester,naphthalate,D06,2


Data Preprocessing

In [11]:
df_train_dataset['hypothesis'] = df_train_dataset['anchor']
inputs = df_train_dataset['hypothesis']
hypothesis_list = inputs[1:10001].tolist()
eval_hypothesis_list = inputs[10001:12001].tolist()

In [12]:
premise = df_train_dataset['target']
premise_list = premise[1:10001].tolist()
eval_premise_list = premise[10001:12001].tolist()

In [13]:
import torch

input_ids = tokenizer(premise_list, hypothesis_list, truncation=True, padding=True, return_tensors="pt")
attention_masks = input_ids["attention_mask"]
labels = torch.tensor(score_list, dtype=torch.long)
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=5)
labels_one_hot = labels_one_hot.float()

Setting Up Data Loaders

In [14]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encoded_texts, attention_masks, labels):
        self.encoded_texts = encoded_texts
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item["attention_mask"] = self.attention_masks[idx]
        item["labels"] = self.labels[idx]
        return item

train_dataset = CustomDataset(input_ids, attention_masks, labels_one_hot)
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)

In [17]:
eval_input_ids = tokenizer(eval_premise_list, eval_hypothesis_list, truncation=True, padding=True, return_tensors="pt")
eval_attention_masks = eval_input_ids["attention_mask"]
eval_labels = torch.tensor(eval_score_list, dtype=torch.long)
eval_labels_one_hot = torch.nn.functional.one_hot(eval_labels, num_classes=5)
eval_labels_one_hot = eval_labels_one_hot.float()

eval_dataset = CustomDataset(eval_input_ids, eval_attention_masks, eval_labels_one_hot)
eval_dataloader = DataLoader(eval_dataset, batch_size=5, shuffle=True)

Training Loop

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./patents-output",
    per_device_train_batch_size = 5,
    num_train_epochs = 3,
    learning_rate = 1e-2,
    save_steps = 500,
    save_total_limit = 2,
    evaluation_strategy = "steps",
    eval_steps = 500,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)

trainer = Trainer(
    model = model, # type: ignore
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    #eval_dataloader = eval_dataloader,
)

trainer.train()

  0%|          | 0/6000 [00:00<?, ?it/s]

In [None]:
cross_verify_hypothesis = cross_verify_data['anchor'].tolist()
cross_verify_premise = cross_verify_data['target'].tolist()
cross_verify_score = cross_verify_data['score'].tolist()

In [None]:
cross_verify_input_ids = tokenizer(cross_verify_premise, cross_verify_hypothesis, truncation=True, padding=True, return_tensors="pt")
cross_verify_score_tensor = torch.tensor(cross_verify_score, dtype=torch.long)
cross_verify_labels_one_hot = torch.nn.functional.one_hot(cross_verify_score_tensor, num_classes=5).float()
cross_verify_dataset = CustomDataset(cross_verify_input_ids, cross_verify_labels_one_hot)

In [None]:
import numpy as np

cross_verify_results = trainer.predict(cross_verify_dataset)
cross_verify_predictions = cross_verify_results.predictions
cross_verify_label_ids = cross_verify_results.label_ids

  0%|          | 0/456 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore
print("Accuracy:", accuracy_score(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       769
           1       1.00      1.00      1.00      1154
           2       1.00      1.00      1.00      1209
           3       1.00      1.00      1.00       397
           4       1.00      1.00      1.00       119

   micro avg       1.00      1.00      1.00      3648
   macro avg       1.00      1.00      1.00      3648
weighted avg       1.00      1.00      1.00      3648
 samples avg       1.00      1.00      1.00      3648

Accuracy: 1.0


In [None]:
for input, true_label, predicted_label in zip(cross_verify_data[:10], cross_verify_labels_one_hot[:10], cross_verify_label_ids[:10]): # type: ignore
    print(f"Input: {input}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)

Input: id
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: anchor
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: target
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: context
True Label: tensor([0., 0., 1., 0., 0.])
Predicted Label: [0. 0. 1. 0. 0.]
--------------------------------------------------
Input: score
True Label: tensor([0., 0., 0., 1., 0.])
Predicted Label: [0. 0. 0. 1. 0.]
--------------------------------------------------


In [None]:
output_dir = "./patents-output/bart"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
model.save_weights(output_dir + '/bart_model.h5')

AttributeError: 'BartForSequenceClassification' object has no attribute 'save_weights'

In [None]:
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoConfig

config = AutoConfig.from_pretrained(output_dir)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(output_dir, from_pt=True, config=config)
tf_model.save_pretrained(output_dir, saved_model=True, use_auth_token=False)
tf_model.save_weights(output_dir + '/tf_bart_model.h5')

UnpicklingError: invalid load key, '\x10'.

In [None]:
#loaded_model = model.from_pretrained('./patents-output/deberta')
#loaded_tokenizer = tokenizer.from_pretrained('./patents-output/deberta')

In [None]:
model.save(output_dir)

import json
tokenizer_json = tokenizer.to_json()
with open(output_dir + "/tokenizer.json", 'w') as json_file:
    json.dump(tokenizer_json, json_file)



INFO:tensorflow:Assets written to: ./patents-output/bart\assets


INFO:tensorflow:Assets written to: ./patents-output/bart\assets


AttributeError: 'BartTokenizer' object has no attribute 'to_json'

In [None]:
from transformers import TFBartForSequenceClassification, BartTokenizer
import tensorflow as tf

# Initialize the model
model = TFBartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.config.dropout = 0.5

from tensorflow.keras.layers import Dense
model.classification_head.dense = Dense(model.config.d_model, activation='linear', use_bias=True)
model.classification_head.out_proj = Dense(5, activation='linear', use_bias=True)

# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.


In [None]:
loaded_model = model.from_pretrained(output_dir) # type: ignore
loaded_tokenizer = tokenizer.from_pretrained(output_dir)

TypeError: __init__() missing 2 required positional arguments: 'op' and 'message'

In [None]:
premise = "ai powered multi stage adjustment"
hypothesis = "smart water filtration device"
input_ids = tokenizer(premise, hypothesis, truncation=True, padding=True, return_tensors="tf")

In [None]:
outputs = loaded_model(input_ids)
logits = outputs.logits

probabilities = tf.nn.softmax(logits, axis=-1)
predicted_class = tf.argmax(probabilities, axis=-1)
predicted_class

NameError: name 'loaded_model' is not defined

In [None]:
scores = tf.linspace(1.0, 3.0, num=3)
expected_score = tf.reduce_sum(probabilities * scores, axis=-1)

In [None]:
max_score = 3.0
normalized_score = expected_score / max_score
rounded_score = tf.round(normalized_score * 4) / 4
clamped_score = tf.minimum(rounded_score, tf.constant(1.00))

formatted_output = clamped_score.numpy()
formatted_output_str = ["{:.2f}".format(float(score)) for score in formatted_output]

In [None]:
score_to_label_mapping = {
    0.00: "Very close match",
    0.25: "Close synonym",
    0.50: "Synonyms which don’t have the same meaning (same function, same properties)",
    0.75: "Somewhat related",
    1.00: "Unrelated"
}

# Make sure to convert the numpy array to a float
rounded_score_value = float(rounded_score.numpy()[0])
print(float(normalized_score.numpy()[0]))

# You don't need to format it as a string, use the float value directly for lookup
label = score_to_label_mapping.get(rounded_score_value, "Label not found")
print(label)

0.9221207499504089
Unrelated


: 