Import and set-up

In [1]:
import torch
import pandas as pd
from transformers import pipeline
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import DataCollatorWithPadding

tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-large", num_labels=5)

hidden_size = model.config.hidden_size
model.classifier = torch.nn.Linear(hidden_size, 5)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset and Feature Engineering

In [2]:
from sklearn.model_selection import train_test_split

train_dataset = pd.DataFrame(pd.read_csv('/Users/aryan/Actual-Coding/CDAC/us-patent-phrase-to-phrase-matching/train.csv'))
df_train_dataset, cross_verify_data = train_test_split(train_dataset, test_size=0.1)  # 10% for cross-verification

score_actual = df_train_dataset['score']
score_list = score_actual[1:501].tolist()
score_train = pd.DataFrame(score_actual[1:501])
score_train

Unnamed: 0,score
3629,0.25
15839,0.25
12526,0.50
3058,0.50
12743,0.50
...,...
4050,0.25
5054,0.50
27125,0.50
14729,0.50


In [3]:
score_mapping = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4
}

score_train['score'] = score_train['score'].map(score_mapping)
score_list = [score_mapping[score] for score in score_list]

Data Preprocessing

In [4]:
df_train_dataset['hypothesis'] = df_train_dataset['anchor']
inputs = df_train_dataset['hypothesis']
hypothesis_list = inputs[1:501].tolist()
hypothesis_list

['board id',
 'hybrid system',
 'faucet assembly',
 'based propellant',
 'fence post',
 'mat sections',
 'filled interior',
 'radio wave transmission',
 'page file',
 'antiatherosclerotic',
 'form tables',
 'sustained delivery',
 'speed control means',
 'overall weight',
 'oil tankers',
 'rotatable chamber',
 'lower trunnion',
 'silicide formation',
 'pneumatic logic',
 'tap portion',
 'membrane vesicle',
 'protocol component',
 'el display',
 'overflow compartment',
 'polymeric ester',
 'speed control means',
 'overall weight',
 'generated electrical power',
 'committee',
 'shaped substrates',
 'lower stretches',
 'hybrid system',
 'source channel',
 'membrane vesicle',
 'display object',
 'use solid materials',
 'friction lock',
 'inner contact',
 'based writing',
 'kraft cooking',
 'connecting lines',
 'decoy oligonucleotide',
 'carpet tiles',
 'run during interval',
 'cement composite',
 'fire ring',
 'perfluoroalkyl group',
 'interconnected levers',
 'electromagnetic input',
 'che

In [5]:
premise = df_train_dataset['target']
premise_list = premise[1:501].tolist()
premise_for_testing = pd.DataFrame(premise_list)
premise_list

['sequencing code',
 'selection system',
 'mixing faucet',
 'propellant',
 'support structure',
 'multiple mat sections',
 'filled internal',
 'microwave',
 'fast space',
 'metal deposition',
 'light',
 'novel drug delivery',
 'control system',
 'weight total weight',
 'hair oil composition',
 'rotatable housing',
 'lower trunnion arm',
 'silicon compound',
 'pneumatic multiplexer',
 'channel regions',
 'mitochondrial',
 'secure protocols',
 'liquid el display',
 'overflow dam',
 'polyethylene glycol succinate',
 'switch means',
 'composition',
 'driving electrical generators',
 'research',
 'shape materials',
 'top stretches',
 'phase',
 'source conduit',
 'vesicle',
 'exhibit object',
 'liquid materials can be used',
 'prevent movement lock',
 'contact lenses',
 'compression',
 'alkaline processes',
 'electrical connection cables',
 'decoy system',
 'carpet formaldehyde',
 'operate during run',
 'composite theory',
 'chain design',
 'hydroxyl',
 'connected levers',
 'filtered lower f

In [6]:
import torch

input_ids = tokenizer(premise_list, hypothesis_list, truncation=True, padding=True, return_tensors="pt")
labels = torch.tensor(score_list, dtype=torch.long)
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=5)
labels_one_hot = labels_one_hot.float()

Setting Up Data Loaders

In [7]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CustomDataset(input_ids, labels_one_hot)
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)

Training Loop

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./patents-output",
    per_device_train_batch_size = 5,
    num_train_epochs = 10,
    learning_rate = 1e-4,
    save_steps = 10_000,
    save_total_limit = 2,
    evaluation_strategy = "steps",
    eval_steps = 10_000,
)

trainer = Trainer(
    model = model, # type: ignore
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

trainer.train()



  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 1.4292, 'learning_rate': 5e-05, 'epoch': 5.0}
{'loss': 1.3933, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 1334.0109, 'train_samples_per_second': 3.748, 'train_steps_per_second': 0.75, 'train_loss': 1.4112182006835938, 'epoch': 10.0}


TrainOutput(global_step=1000, training_loss=1.4112182006835938, metrics={'train_runtime': 1334.0109, 'train_samples_per_second': 3.748, 'train_steps_per_second': 0.75, 'train_loss': 1.4112182006835938, 'epoch': 10.0})

In [10]:
cross_verify_hypothesis = cross_verify_data['anchor'].tolist()
cross_verify_premise = cross_verify_data['target'].tolist()
cross_verify_score = cross_verify_data['score'].tolist()

cross_verify_input_ids = tokenizer(cross_verify_premise, cross_verify_hypothesis, truncation=True, padding=True, return_tensors="pt")
cross_verify_labels = torch.tensor(cross_verify_score, dtype=torch.long)
cross_verify_labels_one_hot = torch.nn.functional.one_hot(cross_verify_labels, num_classes=5).float()
cross_verify_dataset = CustomDataset(cross_verify_input_ids, cross_verify_labels_one_hot)

  cross_verify_labels = torch.tensor(cross_verify_score, dtype=torch.long)


In [11]:
cross_verify_labels_one_hot

tensor([[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        ...,
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]])

In [12]:
import numpy as np

cross_verify_results = trainer.predict(cross_verify_dataset)
cross_verify_predictions = cross_verify_results.predictions
cross_verify_label_ids = cross_verify_results.label_ids

  0%|          | 0/456 [00:00<?, ?it/s]

In [13]:
cross_verify_label_ids

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [14]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore
print("Accuracy:", accuracy_score(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3530
           1       1.00      1.00      1.00       118
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

   micro avg       1.00      1.00      1.00      3648
   macro avg       0.40      0.40      0.40      3648
weighted avg       1.00      1.00      1.00      3648
 samples avg       1.00      1.00      1.00      3648

Accuracy: 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
for input, true_label, predicted_label in zip(cross_verify_data[:10], cross_verify_labels_one_hot[:10], cross_verify_label_ids[:10]): # type: ignore
    print(f"Input: {input}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)

Input: id
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: anchor
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: target
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: context
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: score
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------


In [16]:
model.type

<bound method Module.type of DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()


In [17]:
output_dir = "./patents-output/deberta"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./patents-output/deberta\\tokenizer_config.json',
 './patents-output/deberta\\special_tokens_map.json',
 './patents-output/deberta\\vocab.json',
 './patents-output/deberta\\merges.txt',
 './patents-output/deberta\\added_tokens.json')

In [18]:
loaded_model = model.from_pretrained('./patents-output/bart')
loaded_tokenizer = tokenizer.from_pretrained('./patents-output/bart')

OSError: ./patents-output/bart does not appear to have a file named config.json. Checkout 'https://huggingface.co/./patents-output/bart/main' for available files.

In [None]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the current CUDA device index
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device index: {current_device}")

    # Get the name of the current CUDA device
    current_device_name = torch.cuda.get_device_name(current_device)
    print(f"Current CUDA device name: {current_device_name}")
else:
    print("CUDA is not available.")


Current CUDA device index: 0
Current CUDA device name: NVIDIA GeForce GTX 1660 Ti
