Import and set-up

In [1]:
import torch
import pandas as pd
from transformers import pipeline
from transformers import BartForSequenceClassification
from transformers import BartTokenizer
from transformers import DataCollatorWithPadding

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

model.classification_head.dense = torch.nn.Linear(in_features=model.config.d_model, out_features=model.config.d_model, bias=True)
model.classification_head.out_proj = torch.nn.Linear(in_features=model.config.d_model, out_features=5, bias=True)

Dataset and Feature Engineering

In [2]:
from sklearn.model_selection import train_test_split

train_dataset = pd.DataFrame(pd.read_csv('/Users/aryan/Actual-Coding/CDAC/us-patent-phrase-to-phrase-matching/train.csv'))
df_train_dataset, cross_verify_data = train_test_split(train_dataset, test_size=0.1)  # 10% for cross-verification

score_actual = df_train_dataset['score']
score_list = score_actual[1:501].tolist()
score_train = pd.DataFrame(score_actual[1:501])
score_train

Unnamed: 0,score
35087,0.50
5862,0.00
16107,0.50
12305,0.25
27328,1.00
...,...
32299,0.25
8927,0.25
32766,0.50
34848,0.50


In [3]:
score_train.value_counts()

score
0.25     169
0.50     147
0.00      90
0.75      74
1.00      20
dtype: int64

In [4]:
score_mapping = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4
}

score_train['score'] = score_train['score'].map(score_mapping)
score_list = [score_mapping[score] for score in score_list]

In [5]:
df_train_dataset.head(10)

Unnamed: 0,id,anchor,target,context,score
21174,afcb977f6e3755f0,non polar carrier,solvent,C09,0.25
35087,df185da5953534e6,vibratory actuator,vibrating linear actuator,H02,0.5
5862,9ed9c43c856f78f9,cochineal,sulphuric acid,C13,0.0
16107,129d4181cb4e3029,imaging axis,axial,A61,0.5
12305,3e35e67b2eeb56fe,extracting process,process to incorporate,C13,0.25
27328,4b6e7138c56d7a99,reflection type liquid crystal display,total reflection type liquid crystal display p...,G02,1.0
7937,93ecd935aef85f41,coupling arms,inputs,H03,0.25
31526,25feb14b476bdabd,split into flows,flow system,F16,0.5
3663,fa615a76f3b65cca,boom hydraulic cylinder,boom,E02,0.5
10929,e07ae08a4c9413c7,electric starter,externally driven starter,F16,0.5


Data Preprocessing

In [6]:
df_train_dataset['hypothesis'] = df_train_dataset['anchor']
inputs = df_train_dataset['hypothesis']
hypothesis_list = inputs[1:501].tolist()
hypothesis_list

['vibratory actuator',
 'cochineal',
 'imaging axis',
 'extracting process',
 'reflection type liquid crystal display',
 'coupling arms',
 'split into flows',
 'boom hydraulic cylinder',
 'electric starter',
 'lifting finger',
 'fan',
 'pipe box',
 'slot open',
 'alumino silicates',
 'coaxial cable transmission',
 'duplex device',
 'elevation view',
 'pulsed plasma',
 'hybrid system',
 'biocytin',
 'hardware blocks',
 'cochineal',
 'apply to muscle',
 'stationary rod',
 'opening assembly',
 'pendent',
 'rhodium carbonyl',
 'elastic assembly',
 'stepped pin',
 'microchambers',
 'display object',
 'pressure signal',
 'different circumferential positions',
 'morpholin',
 'material formation',
 'transmit over interface',
 'frame handle',
 'insulation sleeve',
 'sensitive photographic',
 'electric starter',
 'polls',
 'signal sender',
 'protograph',
 'board id',
 'fence post',
 'sustained delivery',
 'photocleavable linker',
 'magnetically actuated',
 'oxidizing enzyme',
 'disperse in plast

In [7]:
premise = df_train_dataset['target']
premise_list = premise[1:501].tolist()
premise_for_testing = pd.DataFrame(premise_list)
premise_list

['vibrating linear actuator',
 'sulphuric acid',
 'axial',
 'process to incorporate',
 'total reflection type liquid crystal display panel',
 'inputs',
 'flow system',
 'boom',
 'externally driven starter',
 'crop lifting fingers',
 'a device with rotating blades',
 'smoking pipe',
 'open ended slot',
 'zeolite',
 'arrange in coaxial relationship',
 'full duplex network',
 'transister',
 'pulse oximeter',
 'artificially constructed genetic system',
 'biotin',
 'blockchain',
 'food colouring',
 'apply to smooth muscle',
 'stationary car',
 'worm gear',
 'pendent functional',
 'functional group',
 'tension',
 'stepped pin',
 'microvessels',
 'tv',
 'magnetic',
 'different circumferential',
 'chemical',
 'rolling',
 'user interface',
 'saw handle',
 'outer metal shield',
 'sensitive environment',
 'device to rotate an engine crank',
 'security response action',
 'dispatch signal',
 'pay graph',
 'thickness image',
 'speed post tracking',
 'prolonged drug delivery',
 'noncovalent approach'

In [8]:
import torch

input_ids = tokenizer(premise_list, hypothesis_list, truncation=True, padding=True, return_tensors="pt")
labels = torch.tensor(score_list, dtype=torch.long)
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=5)
labels_one_hot = labels_one_hot.float()

Setting Up Data Loaders

In [9]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CustomDataset(input_ids, labels_one_hot)
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)

Training Loop

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./patents-output",
    per_device_train_batch_size = 5,
    num_train_epochs = 10,
    learning_rate = 1e-4,
    save_steps = 10_000,
    save_total_limit = 2,
    evaluation_strategy = "steps",
    eval_steps = 10_000,
)

trainer = Trainer(
    model = model, # type: ignore
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

trainer.train()



  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
cross_verify_hypothesis = cross_verify_data['anchor'].tolist()
cross_verify_premise = cross_verify_data['target'].tolist()
cross_verify_score = cross_verify_data['score'].tolist()

cross_verify_input_ids = tokenizer(cross_verify_premise, cross_verify_hypothesis, truncation=True, padding=True, return_tensors="pt")
cross_verify_labels = torch.tensor(cross_verify_score, dtype=torch.long)
cross_verify_labels_one_hot = torch.nn.functional.one_hot(cross_verify_labels, num_classes=5).float()
cross_verify_dataset = CustomDataset(cross_verify_input_ids, cross_verify_labels_one_hot)

  cross_verify_labels = torch.tensor(cross_verify_score, dtype=torch.long)


In [None]:
cross_verify_labels_one_hot

tensor([[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        ...,
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])

In [None]:
import numpy as np

cross_verify_results = trainer.predict(cross_verify_dataset)
cross_verify_predictions = cross_verify_results.predictions
cross_verify_label_ids = cross_verify_results.label_ids

  0%|          | 0/456 [00:00<?, ?it/s]

In [None]:
cross_verify_label_ids

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [None]:
predicted_class_indices = [np.argmax(row) for row in cross_verify_predictions]

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore
print("Accuracy:", accuracy_score(cross_verify_labels_one_hot, cross_verify_label_ids)) # type: ignore

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3561
           1       1.00      1.00      1.00        87
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

   micro avg       1.00      1.00      1.00      3648
   macro avg       0.40      0.40      0.40      3648
weighted avg       1.00      1.00      1.00      3648
 samples avg       1.00      1.00      1.00      3648

Accuracy: 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
for input, true_label, predicted_label in zip(cross_verify_data[:10], cross_verify_labels_one_hot[:10], cross_verify_label_ids[:10]): # type: ignore
    print(f"Input: {input}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)

Input: id
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: anchor
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: target
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------
Input: context
True Label: tensor([0., 1., 0., 0., 0.])
Predicted Label: [0. 1. 0. 0. 0.]
--------------------------------------------------
Input: score
True Label: tensor([1., 0., 0., 0., 0.])
Predicted Label: [1. 0. 0. 0. 0.]
--------------------------------------------------


In [None]:
model.type

<bound method Module.type of BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      

In [None]:
output_dir = "./patents-output/bart"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./patents-output/bart\\tokenizer_config.json',
 './patents-output/bart\\special_tokens_map.json',
 './patents-output/bart\\vocab.json',
 './patents-output/bart\\merges.txt',
 './patents-output/bart\\added_tokens.json')

In [None]:
loaded_model = model.from_pretrained('./patents-output/deberta')
loaded_tokenizer = tokenizer.from_pretrained('./patents-output/deberta')

You are using a model of type deberta to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at ./patents-output/deberta and are newly initialized: ['encoder.layers.2.fc2.weight', 'encoder.layers.8.self_attn_layer_norm.bias', 'decoder.layers.9.self_attn.v_proj.weight', 'decoder.layers.4.self_attn.out_proj.bias', 'encoder.layers.10.fc1.weight', 'encoder.layers.7.self_attn.q_proj.bias', 'encoder.layers.1.self_attn.v_proj.weight', 'decoder.layers.10.self_attn.q_proj.bias', 'decoder.layers.0.fc2.weight', 'decoder.embed_positions.weight', 'decoder.layers.6.encoder_attn.v_proj.bias', 'decoder.layers.9.self_attn.v_proj.bias', 'encoder.layers.2.self_attn.k_proj.weight', 'decoder.layers.7.fc1.bias', 'decoder.layers.4.fc2.weight', 'decoder.layers.10.encoder_attn.q_proj.bias', 'encoder.layers.6.self_attn.k_proj.bias', 'classification_head.dense.we

In [10]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the current CUDA device index
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device index: {current_device}")

    # Get the name of the current CUDA device
    current_device_name = torch.cuda.get_device_name(current_device)
    print(f"Current CUDA device name: {current_device_name}")
else:
    print("CUDA is not available.")


Current CUDA device index: 0
Current CUDA device name: NVIDIA GeForce GTX 1660 Ti


In [None]:
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np

app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict(input_text):
    inputs = loaded_tokenizer(input_text, return_tensors="tf", padding=True, truncation=True, max_length=128)
    prediction = loaded_model.predict(inputs) # type: ignore

    return jsonify({"prediction": prediction.tolist()})

if __name__ == "__main__":
    app.run(debug=True)