In [2]:
!pip install -q transformers datasets torch scikit-learn pandas numpy

In [3]:
import re
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
dataset = load_dataset(
    "json",
    data_files="CUAD_v1.json",
    field="data"
)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 510
    })
})

In [None]:
samples = []

for contract in dataset["train"]:
    for para in contract["paragraphs"]:
        context = para["context"]

        for qa in para["qas"]:
            question = qa["question"]

            for ans in qa["answers"]:
                text = ans["text"].strip()
                if text:
                    samples.append({
                        "text": text,
                        "question": question
                    })

df = pd.DataFrame(samples)
df.head()

Unnamed: 0,text,question
0,DISTRIBUTOR AGREEMENT,Highlight the parts (if any) of this contract ...
1,Distributor,Highlight the parts (if any) of this contract ...
2,Electric City Corp.,Highlight the parts (if any) of this contract ...
3,Electric City of Illinois L.L.C.,Highlight the parts (if any) of this contract ...
4,Company,Highlight the parts (if any) of this contract ...


In [None]:
def extract_clause(question):
    match = re.search(r'"([^"]+)"', question)
    return match.group(1) if match else None

df["clause_type"] = df["question"].apply(extract_clause)

In [12]:
CLAUSE_TYPES = ["Document Name", "Parties", "Agreement Date"
"Effective Date", "Expiration Date", "Renewal Term", "Notice Period to Terminate Renewal",
"Governing Law", "Most Favored Nation", "Non-Compete", "Exclusivity", "No-Solicit of Customers",
"Competitive Restriction Exception", "No-Solicit of Employees","Non-Disparagement", "Termination for Convenience",
"Rofr/Rofo/Rofn", "Change of Control", "Anti-Assignment", "Revenue/Profit Sharing", "Price Restrictions",
"Minimum Commitment", "Volume Restriction", "IP Ownership Assignment", "Joint IP Ownership", "License Grant"
,"Non-Transferable License", "Affiliate License-Licensor", "Affiliate License-Licensee"
,"Unlimited/All-You-Can-Eat-License", "Irrevocable or Perpetual License", "Source Code Escrow", "Post-Termination Services"
,"Audit Rights", "Uncapped Liability", "Cap on Liability", "Liquidated Damages", "Warranty Duration"
,"Insurance", "Covenant Not to Sue", "Third Party Beneficiary"]

In [None]:
print(df["clause_type"].tolist())

['Document Name', 'Parties', 'Parties', 'Parties', 'Parties', 'Parties', 'Agreement Date', 'Effective Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Governing Law', 'Exclusivity', 'Exclusivity', 'Exclusivity', 'No-Solicit Of Customers', 'No-Solicit Of Customers', 'No-Solicit Of Employees', 'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn', 'Anti-Assignment', 'Anti-Assignment', 'Price Restrictions', 'Price Restrictions', 'Minimum Commitment', 'Minimum Commitment', 'Minimum Commitment', 'Minimum Commitment', 'Minimum Commitment', 'License Grant', 'License Grant', 'Post-Termination Services', 'Post-Termination Services', 'Post-Termination Services', 'Post-Termination Services', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Warranty Duration', 'Insurance', 'Covenant Not To Sue', 'Document Name', 'Parties', 'Parties', 'Parties', 'Parties', 'Effective Date', 'Expiration Date

In [None]:
df.shape

(13823, 3)

In [None]:
df = df[df["clause_type"].isin(CLAUSE_TYPES)]
df.shape

(10687, 3)

In [None]:
df["labels"] = df["clause_type"].apply(lambda x: [x])

In [None]:
print(df["labels"].tolist())

[['Document Name'], ['Parties'], ['Parties'], ['Parties'], ['Parties'], ['Parties'], ['Expiration Date'], ['Renewal Term'], ['Governing Law'], ['Exclusivity'], ['Exclusivity'], ['Exclusivity'], ['Rofr/Rofo/Rofn'], ['Rofr/Rofo/Rofn'], ['Rofr/Rofo/Rofn'], ['Anti-Assignment'], ['Anti-Assignment'], ['Price Restrictions'], ['Price Restrictions'], ['Minimum Commitment'], ['Minimum Commitment'], ['Minimum Commitment'], ['Minimum Commitment'], ['Minimum Commitment'], ['License Grant'], ['License Grant'], ['Post-Termination Services'], ['Post-Termination Services'], ['Post-Termination Services'], ['Post-Termination Services'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Warranty Duration'], ['Insurance'], ['Document Name'], ['Parties'], ['Parties'], ['Parties'], ['Parties'], ['Expiration Date'], ['Expiration Date'], ['Governing Law'], ['Anti-Assignment'], ['License Grant'], ['L

In [None]:
mlb = MultiLabelBinarizer(classes=CLAUSE_TYPES)
y = mlb.fit_transform(df["labels"])

print(y.shape)  # (num_samples, 41)

(10687, 40)


In [None]:
(y.sum(axis=1) > 0).mean()

np.float64(1.0)

In [None]:
hf_dataset = Dataset.from_dict({
    "text": df["text"].tolist(),
    "labels": y.astype(float).tolist()
})

hf_dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 10687
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "nlpaueb/legal-bert-base-uncased"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

hf_dataset = hf_dataset.map(tokenize, batched=True)
hf_dataset = hf_dataset.remove_columns(["text"])
hf_dataset = hf_dataset.map(
    lambda x: {"labels": torch.tensor(x["labels"], dtype=torch.float)}
)

hf_dataset.set_format(type="torch")

Map:   0%|          | 0/10687 [00:00<?, ? examples/s]

Map:   0%|          | 0/10687 [00:00<?, ? examples/s]

In [None]:
dataset_split = hf_dataset.train_test_split(test_size=0.1)
train_ds = dataset_split["train"]
val_ds = dataset_split["test"]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=len(CLAUSE_TYPES),
    problem_type="multi_label_classification"
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./legal-bert-cuad",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()

    return {
        "micro_f1": f1_score(labels, preds, average="micro"),
        "macro_f1": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="micro"),
        "recall": recall_score(labels, preds, average="micro")
    }

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Precision,Recall
1,0.0501,0.047102,0.679245,0.186361,0.972125,0.521983
2,0.0292,0.029068,0.840874,0.361371,0.920935,0.77362
3,0.0229,0.024962,0.845309,0.397758,0.905882,0.792329
4,0.018,0.022965,0.853767,0.467348,0.901247,0.811038
5,0.0173,0.022206,0.853333,0.473992,0.903766,0.808232


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=6015, training_loss=0.04077081423893831, metrics={'train_runtime': 4615.3882, 'train_samples_per_second': 10.419, 'train_steps_per_second': 1.303, 'total_flos': 1.265732769079296e+16, 'train_loss': 0.04077081423893831, 'epoch': 5.0})

In [None]:
text = "This agreement shall be governed by the laws of India."

inputs = tokenizer(text, return_tensors="pt", truncation=True)
outputs = model(**inputs.to(model.device))

preds = (torch.sigmoid(outputs.logits) > 0.5).squeeze()

predicted_clauses = [
    CLAUSE_TYPES[i] for i, val in enumerate(preds) if val
]

predicted_clauses

['Governing Law']

In [None]:
SAVE_DIR = "/content/legal-bert-cuad-finetuned"

trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('/content/legal-bert-cuad-finetuned/tokenizer_config.json',
 '/content/legal-bert-cuad-finetuned/special_tokens_map.json',
 '/content/legal-bert-cuad-finetuned/vocab.txt',
 '/content/legal-bert-cuad-finetuned/added_tokens.json',
 '/content/legal-bert-cuad-finetuned/tokenizer.json')

In [None]:
SAVE_DIR = "/content/drive/MyDrive/models/legal-bert-cuad-finetuned"

In [None]:
import os
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('/content/drive/MyDrive/models/legal-bert-cuad-finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/models/legal-bert-cuad-finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/models/legal-bert-cuad-finetuned/vocab.txt',
 '/content/drive/MyDrive/models/legal-bert-cuad-finetuned/added_tokens.json',
 '/content/drive/MyDrive/models/legal-bert-cuad-finetuned/tokenizer.json')

In [None]:
import joblib
joblib.dump(mlb, f"{SAVE_DIR}/label_binarizer.pkl")

['/content/drive/MyDrive/models/legal-bert-cuad-finetuned/label_binarizer.pkl']

In [1]:
import json

In [6]:
SAVE_DIR = "/content/drive/MyDrive/models/legal-bert-cuad-finetuned"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
model = AutoModelForSequenceClassification.from_pretrained(SAVE_DIR)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
id2label = model.config.id2label

In [9]:
def predict(text):
  inputs = tokenizer(
      text,
      return_tensors="pt",
      truncation=True,
      max_length=256
  )

  with torch.no_grad():
    outputs = model(**inputs)

  probs = torch.softmax(outputs.logits, dim = 1)
  confidence, pred = torch.max(probs, dim = 1)

  return id2label[pred.item()], round(confidence.item(), 3)

In [15]:
with open("/content/manual-cluases.json") as f:
    clauses = json.load(f)

results = []

for c in clauses:
    predicted, confidence = predict(c["text"])
    label_idx = int(predicted.replace("LABEL_", ""))
    predicted = CLAUSE_TYPES[label_idx]
    results.append({
        "text": c["text"],
        "expected": c["expected_label"],
        "predicted": predicted,
        "confidence": confidence,
        "correct": predicted == c["expected_label"]
    })


In [17]:
count = 0
for r in results:
    print("-" * 80)
    print(f"Clause: {r['text']}")
    print(f"Expected : {r['expected']}")
    print(f"Predicted: {r['predicted']} (confidence={r['confidence']})")
    print(f"Correct  : {r['correct']}")
    if r["correct"]:
        count += 1

print(f"Accuracy: {count / len(results)}")

--------------------------------------------------------------------------------
Clause: This Master Services Agreement ("Agreement") sets forth the terms governing the services described herein.
Expected : Document Name
Predicted: Document Name (confidence=0.787)
Correct  : True
--------------------------------------------------------------------------------
Clause: This Agreement is entered into between Alpha Technologies Pvt. Ltd. and Beta Solutions LLP.
Expected : Parties
Predicted: Parties (confidence=0.998)
Correct  : True
--------------------------------------------------------------------------------
Clause: This Agreement is made and entered into on the 10th day of January, 2024.
Expected : Agreement Date
Predicted: Expiration Date (confidence=0.969)
Correct  : False
--------------------------------------------------------------------------------
Clause: The obligations under this Agreement shall commence on the Effective Date of February 1, 2024.
Expected : Effective Date
Pre