In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Do

In [None]:
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers Autokenizer
from transformers import AutoTokenizer,pipeline
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification,AutoTokenizer,AutoModelForSequenceClassification

In [None]:
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print('Loading is done!')

Downloading builder script:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Downloading and preparing dataset hupd/sample to /root/.cache/huggingface/datasets/HUPD___hupd/sample-23bcfec45c886e8c/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142...
Loading dataset with config: PatentsConfig(name='sample', version=0.0.0, data_dir='sample', data_files={'train': ['https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather']}, description='Patent data from January 2016, for debugging')


Downloading data:   0%|          | 0.00/6.67M [00:00<?, ?B/s]

Using metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710


Downloading data:   0%|          | 0.00/388M [00:00<?, ?B/s]

Reading metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710
Filtering train dataset by filing start date: 2016-01-01
Filtering train dataset by filing end date: 2016-01-21
Filtering val dataset by filing start date: 2016-01-22
Filtering val dataset by filing end date: 2016-01-31


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset hupd downloaded and prepared to /root/.cache/huggingface/datasets/HUPD___hupd/sample-23bcfec45c886e8c/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Loading is done!


In [None]:
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 16153
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 9094
    })
})


In [None]:

print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')

Train dataset size: (16153, 14)
Validation dataset size: (9094, 14)


In [None]:
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

In [None]:
# Re-labeling/mapping.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
print(train_set)

Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
    num_rows: 16153
})


In [None]:
train_set_reduced = train_set.remove_columns(['title','background','summary','description','cpc_label','ipc_label','filing_date','patent_issue_date','date_published','examiner_id'])

val_set_reduced = val_set.remove_columns(['title','background','summary','description','cpc_label','ipc_label','filing_date','patent_issue_date','date_published','examiner_id'])

In [None]:
print(train_set_reduced)

Dataset({
    features: ['patent_number', 'decision', 'abstract', 'claims'],
    num_rows: 16153
})


In [None]:
train_set_reduced = train_set_reduced.filter(lambda row: row["decision"] < 2)
val_set_reduced = val_set_reduced.filter(lambda row: row["decision"] < 2)

Filter:   0%|          | 0/16153 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
print(train_set_reduced['decision'])
print(type(train_set_reduced))

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
for row in train_set_reduced:
  row["abstract"] = tokenizer(row["abstract"], padding=True, truncation=True,
                              max_length=512)
  row["claims"] = tokenizer(row["claims"], padding=True, truncation=True,
                              max_length=512)

In [None]:
X_train_col = train_set_reduced.remove_columns(['decision'])
Y_train_col = train_set_reduced.remove_columns(['patent_number','abstract','claims'])

print(X_train_col)
print(Y_train_col)

Dataset({
    features: ['patent_number', 'abstract', 'claims'],
    num_rows: 8719
})
Dataset({
    features: ['decision'],
    num_rows: 8719
})


In [None]:
print(dataset[0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train_col, Y_train_col, test_size=0.2)

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [None]:
X_train_encodings = tokenizer(list(X_train),padding = True, truncation = True,max_length=512)
X_test_encodings = tokenizer(list(X_test),padding = True, truncation = True,max_length=512)


    

In [None]:
Y_train_encodings = tokenizer(list(y_train),padding = True, truncation = True,max_length=512)
y_test_encodings = tokenizer(list(y_test),padding = True, truncation = True,max_length=512)

In [64]:
print(X_train_encodings.items())

dict_items([('input_ids', [[101, 7353, 1035, 2193, 102], [101, 10061, 102, 0, 0], [101, 4447, 102, 0, 0]]), ('token_type_ids', [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]), ('attention_mask', [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [1, 1, 1, 0, 0]])])


In [None]:
print(Y_train_encodings)

{'input_ids': [[101, 3247, 102]], 'token_type_ids': [[0, 0, 0]], 'attention_mask': [[1, 1, 1]]}


In [None]:
x_train_dataset = Dataset(X_train_encodings,Y_train_encodings)
X_test_dataset = Dataset(X_test_encodings,y_test_encodings)




In [None]:
print(x_train_dataset)

<__main__.Dataset object at 0x7f4b9860e650>


In [None]:
print(type(X_train_encodings))
print(type(x_train_dataset))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class '__main__.Dataset'>


In [None]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
     

In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=x_train_dataset,
    eval_dataset=X_test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()