In [49]:
# importing required libraries
import tensorflow as tf
from tensorflow.keras import activations,optimizers,losses
from transformers import DistilBertTokenizer,TFDistilBertForSequenceClassification
import pickle

In [30]:
model_name='distilbert-base-uncased'#using distilbert model
max_len=512#defining max length of the words to be taken
tokenizer=DistilBertTokenizer.from_pretrained(model_name)#intializing distilbert tokenizer

In [31]:
x_train=[]
# importing datasets and splitiing into training and validation
import datasets
from datasets import load_dataset

dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)


Found cached dataset hupd (C:/Users/91800/.cache/huggingface/datasets/HUPD___hupd/sample-23bcfec45c886e8c/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142)


  0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# encoding the decisions field in patents dataset because they are in the strings
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
#function to encode
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

In [33]:
#mapping the encoding function to training and validation data
for name in ['train', 'validation']:
        dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
        # Remove the pending and CONT-patent applications
        dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)

Loading cached processed dataset at C:\Users\91800\.cache\huggingface\datasets\HUPD___hupd\sample-23bcfec45c886e8c\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-93d3351fa3e3a5e1.arrow
Loading cached processed dataset at C:\Users\91800\.cache\huggingface\datasets\HUPD___hupd\sample-23bcfec45c886e8c\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-0da8d16936d2b5f5.arrow
Loading cached processed dataset at C:\Users\91800\.cache\huggingface\datasets\HUPD___hupd\sample-23bcfec45c886e8c\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-8e7c04f871a5f3dd.arrow
Loading cached processed dataset at C:\Users\91800\.cache\huggingface\datasets\HUPD___hupd\sample-23bcfec45c886e8c\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-47be42eb7b704c2e.arrow


In [35]:
#Collecting the absatact values of the training dataset
x_train=dataset_dict['train']['abstract']

In [52]:
#Collecting the absatact values of the validation dataset
x_val=dataset_dict['validation']['abstract']

In [36]:
#function for creating masks and input_ids and this function takes input and type of tokenizer we declared and returns output encodings 
def construct_encodings(x,tk,max_len,truncation=True,padding=True):
    return tk(x,max_length=max_len,truncation=truncation,padding=padding)
encodings=construct_encodings(x_train,tokenizer,max_len=max_len)

In [38]:
# taking decisions of the patent into seperate list
y_train=[]
y_train=dataset_dict['train']['decision']

In [58]:
# taking decisions of the patent into seperate list
y_val=[]
y_val=dataset_dict['validation']['decision']

In [40]:
#function for making the data to fit into the model.Here in tensorflow to fit the model for transformers we have to give input_ids and mask to tensor slices
def construct_tfdataset(encodings, y=None):
    if y:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples after training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))
    
tfdataset = construct_tfdataset(encodings, y_train)

In [42]:
BATCH_SIZE=8 #declaring batch size

In [43]:

tfdataset = tfdataset.batch(BATCH_SIZE)

In [47]:
N_EPOCHS = 2  #number of epoch to train the data

model = TFDistilBertForSequenceClassification.from_pretrained(model_name)#preparing the model
optimizer = optimizers.Adam(learning_rate=3e-3)#Instantiating the optimizer
loss = losses.SparseCategoricalCrossentropy(from_logits=True)#declaring the loss
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])#compiling the model

model.fit(tfdataset, batch_size=BATCH_SIZE, epochs=N_EPOCHS)#fitting the model

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_179', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1fabcc2d4f0>

In [50]:
# save the model trained weights to be used in future purpose
model.save_pretrained('./model/clf')
with open('./model/info.pkl', 'wb') as f:
    pickle.dump((model_name, max_len), f)

In [53]:
encodings_val=construct_encodings(x_val,tokenizer,max_len=max_len)#encoding validation data

In [59]:
tfdataset_test = construct_tfdataset(encodings_val,y_val)#encoding validation data
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)#splitting validation data into batches

In [60]:
#after training the model we are evaluating the accuracy on the validation data
benchmarks = model.evaluate(tfdataset_test, return_dict=True, batch_size=BATCH_SIZE)
print(benchmarks)

{'loss': 0.5054701566696167, 'accuracy': 0.7964402437210083}


In [51]:
#This is a function written to predict probabilities of the occurence for the new data point
def create_predictor(model, model_name, max_len):
  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  def predict_proba(text):
      x = [text]

      encodings = construct_encodings(x, tkzr, max_len=max_len)
      tfdataset = construct_tfdataset(encodings)
      tfdataset = tfdataset.batch(1)

      preds = model.predict(tfdataset).logits
      preds = activations.softmax(tf.convert_to_tensor(preds)).numpy()
      return preds[0][0]
    
  return predict_proba

In [63]:
#sample data point
x=("Embodiments of the invention proving a tag based on inherent disorder during a manufacturing process. The method includes using a first reader to take a first reading of an inherent disorder feature of the tag, and using a second reader to take a second reading of the inherent disorder feature of the tag. The method further includes matching the first reading with the second reading, and determining one or more acceptance criteria, wherein at least one of the acceptance criteria is based on whether the first reading and the second reading match within a predetermined threshold. If the acceptance criteria are met, then the tag is accepted, and a fingerprint for the tag is recorded. The invention further provides a method of testing and characterizing a reader of inherent disorder tags during a manufacturing process. The method includes taking a reading of a known inherent disorder tag, using the reading to measure a characteristic of the reader, and storing the measured characteristic for use when reading inherent disorder tags.', 'claims': '1. A method comprising: using a first reader to take a first reading of an inherent disorder feature of a tag; using at least a second reader to take at least a second reading of the inherent disorder feature of the tag; matching the first reading with at least the second reading; determining one or more acceptance criteria, wherein at least one of the acceptance criteria is based on whether the first reading and the second reading match within a predetermined threshold; accepting the tag if the acceptance criteria are met; and recording a fingerprint for the tag if the tag was accepted. 2. The method of claim 1, wherein determining one or more acceptance criteria further comprises: determining an acceptance criterion based on an individual reading. 3. The method of claim 2, wherein determining an acceptance criterion based on an individual reading comprises determining an acceptance criterion based on a strength of a signal in at least one of the first reading and the second reading. 4. The method of claim 2, wherein determining an acceptance criterion based on an individual reading comprises determining an acceptance criterion based on a complexity of a signal in at least one of the first reading and the second reading. 5. The method of claim 1, further comprising: rejecting the tag if it is not accepted. 6. The method of claim 5, wherein rejecting the tag comprises removing the tag without stopping the flow of production. 7. The method of claim 6, wherein removing the tag comprises one or more of marking the tag as rejected, cutting out the tag, punching out the tag, and removing a tag using a suction method. 8. The method of claim 5, wherein rejecting the tag further comprises noting the rejected tag in a database. 9. The method of claim 1, further comprising: using at least a third reader to take at least a third reading of the inherent disorder feature of the tag if the acceptance criteria are not met; matching the third reading with the first reading and the second reading; determining one or more further acceptance criteria, wherein at least one of the further acceptance criteria is based on whether the first reading and the third reading match within the predetermined threshold or whether the second reading and the third reading match within the predetermined threshold; and accepting the tag if the further acceptance criteria are met; and if the tag is accepted, recording a fingerprint for the tag based on the first reading if the first reading and the third reading match within the predetermined threshold or based on the second reading if the second reading")
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [65]:
#For the sample data point based on the training weights we are calculating the probability of acceptance of patent.
new_model = TFDistilBertForSequenceClassification.from_pretrained('./model/clf')
model_name, max_len = pickle.load(open('./model/info.pkl', 'rb'))

clf = create_predictor(new_model, model_name, max_len)
print(clf(x))

Some layers from the model checkpoint at ./model/clf were not used when initializing TFDistilBertForSequenceClassification: ['dropout_179']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./model/clf and are newly initialized: ['dropout_199']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.21113838
