In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup,TrainingArguments, Trainer
from datasets import load_metric
from transformers import BertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

In [2]:
if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
print( 'device set to =>', device)

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1660 SUPER
device set to => cuda


In [3]:
class Config:
    train_csv='./test.csv'
    test_csv='./train.csv'
    sub_csv='./sample_submission.csv'

    model='anferico/bert-for-patents'    
    max_len=32
    num_epoch=2
    batch_size=64
    epochs=3
    lr=1e-6

In [4]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED']=str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic=True

seed_everything(seed=42)

    

In [5]:
train_df = pd.read_csv( Config.train_csv )
test_df = pd.read_csv( Config.test_csv )
sub_df = pd.read_csv( Config.sub_csv )

In [6]:
train_df.head()


Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


In [7]:
test_df.head()


Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [8]:
sub_df.head()


Unnamed: 0,id,score
0,4112d61851461f60,0
1,09e418c93a776564,0
2,36baf228038e314b,0
3,1f37ead645e7f0c8,0
4,71a5b6ad068d531f,0


In [9]:
train_examples=int(len(train_df)*0.9)
train_data=train_df.iloc[:train_examples,1:]
val_data=train_df.iloc[train_examples:, 1:]
print('train_size:',train_data.shape[0])
print('val_size',val_data.shape[0])

train_size: 32
val_size 4


In [10]:
class PatentDataset(torch.utils.data.Dataset):
    def __init__(self, anchor,target,context,score,tokenizer,max_len):
        
        self.anchor=anchor
        self.target=target
        self.context=context
        self.score=score
        self.tokenizer=tokenizer
        self.max_len=max_len
    def __len__(self):
        return len(self.anchor)
    
    def __getitem__(self,idx):
        
        anchor=self.anchor[idx]
        target=self.target[idx]
        context=self.context[idx]
        score=self.score[idx]
        
        encoded_data=self.tokenizer.encode_plus(context+' '+anchor,target,padding='max_length',max_length=self.max_len,truncation=True,return_attention_mask=True)
        
        input_ids=encoded_data["input_ids"]
        attention_mask=encoded_data["attention_mask"]
        token_type_ids=encoded_data["token_type_ids"]
        
        return{
            'input_ids':torch.tensor(input_ids,dtype=torch.long),
            'attention_mask':torch.tensor(attention_mask,dtype=torch.long),
            'token_type':torch.tensor(token_type_ids,dtype=torch.long),
            'label':torch.tensor(score,dtype=torch.long),          
        }
        

In [11]:
class BertClassifier(nn.Module):
    def __init__(self, dropout):
        super(BertClassifier,self).__init__()
        self.bert=BertModel.from_pretrained(Config.model)
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(1024,5)
        self.relu=nn.ReLU()
        
    def forward(self, input_id, mask):
        
        _,pooled_output=self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)
        return final_layer
        

In [12]:
train_examples=int(len(train_df)*0.9)
train=train_df.iloc[:train_examples,1:]
val=train_df.iloc[train_examples:, 1:]
print('train_size:',train.shape[0])
print('val_size:',val.shape[0])


train_size: 32
val_size: 4


In [13]:
tokenizer=AutoTokenizer.from_pretrained(Config.model,padding='max_length',pad_to_max=True,max_length=Config.max_len,truncation=True)


In [14]:
def train(model,train,val,learning_rate=.01,epochs=200):
    train_dataset=PatentDataset(anchor=train.target.values,
                                target=train.target.values,
                                context=train.context.values,
                                score=train.score.values,
                                tokenizer=tokenizer,
                                max_len=Config.max_len)
    
    val_dataset=PatentDataset(anchor=val.target.values,
                                target=val.target.values,
                                context=val.context.values,
                                score=val.score.values,
                                tokenizer=tokenizer,
                                max_len=Config.max_len)
    
    train_dataloader=DataLoader(train_dataset,batch_size=32,shuffle=True)
    val_dataloader=DataLoader(val_dataset,batch_size=32)
    
    criterion=nn.CrossEntropyLoss()
    optimizer=Adam(model.parameters(),lr=learning_rate)
    
    if torch.cuda.is_available():
        model=model.cuda()
        criterion=criterion.cuda()
        
    for epoch_num in range(epochs):
        total_acc_train=0
        total_loss_train=0
        
        for item in tqdm(train_dataloader):
            train_label=item['label'].to(device)
            mask=item['attention_mask'].to(device)
            input_id=item['input_ids'].squeeze(1).to(device)
            
            output=model(input_id,mask)
            batch_loss=criterion(output,train_label)
            total_loss_train+=batch_loss.item()
            
            acc=(output.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc
            
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
            total_acc_val=0
            total_loss_val=0
            
        with torch.no_grad():
            
            for item in val_dataloader:
                
                val_label=item['label'].to(device)
                mask=item['attention_mask'].to(device)
                input_id=item['input_ids'].squeeze(1).to(device)
                
                output=model(input_id,mask)
                
                batch_loss=criterion(output,val_label)
                total_loss_train+=batch_loss.item()
                
                acc=(output.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc
        save_path=f'bert_{epoch_num}.pt'
        torch.save({
            'epoch':epoch_num,
            'model_state_dict':model.state_dict(),
            'optimizer_state_dict':optimizer.state_dict(),
        },save_path)


model=BertClassifier(dropout=0.5)
train(model,train_data,val_data,Config.lr,Config.epochs)

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'DataFrame' object has no attribute 'score'