In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
import pickle
from torch.nn import functional as F
from transformers import DataCollatorWithPadding
from torch.optim import lr_scheduler


In [None]:
# Filesystem locations: pretrained BERT checkpoint directory, labelled
# training CSV, and the file the best fine-tuned weights are written to.
model_path = 'bert_chinese/'
file_path = '/s/yzhou180/HSMS/TrainingSet/CRC/CH_CRC_Aspects_TrainingSet.csv'
weight_path = "CRC_sentiment.bin"

# Hyperparameters and shared objects read by the later cells.
config = dict(
    epochs=10,
    train_batch_size=64,
    valid_batch_size=64,
    max_length=64,        # token limit passed to the tokenizer (with truncation)
    lr=1e-5,
    weight_decay=1e-6,
    num_classes=3,        # output width of the classification head
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    tokenizer=BertTokenizer.from_pretrained(model_path),
)

In [None]:
# Load the labelled training set. The later cells read its 'text' and
# 'Aspect' columns.
df = pd.read_csv(file_path)

# Replace the raw 'Aspect' labels with integer class ids. The fitted encoder
# is kept in `encoder` so it can be saved for later use.
encoder = LabelEncoder()
df['Aspect'] = encoder.fit_transform(df['Aspect'])

# Shuffle once with a fixed seed so the index-based train/validation split is
# the same on every run. drop=True discards the pre-shuffle index instead of
# inserting it as an extra 'index' column.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Save the fitted label encoder; it is needed again at prediction time.
# The context manager guarantees the file is closed even if pickling fails.
with open('encoder_CRC_aspect', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [None]:
class TestdriveDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        df: Frame providing a 'text' column (inputs) and an 'Aspect' column
            (class ids).
        max_length: Maximum token count passed to the tokenizer; longer
            inputs are truncated.
        tokenizer: Optional object exposing ``encode_plus``. When omitted,
            the notebook-level ``config['tokenizer']`` is looked up at
            access time, which is the original behaviour.
    """

    def __init__(self, df, max_length, tokenizer=None):
        self.text = df['text'].values
        self.target = df['Aspect'].values
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, ids):
        """Tokenize sample ``ids`` and return its features with the target.

        No padding is requested here, so sequences may differ in length;
        batching is expected to pad them (see ``collate_fn``).

        Raises:
            ValueError: If the tokenizer rejects the sample (for example a
                non-string value in the 'text' column). The original error
                is chained as the cause.
        """
        tokenizer = self.tokenizer if self.tokenizer is not None else config['tokenizer']
        text = self.text[ids]
        target = self.target[ids]
        try:
            inputs = tokenizer.encode_plus(text,
                                           truncation=True,
                                           add_special_tokens=True,
                                           max_length=self.max_length)
        except Exception as exc:
            # Previously the error was swallowed and the code fell through to
            # an undefined `inputs`; fail loudly with the offending sample.
            raise ValueError(f"Failed to tokenize sample {ids}: {text!r}") from exc

        return {'input_ids': inputs['input_ids'],
                'token_type_ids': inputs['token_type_ids'],
                'attention_mask': inputs['attention_mask'],
                'target': target}
    
    

In [None]:
# Batch collator built from the shared tokenizer; the dataset returns
# unpadded token lists, so padding happens here when a batch is assembled.
collate_fn = DataCollatorWithPadding(
    tokenizer=config["tokenizer"],
)

In [None]:
class TestdriveModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder is loaded from the notebook-level ``model_path`` and the
    number of output classes comes from ``config['num_classes']``.

    Args:
        drop_rate: Dropout probability applied before the linear head.
    """

    def __init__(self, drop_rate):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        self.drop = nn.Dropout(drop_rate)
        # Size the head from the loaded checkpoint rather than hard-coding
        # 768, so checkpoints with a different hidden width also work.
        self.fc = nn.Linear(self.bert.config.hidden_size, config['num_classes'])

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class logits for a batch.

        ``token_type_ids`` is accepted for interface compatibility but is not
        forwarded to the encoder, which therefore uses its own default.
        No softmax is applied; the caller's loss works on raw logits.
        """
        encoded = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # Hidden state at the first token position is the sentence summary.
        first_token_state = self.drop(encoded.last_hidden_state[:, 0])
        return self.fc(first_token_state)


In [None]:
def train(model, dataloader, optimizer,device):
    """Run one training epoch and return the sample-weighted mean loss.

    Relies on two notebook-level globals: ``criterion`` (the loss function)
    and ``scheduler`` (the learning-rate schedule, stepped once per batch).

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask',
            'token_type_ids' and 'target' entries.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device on which the batch tensors and the model are placed.

    Returns:
        Mean loss over all samples seen, or NaN if the dataloader yields no
        batches.
    """
    # No-op when the model already lives on `device`; guarantees model and
    # batch tensors are on the same device.
    model.to(device)
    model.train()

    total_loss = 0.0
    data_size = 0

    pbar = tqdm(enumerate(dataloader))
    for step, data in pbar:
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attention_mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # Targets must be on the same device as the logits for the loss and
        # the accuracy comparison below.
        target = torch.as_tensor(data['target'], dtype=torch.long, device=device)
        batch_size = input_ids.shape[0]

        optimizer.zero_grad()

        output = model(input_ids, attention_mask, token_type_ids)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        scheduler.step()

        # argmax over logits equals argmax over softmax, so no softmax is
        # needed; detach keeps the metric out of the autograd graph.
        prediction = output.detach().argmax(1)
        accuracy = (prediction == target).sum().item() / len(target)

        batch_loss = loss.item()
        total_loss += batch_loss * batch_size
        data_size += batch_size

        pbar.set_postfix(step = step, Train_Loss=batch_loss, accuracy = accuracy)

    # Guard against an empty dataloader instead of hitting an unbound name.
    return total_loss / data_size if data_size else float('nan')

In [None]:
@torch.no_grad()
def valid(model, dataloader,device):
    """Evaluate the model for one pass and return the sample-weighted mean loss.

    Runs with gradients disabled and the model in eval mode. Relies on the
    notebook-level global ``criterion`` for the loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask',
            'token_type_ids' and 'target' entries.
        device: Device on which the batch tensors and the model are placed.

    Returns:
        Mean loss over all samples seen, or NaN if the dataloader yields no
        batches.
    """
    # No-op when the model already lives on `device`; guarantees model and
    # batch tensors are on the same device.
    model.to(device)
    model.eval()

    total_loss = 0.0
    data_size = 0

    pbar = tqdm(enumerate(dataloader))
    for step, data in pbar:
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attention_mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # Targets must be on the same device as the logits for the loss and
        # the accuracy comparison below.
        target = torch.as_tensor(data['target'], dtype=torch.long, device=device)
        batch_size = input_ids.shape[0]

        output = model(input_ids, attention_mask, token_type_ids)

        loss = criterion(output,target)
        # argmax over logits equals argmax over softmax.
        prediction = output.argmax(1)
        accuracy = (prediction == target).sum().item() / len(target)

        batch_loss = loss.item()
        total_loss += batch_loss * batch_size
        data_size += batch_size

        # Labelled as validation loss (it was previously shown as Train_Loss).
        pbar.set_postfix(step = step, Valid_Loss=batch_loss, accuracy = accuracy)

    # Guard against an empty dataloader instead of hitting an unbound name.
    return total_loss / data_size if data_size else float('nan')

In [None]:
# 80/20 train/validation split on the frame's index. The validation mask is
# the exact complement of the training mask, so the row sitting on the
# boundary (when 0.8 * len(df) is a whole number) is no longer dropped from
# both subsets.
is_train = df.index < 0.8 * len(df)
df_train = df.loc[is_train, :]
df_valid = df.loc[~is_train, :]

# Training batches are reshuffled each epoch.
train_dataset = TestdriveDataset(df_train,max_length = config['max_length'])
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = config['train_batch_size'], drop_last = False, collate_fn = collate_fn )

# Validation keeps a fixed order and uses its own configured batch size.
valid_dataset = TestdriveDataset(df_valid,max_length = config['max_length'])
valid_loader = DataLoader(valid_dataset, shuffle = False, batch_size = config['valid_batch_size'], drop_last = False, collate_fn = collate_fn )

In [None]:
model = TestdriveModel(0.2)
epoch_num = config['epochs']
device = config['device']
# Place the model on the configured device before the optimizer is built;
# the batches are moved to this same device during training and validation.
model.to(device)

optimizer = AdamW(model.parameters(), lr = config['lr'], correct_bias = True, weight_decay=config['weight_decay'])
criterion = nn.CrossEntropyLoss()
# The scheduler is stepped once per batch inside train(), so T_max is
# measured in optimizer steps, not epochs.
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=500,
                                                   eta_min=1e-6)

# Start from +inf so the first epoch always produces a checkpoint, whatever
# the scale of the loss.
best_valid_loss = float('inf')

for i in range(epoch_num):
    train_epoch_loss = train(model, train_loader, optimizer,device)
    valid_epoch_loss = valid(model, valid_loader, device)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_epoch_loss < best_valid_loss:
        torch.save(model.state_dict(), weight_path)
        best_valid_loss = valid_epoch_loss
        print('model_saved', valid_epoch_loss)

    print('train_epoch_loss:', train_epoch_loss)
    print('valid_epoch_loss:', valid_epoch_loss)