In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
import pickle
from torch.nn import functional as F
from transformers import DataCollatorWithPadding
from torch.optim import lr_scheduler
from datetime import datetime,timedelta,timezone
import re
from sentence_split_tfidf import *

In [None]:
# Set `cut` to False if sentence splitting is not needed.
cut = True #False
# Which classifier this pass runs: 'aspect' or 'sentiment'.
flag = 'aspect'
model_path = 'bert_chinese/'
directory_path = 'HSMS/TrainingSet/test_drive/'
file_path = directory_path + 'test.csv'

# Default checkpoint; overridden per task below.
weight_path = "Loss-v5.bin"

config = {
          "epochs": 10,
          "batch_size": 16,
          "max_length": 64,
          "lr": 1e-5,
          "weight_decay": 1e-6,
          "num_classes": 8,
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
          "tokenizer" : BertTokenizer.from_pretrained(model_path)
          }

# Task-specific checkpoint, number of output classes and label-encoder file.
# `num_classes` must be *assigned* here because the model head reads it when
# it is constructed.
if flag == 'aspect':
    weight_path = "Loss-v5.bin"
    config['num_classes'] = 8
    encoder_path = 'encoder_aspect'
elif flag == 'sentiment':
    weight_path = "Loss-sentiment_v2.bin"
    config['num_classes'] = 3
    encoder_path = 'encoder_sentiment'
else:
    # Fail here rather than with a NameError on `encoder` much later.
    raise ValueError(f"flag must be 'aspect' or 'sentiment', got {flag!r}")

# NOTE(review): pickle.load can execute arbitrary code; only load encoder
# files that come from a trusted source.
with open(encoder_path, "rb") as encoder_file:
    encoder = pickle.load(encoder_file)

df = pd.read_csv(file_path)

# Copy the raw review column ('评论内容') to the name used by the rest of
# the notebook.
df['text'] = df['评论内容']

In [None]:
#### Sentence splitting ####
if cut:
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # TF-IDF file that comes from a trusted source.
    with open('tfidf_test_drive.pkl', 'rb') as tfidf_file:
        tfidf = pickle.load(tfidf_file)
    cs = cutSentence(tfidf)
    df['sentences'] = df['text'].apply(cs.get_cut_sentence)
    # One row per sentence. Assumes get_cut_sentence returns a list of
    # sentences (it lives in sentence_split_tfidf) -- TODO confirm.
    # The exploded rows keep the index of the review they came from.
    df = df.explode('sentences')
else:
    # No splitting: classify each review as a single unit.
    df['sentences'] = df['text']

In [None]:
class TestdriveDataset(Dataset):
    """Inference-only dataset over the ``'sentences'`` column of a DataFrame.

    Each item is tokenized on access and returned without padding; batching
    code is expected to pad (the notebook uses a padding collator).

    Args:
        df: DataFrame with a ``'sentences'`` column holding the texts.
        max_length: Maximum token length passed to the tokenizer; longer
            inputs are truncated.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            ``config['tokenizer']`` is looked up at item-access time.
    """

    def __init__(self, df, max_length, tokenizer=None):
        self.text = df['sentences'].values
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, ids):
        """Tokenize the text at position ``ids``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` exactly as
            produced by ``tokenizer.encode_plus``.
        """
        tokenizer = self.tokenizer if self.tokenizer is not None else config['tokenizer']
        text = self.text[ids]
        # There are no labels at inference time, so nothing is read besides
        # the text itself.
        inputs = tokenizer.encode_plus(text,
                      truncation=True,
                      add_special_tokens=True,
                      max_length = self.max_length
                      )

        return {'input_ids' : inputs['input_ids'],
                'attention_mask': inputs['attention_mask']
                }


In [None]:
# Batch collator built from the shared tokenizer; it is handed to the
# DataLoader as `collate_fn`. The dataset returns token lists without asking
# the tokenizer to pad, so padding is left to this collator.
collate_fn = DataCollatorWithPadding(tokenizer=config['tokenizer'])

In [None]:
class TestdriveModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder is loaded from ``model_path`` and the head has
    ``config['num_classes']`` outputs; both are read when the model is
    constructed.

    Args:
        drop_rate: Dropout probability applied to the first-token
            representation before the linear head.
    """

    def __init__(self,drop_rate):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        self.drop = nn.Dropout(drop_rate)
        # Take the head's input width from the encoder instead of
        # hard-coding 768, so the head always matches the loaded checkpoint.
        self.fc = nn.Linear(self.bert.config.hidden_size, config['num_classes'])

    def forward(self, input_ids, attention_mask):
        """Return the head's raw outputs (no softmax) for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.
        """
        encoded = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Use the hidden state at sequence position 0 as the sentence vector.
        first_token_state = encoded.last_hidden_state[:,0]
        return self.fc(self.drop(first_token_state))

In [None]:
@torch.no_grad()
def valid(model, dataloader,device):
    """Run ``model`` over ``dataloader`` and collect the predicted class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one row of class scores per sample.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        list of predicted class indices (numpy integer scalars), one per
        sample, in the order the dataloader yields them.
    """
    model.eval()

    prediction_list = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a total / ETA.
    for data in tqdm(dataloader):
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attention_mask = data['attention_mask'].to(device, dtype = torch.long)

        logits = model(input_ids, attention_mask)

        # Softmax is monotonic, so the arg-max of the raw scores is already
        # the predicted class; no need to normalise first.
        prediction = logits.argmax(dim=1)
        prediction_list.extend(prediction.cpu().numpy())

    return prediction_list

In [None]:
# Wrap the prepared frame in a dataset, then batch it in its original order
# (no shuffling, no dropped tail) using the padding collator.
test_dataset = TestdriveDataset(df, max_length=config['max_length'])
test_loader = DataLoader(
    test_dataset,
    batch_size=config['batch_size'],
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=False,
)

In [None]:
def inference(model_paths, dataloader, device):
    """Build the classifier, load its weights and predict over ``dataloader``.

    Args:
        model_paths: Path to the saved ``state_dict`` file.
        dataloader: Batches to run through the model.
        device: Device used both for the model and for the batches.

    Returns:
        The list of predicted class indices returned by ``valid``.
    """
    print('start')
    model = TestdriveModel(0.2)

    # map_location lets a checkpoint saved on a GPU load on a CPU-only host
    # (and vice versa) instead of failing during deserialisation.
    state_dict = torch.load(model_paths, map_location=device)
    model.load_state_dict(state_dict)

    # Use the `device` argument so the model and the batches moved inside
    # valid() always end up on the same device.
    model.to(device)

    preds = valid(model, dataloader, device)

    return preds

In [None]:
# Predict a class index for every row of the loader, then map the indices
# back to label values with the unpickled encoder (presumably a fitted
# sklearn LabelEncoder -- confirm against the training notebook).
model_preds = inference(weight_path, test_loader, config['device'])
df['prediction'] = encoder.inverse_transform(model_preds)

In [None]:
def get_output(df,directory_path):
    """Write ``df`` to an Excel file named after today's date in UTC+8.

    The file is called ``prediction MMDD.xlsx`` and is created inside
    ``directory_path``.

    Args:
        df: Object exposing ``to_excel(path)`` (a pandas DataFrame).
        directory_path: Directory the file is written to.

    Returns:
        The path of the file that was written.
    """
    import os  # local import: only needed to join the output path

    timezone_offset = +8.0
    tzinfo = timezone(timedelta(hours=timezone_offset))
    current_date = datetime.now(tzinfo)
    # Zero-padded month and day, e.g. '0307'.
    date_of_today = current_date.strftime('%m%d')
    # The extension is required: to_excel picks its writer engine from it.
    file_name = f'prediction {date_of_today}.xlsx'
    output_path = os.path.join(directory_path, file_name)
    df.to_excel(output_path)
    return output_path

In [None]:
# The first pass ends here; then change `flag` and run the notebook again.
# Each pass keeps a copy of its predictions under a task-specific name so
# that both are available to the final merge cell.
if flag != 'sentiment':
    df_aspect = df.copy()
else:
    df_sentiment = df.copy()

In [None]:
# Both passes (flag='sentiment' and flag='aspect') must have been run in this
# kernel before merging; fail with a clear message otherwise.
_missing = [name for name in ('df_sentiment', 'df_aspect') if name not in globals()]
if _missing:
    raise RuntimeError(
        "Run the notebook once per flag before exporting; missing: " + ", ".join(_missing)
    )
if len(df_sentiment) != len(df_aspect):
    raise ValueError(
        "Sentiment and aspect passes produced different numbers of rows "
        f"({len(df_sentiment)} vs {len(df_aspect)}); re-run both with the same `cut` setting."
    )

# Give the two prediction columns distinct names (they were both called
# 'prediction') and attach the aspect labels by position, so the repeated
# index values left by explode() cannot misalign rows.
df_output = df_sentiment.rename(columns={'prediction': 'prediction_sentiment'})
df_output['prediction_aspect'] = df_aspect['prediction'].to_numpy()

# get_output needs the target directory as its second argument.
get_output(df_output, directory_path)