In [1]:
import json

In [2]:
from biaffine import BiaffineModel

In [3]:
class Args(object):
    """Simple attribute container.

    Every key of each positional mapping, followed by every keyword argument,
    is stored as an attribute on the instance. Later sources overwrite earlier
    ones when keys collide.
    """

    def __init__(self, *initial_data, **kwargs):
        # Treat the keyword arguments as one last mapping so that a single
        # loop handles both kinds of input in the original precedence order.
        for mapping in initial_data + (kwargs,):
            for name in mapping:
                setattr(self, name, mapping[name])

In [4]:
# Settings handed to BiaffineModel and read again by the tokenizer / convert()
# (`bert_model_path`, `max_sequence_len`). `class_num=8` matches the 8 tag
# counters that find_triplet allocates.
args = Args(bert_model_path='/root/autodl-nas/pretrain-models/roberta-base',bert_feature_dim=768,biaffine_size=300,class_num=8,max_sequence_len=128)

In [5]:
# Build the triplet tagging model from the settings in `args`.
model = BiaffineModel(args)

Some weights of the model checkpoint at /root/autodl-nas/pretrain-models/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import torch

In [7]:
# Restore the fine-tuned triplet weights. `map_location='cpu'` lets a
# checkpoint that was saved from a GPU be read on any host; the model is moved
# to its inference device separately.
model.load_state_dict(torch.load('savemodel/triplet_v11.bin', map_location='cpu'))

<All keys matched successfully>

In [8]:
# Put the model in evaluation mode. The trailing `pass` only stops the
# notebook from echoing the module repr that `eval()` returns.
model.eval()
pass

In [9]:
import nltk

In [10]:
from transformers import AutoTokenizer,RobertaForSequenceClassification

In [11]:
# Tokenizer matching the encoder checkpoint. `add_prefix_space=True` is set
# because convert() and category_predict() feed it pre-split words
# (`is_split_into_words=True`).
tokenizer = AutoTokenizer.from_pretrained(args.bert_model_path,add_prefix_space=True)

In [12]:
def convert(tokens):
    """Encode a pre-tokenised sentence into fixed-length model inputs.

    A '##' placeholder word is prepended, so every returned index refers to
    the list with that placeholder at position 0.

    Args:
        tokens: list of word strings.

    Returns:
        (bert_tokens_padding, mask, token_range, sen_length, tokens) where
        `bert_tokens_padding` is a long tensor of `args.max_sequence_len` ids
        (zero-filled past the encoded length), `mask` is 1 over the encoded
        positions, `token_range[i]` is the inclusive [first, last] sub-word
        position of word i, `sen_length` is the number of words including the
        placeholder, and `tokens` is the word list with the placeholder.

    Raises:
        AssertionError: if the per-word sub-word counts do not add up to the
            encoded length (presumably when the sentence was truncated).
    """
    tokens = ['##'] + tokens
    sen_length = len(tokens)

    bert_tokens = tokenizer.encode(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=args.max_sequence_len,
    )
    length = len(bert_tokens)

    # Fixed-size id buffer and attention mask; only the first `length` slots are real.
    bert_tokens_padding = torch.zeros(args.max_sequence_len).long()
    bert_tokens_padding[:length] = torch.tensor(bert_tokens, dtype=torch.long)
    mask = torch.zeros(args.max_sequence_len)
    mask[:length] = 1

    # Sub-word positions start at 1 (presumably to skip the leading special token).
    token_range = []
    cursor = 1
    for word in tokens:
        piece_count = len(tokenizer.encode(word, add_special_tokens=False))
        token_range.append([cursor, cursor + piece_count - 1])
        cursor += piece_count

    # The last word must end exactly two positions before the encoded length.
    assert length == token_range[-1][-1]+2
    return bert_tokens_padding,mask,token_range,sen_length,tokens

In [13]:
# Smoke-test convert() on a whitespace-tokenised example sentence.
bert_tokens_padding,mask,token_range,sen_length,tokens = convert('Again , competing products can generally intelligently split the wattage output between their available usb-c ports .'.split())

In [14]:
mask.shape

torch.Size([128])

In [15]:
# Single-sentence forward pass (batch dimension added with unsqueeze).
# Gradients are not needed for inference, so autograd is switched off.
with torch.no_grad():
    preds = model(bert_tokens_padding.unsqueeze(0), mask.unsqueeze(0))

In [16]:
# Keep the arg-max index along dim 3 for every cell and drop the size-1 batch dimension.
preds = torch.argmax(preds, dim=3)[0]

In [17]:
preds

tensor([[0, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [18]:
# Tag ids that get_spans reads from the diagonal of the prediction grid.
ASPECT_BEGIN=1
ASPECT_IN=2
OPINION_BEGIN=3
OPINION_IN=4
# NOTE(review): PAIR shares id 5 with '观点-负面' below and is not referenced in
# the visible cells — confirm it is unused.
PAIR=5
# Sentiment class ids; the keys are the emitted label strings
# (opinion-negative, opinion-neutral, opinion-positive).
sentiment2id = {'观点-负面': 5, '观点-中性':6, '观点-正面': 7}

# (begin, inside) tag pairs, in the order get_spans unpacks them.
ASPECT=[ASPECT_BEGIN,ASPECT_IN]
OPINION=[OPINION_BEGIN,OPINION_IN]

In [19]:
# Reverse lookup: sentiment class id -> label string.
id2sentiment = {v:k for k,v in sentiment2id.items()}

In [20]:
def get_spans(tags, length, token_range, type):
    """Collect word-level spans from the diagonal of a tag grid.

    Args:
        tags: 2-D indexable grid; only `tags[p][p]` is read, where `p` is the
            first sub-word position of each word.
        length: number of words to scan.
        token_range: per-word `[first, last]` sub-word positions.
        type: `(begin_tag, inside_tag)` pair identifying the span kind.

    Returns:
        List of inclusive `[start_word, end_word]` index pairs.
    """
    begin, mid = type
    spans = []
    open_at = None  # word index where the current span started, if any

    for idx in range(length):
        head, _tail = token_range[idx]
        label = tags[head][head]

        # -1 cells are ignored, and a non-begin tag of this type merely
        # continues whatever span is (or is not) open.
        if label == -1 or (label != begin and label in type):
            continue

        # Either a new span begins here or a foreign tag appears: in both
        # cases any open span ends on the previous word.
        if open_at is not None:
            spans.append([open_at, idx - 1])
        open_at = idx if label == begin else None

    if open_at is not None:
        spans.append([open_at, length - 1])
    return spans

In [21]:
def find_triplet(tags, aspect_spans, opinion_spans, token_ranges):
    """Pair aspect spans with opinion spans and vote on a sentiment.

    For every (aspect, opinion) pair, the grid cells addressed by the first
    sub-word of each aspect word and each opinion word are tallied by tag id.
    The cell is read as `tags[aspect][opinion]` when the aspect span starts
    before the opinion span, otherwise as `tags[opinion][aspect]`. Pairs with
    no vote for ids 5, 6 or 7 are dropped; otherwise the most-voted of those
    ids wins, ties going to the lowest id.

    A triplet whose aspect starts at word 0 is finally discarded when the same
    opinion span is also attached to an aspect that does not start at word 0.

    Returns:
        List of `[aspect_start, aspect_end, opinion_start, opinion_end, sentiment_id]`.
    """
    sentiment_ids = (5, 6, 7)
    candidates = []

    for al, ar in aspect_spans:
        for pl, pr in opinion_spans:
            votes = [0] * 8
            aspect_first = al < pl
            for i in range(al, ar + 1):
                for j in range(pl, pr + 1):
                    a_pos = token_ranges[i][0]
                    o_pos = token_ranges[j][0]
                    cell = tags[a_pos][o_pos] if aspect_first else tags[o_pos][a_pos]
                    votes[int(cell)] += 1

            # Counts are non-negative, so "no vote" means all three are zero.
            if not any(votes[5:]):
                continue
            # max() returns the first maximal element, i.e. the lowest id on ties.
            winner = max(sentiment_ids, key=votes.__getitem__)
            candidates.append([al, ar, pl, pr, winner])

    claimed_opinions = {(t[2], t[3]) for t in candidates if t[0] != 0}
    return [
        t for t in candidates
        if t[0] != 0 or (t[2], t[3]) not in claimed_opinions
    ]

In [22]:
# Decode aspect and opinion spans (word indices) from the same prediction grid.
predicted_aspect_spans = get_spans(preds, sen_length, token_range, ASPECT)
predicted_opinion_spans = get_spans(preds, sen_length, token_range, OPINION)

In [23]:
# Pair the decoded spans and attach a sentiment id to each surviving pair.
triplets = find_triplet(preds, predicted_aspect_spans,predicted_opinion_spans,token_range)

In [24]:
from copy import deepcopy

In [25]:
triplets

[[0, 0, 3, 16, 5]]

In [26]:
# Print each predicted triplet together with the s1/s2 token sequences, built
# the same way as the ones later passed to category_predict.
for al,ar,pl,pr,sentiment in triplets:
    aspect = ' '.join(tokens[al:ar+1])
    opinion = ' '.join(tokens[pl:pr+1])
    sentiment = id2sentiment[sentiment]
    
    
    print((aspect,opinion,sentiment))
    
    # Work on a copy so the marker insertion does not alter `tokens`.
    tt = deepcopy(tokens)
    # Wrap the aspect span in '#' markers and the opinion span in '$' markers.
    tt[al] = '# ' + tt[al]
    tt[ar] = tt[ar] + ' #'
    tt[pl] = '$ ' + tt[pl]
    tt[pr] = tt[pr] + ' $'
    # Parsed as `(aspect + opinion) if al > 0 else opinion`: an aspect starting
    # at index 0 (the '##' placeholder) is left out of s1.
    s1 = tokens[al:ar+1] + tokens[pl:pr+1] if al > 0 else tokens[pl:pr+1]
    # Drop the leading placeholder word from the marked sentence.
    s2 = tt[1:]
    print(s1,s2)
    

('##', 'competing products can generally intelligently split the wattage output between their available usb-c ports', '观点-负面')
['competing', 'products', 'can', 'generally', 'intelligently', 'split', 'the', 'wattage', 'output', 'between', 'their', 'available', 'usb-c', 'ports'] ['Again', ',', '$ competing', 'products', 'can', 'generally', 'intelligently', 'split', 'the', 'wattage', 'output', 'between', 'their', 'available', 'usb-c', 'ports $', '.']


In [27]:
# Rebind `tt` to an independent copy of the demo sentence's token list.
tt = deepcopy(tokens)

In [28]:

# Category label names; a label's id is its position in the list.
# The file is opened in a `with` block with an explicit UTF-8 encoding so the
# handle is closed promptly and decoding does not depend on the host locale.
with open('/root/autodl-nas/ABSA/通用/general-category-labels.json', encoding='utf-8') as label_file:
    ca_label_list = json.load(label_file)
ca_label2id = {lb:i for i,lb in enumerate(ca_label_list)}
ca_id2label = {v:k for k,v in ca_label2id.items()}
num_labels = len(ca_label_list)

In [29]:
# Inference device: use the GPU when one is visible, otherwise fall back to
# the CPU so the notebook still runs on hosts without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [30]:
def get_category_model():
    """Build the category classifier and load its fine-tuned weights.

    Returns:
        A `RobertaForSequenceClassification` with `num_labels` outputs, loaded
        from the fine-tuned checkpoint, placed on `device` and in evaluation
        mode.
    """
    model = RobertaForSequenceClassification.from_pretrained("/root/autodl-nas/pretrain-models/reviews-roberta/", num_labels=num_labels)
    # Read the checkpoint onto the CPU first so loading does not depend on the
    # device it was saved from; the model is moved to `device` afterwards.
    state_dict = torch.load("/root/autodl-nas/ABSA/通用/model-p69-f54.bin", map_location='cpu')
    model.load_state_dict(state_dict)
    model.to(device)
    # This model is only used for inference; make evaluation mode explicit.
    model.eval()
    return model

In [31]:
# Category classifier used by category_predict. The "newly initialized"
# warning printed below comes from `from_pretrained`; get_category_model loads
# the fine-tuned state dict right after it.
category_model = get_category_model()

Some weights of the model checkpoint at /root/autodl-nas/pretrain-models/reviews-roberta/ were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /root/autodl-nas/pretrain-models/reviews-roberta/ and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bia

In [32]:
def category_predict(s1,s2):
    """Classify (s1, s2) token-sequence pairs into a category.

    Args:
        s1: pre-split first sequence(s), as built from the aspect/opinion words.
        s2: pre-split second sequence(s), the marker-decorated sentence.

    Returns:
        One `[label, probability]` pair per input row, where `label` is the
        arg-max category name and `probability` its softmax score.
    """
    tokenized_inputs = tokenizer(s1,s2,truncation=True,max_length=100,padding='max_length',is_split_into_words=True,return_tensors='pt')
    tokenized_inputs = {k:v.to(device) for k,v in tokenized_inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = category_model(**tokenized_inputs)[0]

    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    pred_label = torch.argmax(logits, dim=-1).cpu().numpy()
    return [
        [ca_id2label[int(label_id)], probs[row][int(label_id)]]
        for row, label_id in enumerate(pred_label)
    ]

In [33]:
# Move the triplet model to the inference device; `pass` suppresses the echoed module repr.
model.to(device)
pass

In [34]:
def predict(text):
    """Extract (aspect, opinion, sentiment) triplets from one sentence.

    The text is lower-cased, word-tokenised with NLTK and cut to 110 words
    before being encoded by convert().

    Args:
        text: raw sentence string.

    Returns:
        List of `(aspect, opinion, sentiment_label)` string tuples; an empty
        list when convert() rejects the sentence. An aspect of '##' is the
        placeholder word at index 0.
    """
    text = text.lower()
    stokens = nltk.word_tokenize(text)[:110]
    try:
        bert_tokens_padding,mask,token_range,sen_length,tokens = convert(stokens)
    except Exception:
        # Best effort: sentences convert() cannot encode are skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return []

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        preds = model(bert_tokens_padding.unsqueeze(0).to(device), mask.unsqueeze(0).to(device))
    preds = torch.argmax(preds, dim=3)[0]
    preds = preds.cpu().numpy()

    predicted_aspect_spans = get_spans(preds, sen_length, token_range, ASPECT)
    predicted_opinion_spans = get_spans(preds, sen_length, token_range, OPINION)
    triplets = find_triplet(preds, predicted_aspect_spans,predicted_opinion_spans,token_range)

    finals = []
    for al,ar,pl,pr,sentiment in triplets:
        aspect = ' '.join(tokens[al:ar+1])
        opinion = ' '.join(tokens[pl:pr+1])
        finals.append((aspect,opinion,id2sentiment[sentiment]))
    return finals
    


In [35]:
def predict_for_judge(text):
    """Tag one sentence with BIO labels for manual review.

    The model sees the lower-cased sentence; the returned words keep their
    original casing whenever the cased tokenisation lines up with it.

    Args:
        text: raw sentence string.

    Returns:
        `(tags, tokens)` — two lists of equal length whose first entry belongs
        to the '##' placeholder. Aspect words are tagged 'B-评价维度' /
        'I-评价维度' and opinion words 'B-'/'I-' plus the sentiment label;
        everything else is 'O'. Later triplets overwrite earlier ones where
        they overlap. Returns None when convert() rejects the sentence.
    """
    stext = text.lower()
    stokens = nltk.word_tokenize(stext)[:110]

    try:
        bert_tokens_padding,mask,token_range,sen_length,tokens = convert(stokens)
    except Exception:
        # Best effort: skip sentences convert() cannot encode without also
        # swallowing KeyboardInterrupt / SystemExit.
        return None

    # Prefer the original casing for display, but only when tokenising the
    # cased text yields the same number of words; otherwise the span indices
    # (computed on the lower-cased words) would point at the wrong words.
    cased_tokens = ['##'] + nltk.word_tokenize(text)[:110]
    if len(cased_tokens) == len(tokens):
        tokens = cased_tokens

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        preds = model(bert_tokens_padding.unsqueeze(0).to(device), mask.unsqueeze(0).to(device))
    preds = torch.argmax(preds, dim=3)[0]
    preds = preds.cpu().numpy()

    predicted_aspect_spans = get_spans(preds, sen_length, token_range, ASPECT)
    predicted_opinion_spans = get_spans(preds, sen_length, token_range, OPINION)
    triplets = find_triplet(preds, predicted_aspect_spans,predicted_opinion_spans,token_range)

    tags = ['O'] * len(tokens)
    for al,ar,pl,pr,sentiment in triplets:
        tags[al:ar+1] = ['I-评价维度'] * (ar-al+1)
        tags[al] = 'B-评价维度'
        tags[pl:pr+1] = ['I-'+id2sentiment[sentiment]] * (pr-pl+1)
        tags[pl] = 'B-'+id2sentiment[sentiment]
    return tags,tokens
    


In [36]:
predict_for_judge('nice watch at a great price')

(['O', 'B-观点-正面', 'B-评价维度', 'O', 'O', 'B-观点-正面', 'I-观点-正面'],
 ['##', 'nice', 'watch', 'at', 'a', 'great', 'price'])

In [37]:
import pandas as pd

In [38]:
# Raw pen reviews; missing values become '' so `content` can be split as a string below.
df = pd.read_csv('/root/autodl-nas/ABSA/raw_reviews/pen.csv').fillna('')

In [39]:
# Draw up to 5000 reviews. The fixed seed makes the draw reproducible, and the
# size is capped so smaller files do not raise.
sample = df.sample(n=min(5000, len(df)), random_state=42)

In [40]:
sample.value_counts('star')

star
5    3822
4     377
1     329
3     278
2     194
dtype: int64

In [41]:
# From here on `sample` is a list of row dicts rather than a DataFrame.
sample = sample.to_dict('records')
# Split into 4–5 star reviews and everything else so the former can be down-sampled.
five = [x for x in sample if x['star'] in [4,5]]
others = [x for x in sample if x['star'] not in [4,5]]

In [42]:
import random

In [43]:
# Shuffle in place with a dedicated seeded generator so that the subset kept
# from `five` is the same on every run.
random.Random(42).shuffle(five)

In [44]:
# Keep every review outside 4–5 stars plus at most 500 of the 4–5 star ones.
sample = others + five[:500]

In [45]:
len(sample)

1301

In [46]:
import random

In [47]:
# Mix the two groups; seeded so the resulting order is reproducible.
random.Random(42).shuffle(sample)

In [48]:
# sample = [json.loads(x) for x in open('/root/autodl-tmp/xueyou/ABSA/absa待打标数据/juying_sample_1693.jsonl')]

In [49]:
import langdetect

In [50]:
from tqdm import tqdm

In [51]:
# Split every sampled review into sentences and keep the English ones.
# langdetect is non-deterministic unless its factory is seeded.
langdetect.DetectorFactory.seed = 0
texts = []
for x in tqdm(sample):
    content = x['content']
    for p in content.split('\n'):
        for s in nltk.sent_tokenize(p):
            # Very short fragments are skipped.
            if len(s) < 5:
                continue
            try:
                if langdetect.detect(s) !='en':
                    continue
            except Exception:
                # Detection failed for this sentence: skip it, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue
            texts.append({'id':x['reviewId'],'star':x['star'],'text':s})

100%|██████████| 1301/1301 [00:24<00:00, 53.38it/s]


In [52]:
len(texts)

3665

In [53]:
texts[556]

{'id': 'R6VHNJZI2YI5F',
 'star': 2,
 'text': 'Pens only 2/3 to 3/4 filled with ink.'}

In [54]:
predict(texts[556]['text'])

[('pens', 'only 2/3 to 3/4 filled with ink', '观点-负面')]

In [55]:
# Tag the first 3000 sentences. predict_for_judge returns None when convert()
# fails, and those sentences are left out.
data = []
for x in tqdm(texts[:3000]):
    ret = predict_for_judge(x['text'])
    if ret:
        data.append((x,ret))

100%|██████████| 3000/3000 [00:34<00:00, 88.13it/s] 


In [56]:
def get_span(words,tags):
    """Turn a BIO-style tag sequence into `(start, end, label)` spans.

    A tag whose first character is 'B' or 'S' opens a span labelled `tag[2:]`;
    'O' closes any open span; any other tag extends the current span (or opens
    an unlabelled one, with label '', when none is open). `end` is exclusive.

    Args:
        words: sequence of tokens (only its length is used, via zip with tags).
        tags: sequence of tag strings, aligned with `words`.

    Returns:
        List of `(start, end, label)` tuples in order of appearance.
    """
    spans = []
    run_length = 0   # number of words collected in the span being built
    label = ''
    pos = 0

    for pos, (_word, t) in enumerate(zip(words, tags)):
        prefix = t[0]
        if prefix not in ('B', 'S', 'O'):
            run_length += 1
            continue

        # A boundary tag closes whatever span was open before this word.
        if run_length:
            spans.append((pos - run_length, pos, label))

        if prefix == 'O':
            run_length, label = 0, ''
        else:
            run_length, label = 1, t[2:]

    # Flush a span that runs to the end of the sequence.
    if run_length:
        spans.append((pos + 1 - run_length, pos + 1, label))
    return spans

In [57]:
len(data)

3000

In [58]:
# Write one JSON record per tagged sentence, with labels as character offsets
# into the space-joined token text. The records contain non-ASCII label names
# and are written with ensure_ascii=False, so the file encoding is fixed to
# UTF-8 instead of relying on the host locale.
with open('/root/autodl-tmp/xueyou/ABSA/absa待打标数据/pen_judge_ABSA.jsonl','w',encoding='utf-8') as f:
    for doc,(tag,tokens) in data:
        # Drop the leading '##' placeholder and its tag.
        tokens = tokens[1:]
        tag = tag[1:]
        spans = get_span(tokens,tag)
        label = []
        for s,e,l in spans:
            # Token indices -> character offsets; +1 skips the joining space
            # that precedes any span not starting at the first token.
            start_char = len(' '.join(tokens[:s])) + 1 if s > 0 else 0
            end_char = len(' '.join(tokens[:e]))
            label.append([start_char,end_char,l])
        f.write(json.dumps({'text':' '.join(tokens),'label':label,'star':doc['star'],'reviewId':doc['id']},ensure_ascii=False) + '\n')

In [59]:
# Inspect the offsets left in `label` by the last record written in the loop above.
label

[[6, 10, '评价维度'], [24, 29, '观点-正面'], [126, 130, '评价维度'], [131, 155, '观点-负面']]

In [60]:
def batch_predict(texts):
    """Run triplet extraction on a batch of sentences in one forward pass.

    Args:
        texts: iterable of dicts with at least a 'text' key.

    Returns:
        One `(item, tokens, triplets)` tuple per sentence that convert()
        accepted, where `item` is the input dict, `tokens` the lower-cased
        words with the '##' placeholder at index 0, and `triplets` the output
        of find_triplet. Sentences convert() rejects are skipped; an empty list
        is returned when none remain.
    """
    batch = []
    for item in texts:
        stokens = nltk.word_tokenize(item['text'].lower())[:110]
        try:
            bert_tokens_padding,mask,token_range,sen_length,tokens = convert(stokens)
        except Exception:
            # Best effort: skip this sentence, but do not swallow
            # KeyboardInterrupt / SystemExit.
            continue
        batch.append((item,bert_tokens_padding,mask,token_range,sen_length,tokens))

    # torch.stack raises on an empty list, so bail out when nothing survived.
    if not batch:
        return []

    input_ids = torch.stack([x[1] for x in batch]).to(device)
    attention_mask = torch.stack([x[2] for x in batch]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        preds = model(input_ids, attention_mask)
    preds = torch.argmax(preds, dim=3)
    preds = preds.cpu().numpy()

    finals = []
    for sentence_preds, (item, _ids, _mask, token_range, sen_length, tokens) in zip(preds, batch):
        predicted_aspect_spans = get_spans(sentence_preds, sen_length, token_range, ASPECT)
        predicted_opinion_spans = get_spans(sentence_preds, sen_length, token_range, OPINION)
        triplets = find_triplet(sentence_preds, predicted_aspect_spans,predicted_opinion_spans,token_range)
        finals.append((item,tokens,triplets))
    return finals
    

In [61]:
batch_predict([{'text':'I love the watch.','id':1}])

[({'text': 'I love the watch.', 'id': 1},
  ['##', 'i', 'love', 'the', 'watch', '.'],
  [[4, 4, 2, 2, 7]])]

In [62]:
import pandas as pd

In [63]:
from tqdm import tqdm

In [64]:
# Number of sentences per batch_predict call and of triplets per category_predict call below.
batch_size = 64

In [65]:
from glob import glob

In [66]:
# Run the whole pipeline over every review CSV and write `<name>_absa.csv`
# beside it: sentence split -> triplet extraction -> category classification.
for fname in glob('/root/autodl-nas/ABSA/Anker_reviews/*.csv'):
    # Skip one excluded input and any `_absa.csv` output of an earlier run.
    if 'acdc.csv' in fname or 'absa' in fname:
        continue
    print('process',fname)
    finals = []
    df = pd.read_csv(fname).fillna('')
    print('raw reviews',len(df))

    # 1) Split each review into sentences of at least 5 characters.
    texts = []
    for x in tqdm(df.to_dict('records')):
        content = x['content']
        for p in content.split('\n'):
            for s in nltk.sent_tokenize(p):
                if len(s) < 5:
                    continue
                texts.append({'id':x['reviewId'],'star':x['star'],'text':s})
    print('raw texts',len(texts))

    # 2) Extract triplets batch by batch.
    for i in tqdm(range(0,len(texts),batch_size)):
        batch = texts[i:i+batch_size]
        outs = batch_predict(batch)
        finals.extend(outs)

    # 3) Flatten the triplets into parallel lists and build the classifier inputs.
    s1s,s2s = [],[]
    idxs = []
    aspects,opinions,sentiments = [],[],[]
    for item,tokens,triplets in finals:
        for al,ar,pl,pr,sentiment in triplets:
            aspect = ' '.join(tokens[al:ar+1])
            opinion = ' '.join(tokens[pl:pr+1])
            sentiment = id2sentiment[sentiment]
            aspects.append(aspect)
            opinions.append(opinion)
            sentiments.append(sentiment)
            # Mark the aspect with '#' and the opinion with '$' on a copy of the sentence.
            tt = deepcopy(tokens)
            tt[al] = '# ' + tt[al]
            tt[ar] = tt[ar] + ' #'
            tt[pl] = '$ ' + tt[pl]
            tt[pr] = tt[pr] + ' $'
            # Parsed as `(aspect + opinion) if al > 0 else opinion`: an aspect at
            # index 0 (the '##' placeholder) is left out of s1.
            s1 = tokens[al:ar+1] + tokens[pl:pr+1] if al > 0 else tokens[pl:pr+1]
            s2 = tt[1:]
            # Re-split so each inserted marker becomes its own token.
            s2 = ' '.join(s2).split()
            s1s.append(s1)
            s2s.append(s2)
            idxs.append(item)

    # 4) Classify every triplet's category in batches.
    outs = []
    for i in tqdm(range(0,len(idxs),batch_size)):
        categorys = category_predict(s1s[i:i+batch_size],s2s[i:i+batch_size])
        for j,((lb,prob),aspect,opinion,sentiment) in enumerate(zip(categorys,aspects[i:i+batch_size],opinions[i:i+batch_size],sentiments[i:i+batch_size])):
            outs.append((idxs[i+j],aspect,opinion,sentiment,lb,prob))

    # 5) Keep rows whose category probability is at least 0.2. The column list
    # is passed explicitly so a file with no surviving rows still gets a header.
    df = pd.DataFrame(
        [{'id':x[0]['id'],'aspect':x[1],'opinion':x[2],'sentiment':x[3],'category':x[4]} for x in outs if x[-1]>=0.2],
        columns=['id','aspect','opinion','sentiment','category'],
    )
    df.to_csv(fname.replace('.csv','_absa.csv'),index=False)

In [67]:
# NOTE(review): duplicates the write at the end of the loop above. `fname` is
# only bound by that loop, so this cell raises NameError when the loop has not
# run in the current kernel — candidate for removal.
df.to_csv(fname.replace('.csv','_absa.csv'),index=False)

NameError: name 'fname' is not defined