In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines import SUPPORTED_TASKS

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
for k, v in SUPPORTED_TASKS.items():
    print(k, v)

audio-classification {'impl': <class 'transformers.pipelines.audio_classification.AudioClassificationPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForAudioClassification'>,), 'default': {'model': {'pt': ('superb/wav2vec2-base-superb-ks', '372e048')}}, 'type': 'audio'}
automatic-speech-recognition {'impl': <class 'transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForCTC'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSpeechSeq2Seq'>), 'default': {'model': {'pt': ('facebook/wav2vec2-base-960h', '22aad52')}}, 'type': 'multimodal'}
text-to-audio {'impl': <class 'transformers.pipelines.text_to_audio.TextToAudioPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForTextToWaveform'>, <class 'transformers.models.auto.modeling_auto.AutoModelForTextToSpectrogram'>), 'default': {'model': {'pt': ('suno

In [4]:
tokenizer = AutoTokenizer.from_pretrained("./models/liam168/c2-roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='./models/liam168/c2-roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
tokenizer.vocab

{'孺': 2120,
 '2018': 8271,
 '摯': 3041,
 '##榛': 16584,
 '迫': 6833,
 '鄺': 6976,
 '##圓': 14812,
 '錢': 7092,
 '瀞': 4111,
 '權': 3609,
 '##de': 8510,
 '##ps': 8525,
 'lohas': 9757,
 'po': 8559,
 '##梁': 16505,
 '狹': 4330,
 '衆': 6120,
 '跎': 6650,
 'も': 571,
 '慕': 2710,
 '##央': 14982,
 '懶': 2754,
 '##tz': 10228,
 '##陀': 20408,
 '捆': 2928,
 '吖': 1407,
 '##▇': 13600,
 '##寵': 15244,
 '##酌': 20037,
 'mc': 10341,
 '##賜': 19598,
 'bloomberg': 10313,
 '咪': 1488,
 '専': 2197,
 '業': 3511,
 '藻': 5976,
 '##养': 14132,
 '##嗔': 14682,
 '弋': 2465,
 '叟': 1362,
 '5500': 11298,
 '疼': 4563,
 '篠': 5069,
 '##长': 20327,
 '▲topjul': 10572,
 'chris': 9582,
 '橘': 3580,
 '##半': 14345,
 '##越': 19689,
 '##レス': 12696,
 '309': 11289,
 '饍': 7639,
 '∀': 376,
 '##鹏': 20962,
 '吒': 1404,
 '帕': 2364,
 '抿': 2854,
 'ч': 255,
 '☺': 484,
 '##宋': 15186,
 'echo': 11701,
 'p3': 12454,
 'return': 12330,
 '郜': 6949,
 '##瀬': 17172,
 '捻': 2951,
 '##000': 9086,
 '楫': 3510,
 '##珐': 17456,
 '##吏': 14458,
 '##燕': 17299,
 'expedia': 12317,
 'engi

In [6]:
sen = "我有一个梦想，能让大家都无忧无虑"
tokens = tokenizer.tokenize(sen)
tokens

['我',
 '有',
 '一',
 '个',
 '梦',
 '想',
 '，',
 '能',
 '让',
 '大',
 '家',
 '都',
 '无',
 '忧',
 '无',
 '虑']

In [7]:
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101,
 2769,
 3300,
 671,
 702,
 3457,
 2682,
 8024,
 5543,
 6375,
 1920,
 2157,
 6963,
 3187,
 2569,
 3187,
 5991,
 102]

In [8]:
sen = tokenizer.decode(ids, skip_special_tokens=False)
print(sen)
sen = tokenizer.decode(ids, skip_special_tokens=True)
print(sen)

[CLS] 我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑 [SEP]
我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑


In [9]:
ids = tokenizer.encode(sen, max_length=5, padding='max_length', truncation=True)
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=False))
ids = tokenizer.encode(sen, max_length=25, padding='max_length', truncation=True)
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=False))

[101, 2769, 3300, 671, 102]
[CLS] 我 有 一 [SEP]
[101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0]
[CLS] 我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [10]:
inputs = tokenizer(sen, max_length=25, padding='max_length', truncation=True)
inputs

{'input_ids': [101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [11]:
sens = ['我有一个梦想', '所有人都能幸福', '但是感觉实现不了']
res = tokenizer(sens, max_length=25, padding='max_length', truncation=True)
print(res)

{'input_ids': [[101, 2769, 3300, 671, 702, 3457, 2682, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2792, 3300, 782, 6963, 5543, 2401, 4886, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 852, 3221, 2697, 6230, 2141, 4385, 679, 749, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [12]:
sen = '我有一个dreaming！'

In [13]:
data = pd.read_csv("./data/ChnSentiCorp_htl_all.csv")
data.head

<bound method NDFrame.head of       label                                             review
0         1  距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较...
1         1                       商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2         1         早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3         1  宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4         1               CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风
...     ...                                                ...
7761      0  尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...
7762      0  盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...
7763      0  看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...
7764      0  我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...
7765      0  说实在的我很失望，之前看了其他人的点评后觉得还可以才去的，结果让我们大跌眼镜。我想这家酒店以...

[7766 rows x 2 columns]>

In [14]:
data = data.dropna()
data.head

<bound method NDFrame.head of       label                                             review
0         1  距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较...
1         1                       商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2         1         早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3         1  宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4         1               CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风
...     ...                                                ...
7761      0  尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...
7762      0  盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...
7763      0  看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...
7764      0  我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...
7765      0  说实在的我很失望，之前看了其他人的点评后觉得还可以才去的，结果让我们大跌眼镜。我想这家酒店以...

[7765 rows x 2 columns]>

In [15]:
class MyDataset(Dataset):
    def __init__(self, data):
        super(MyDataset, self).__init__()
        self.data = data

    def __getitem__(self, index):
        return self.data.iloc[index]['review'], self.data.iloc[index]['label']

    def __len__(self):
        return len(self.data)


dataset = MyDataset(data)
for i in range(5):
    print(dataset[i])

('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', 1)
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', 1)
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', 1)
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', 1)


In [16]:
train_dataset, val_dataset = random_split(dataset, [0.9, 0.1])
print(len(train_dataset), len(val_dataset))

6989 776


In [17]:
tokenizer = AutoTokenizer.from_pretrained('models/hfl/rbt3')


def collate_fn(batch):
    texts, labels = zip(*batch)
    inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    inputs['labels'] = torch.tensor(labels)
    return inputs

In [18]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
next(enumerate(train_dataloader))[1]

{'input_ids': tensor([[ 101, 5016, 1394,  ..., 2145,  782,  102],
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 3193, 7623,  ...,    0,    0,    0],
        ...,
        [ 101, 3025, 4923,  ...,    0,    0,    0],
        [ 101,  817, 3419,  ...,    0,    0,    0],
        [ 101, 3302, 1218,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1])}

In [19]:
model = AutoModelForSequenceClassification.from_pretrained("./models/hfl/rbt3").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def evaluate():
    model.eval()
    acc_num = 0
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        pred = torch.argmax(outputs.logits, dim=-1)
        acc_num += (pred.long() == batch['labels'].long()).float().sum()
    return acc_num / len(val_dataset)

In [21]:
def train(epoch=3, log_step=50):
    global_step = 0
    for i in range(epoch):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f'ep:{i}, global_step:{global_step}, loss:{outputs.loss.item()}')
            global_step += 1
        acc = evaluate()
        print(f'ep:{i}, acc:{acc}')

In [23]:
train()
torch.save(model, './models/my_model/classification.pth')

ep:0, global_step:0, loss:0.37253934144973755
ep:0, global_step:100, loss:0.046587929129600525
ep:0, global_step:200, loss:0.1010926142334938
ep:0, global_step:300, loss:0.06908750534057617
ep:0, global_step:400, loss:0.16498416662216187
ep:0, acc:0.9085051417350769
ep:1, global_step:500, loss:0.10629517585039139
ep:1, global_step:600, loss:0.09259515255689621
ep:1, global_step:700, loss:0.04231934994459152
ep:1, global_step:800, loss:0.15414129197597504
ep:1, acc:0.9085051417350769
ep:2, global_step:900, loss:0.0320313386619091
ep:2, global_step:1000, loss:0.03746272623538971
ep:2, global_step:1100, loss:0.21679261326789856
ep:2, global_step:1200, loss:0.1376654952764511
ep:2, global_step:1300, loss:0.35856297612190247
ep:2, acc:0.9046391248703003


In [31]:
# test
model = torch.load('./models/my_model/classification.pth')
sen1 = '我喜欢这家酒店的服务'
sen2 = '我认为这家酒店很糟糕'
id2label = {0: '差评', 1: '好评'}
model.eval()
with torch.no_grad():
    inputs = tokenizer(sen1, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=-1)
    print(f'sen:{sen1},pred:{id2label[pred.item()]}')
    inputs = tokenizer(sen2, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=-1)
    print(f'sen:{sen2},pred:{id2label[pred.item()]}')

  model = torch.load('./models/my_model/classification.pth')


sen:我喜欢这家酒店的服务,pred:好评
sen:我认为这家酒店很糟糕,pred:差评
