In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines import SUPPORTED_TASKS

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

In [4]:
# Walk the pipeline task registry and print each task name next to its config dict.
for task_name, task_config in SUPPORTED_TASKS.items():
    print(task_name, task_config)

audio-classification {'impl': <class 'transformers.pipelines.audio_classification.AudioClassificationPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForAudioClassification'>,), 'default': {'model': {'pt': ('superb/wav2vec2-base-superb-ks', '372e048')}}, 'type': 'audio'}
automatic-speech-recognition {'impl': <class 'transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForCTC'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSpeechSeq2Seq'>), 'default': {'model': {'pt': ('facebook/wav2vec2-base-960h', '55bb623')}}, 'type': 'multimodal'}
text-to-audio {'impl': <class 'transformers.pipelines.text_to_audio.TextToAudioPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForTextToWaveform'>, <class 'transformers.models.auto.modeling_auto.AutoModelForTextToSpectrogram'>), 'default': {'model': {'pt': ('suno

In [5]:
# Load the tokenizer from the checkpoint path below, then display its repr.
tokenizer_path = "./models/liam168/c2-roberta-base-finetuned-dianping-chinese"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer



BertTokenizerFast(name_or_path='./models/liam168/c2-roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
# Display the tokenizer's vocabulary: a dict mapping each token string to its integer id.
# NOTE(review): this renders the whole mapping (vocab_size=21128 per the tokenizer repr);
# consider showing only a slice when sharing the notebook.
tokenizer.vocab

{'幌': 2389,
 'ても': 9272,
 '芡': 5699,
 '##摆': 16087,
 '猝': 4340,
 'wi': 8541,
 '##隕': 20450,
 '狱': 4328,
 '弊': 2464,
 '罩': 5388,
 '蘋': 5981,
 '狈': 4314,
 '##韶': 20568,
 '##离': 17952,
 'welcome': 12759,
 '##0': 8129,
 '##鹳': 20974,
 '＠': 8044,
 '##ts': 8723,
 '##嗶': 14699,
 '吶': 1428,
 '嚴': 1713,
 '##彪': 15564,
 '##ai': 8982,
 '1920': 9208,
 '##佟': 13928,
 '##゜': 13678,
 '馆': 7667,
 '##tos': 12920,
 '##ᅦ': 13473,
 '繞': 5254,
 '秦': 4912,
 '咕': 1475,
 '歧': 3637,
 '莠': 5807,
 '##2007': 10604,
 'show': 8658,
 '##嶂': 15379,
 'tag': 8801,
 '##肩': 18561,
 '##force': 10488,
 '##楚': 16561,
 '瘦': 4607,
 '##►': 13612,
 '##吊': 14453,
 '##髦': 20828,
 '##graphy': 12872,
 '勁': 1233,
 '##ヶ': 10569,
 'ａ': 8051,
 '臭': 5634,
 '##跛': 19711,
 '##饽': 20718,
 '珮': 4409,
 '水': 3717,
 '##ision': 12556,
 '##ha': 8778,
 '止': 3632,
 '卦': 1308,
 'てきます': 11397,
 '##≫': 13550,
 '盹': 4686,
 '譙': 6353,
 '営': 1612,
 '寥': 2178,
 '澜': 4073,
 '##ました': 8861,
 '##単': 14356,
 '瞄': 4730,
 'される': 12033,
 'g5': 12492,
 '##杀': 163

In [7]:
# Sample sentence reused by the following cells.
sen = "我有一个梦想，能让大家都无忧无虑"
# tokenize() returns the token strings only (no ids, no [CLS]/[SEP]);
# for this sentence every character becomes its own token.
tokens = tokenizer.tokenize(sen)
tokens

['我',
 '有',
 '一',
 '个',
 '梦',
 '想',
 '，',
 '能',
 '让',
 '大',
 '家',
 '都',
 '无',
 '忧',
 '无',
 '虑']

In [8]:
# Convert the sentence straight to vocabulary ids. With add_special_tokens=True the
# sequence is wrapped in [CLS] (id 101) at the start and [SEP] (id 102) at the end.
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101,
 2769,
 3300,
 671,
 702,
 3457,
 2682,
 8024,
 5543,
 6375,
 1920,
 2157,
 6963,
 3187,
 2569,
 3187,
 5991,
 102]

In [9]:
# Decode the ids back to text twice: once keeping the special tokens ([CLS]/[SEP]),
# once stripping them.
# The decoded strings get their own names instead of being assigned to `sen`, so the
# input sentence is not silently replaced by decoder output. That keeps this cell
# idempotent and lets later cells that encode `sen` work on the original text.
decoded_with_special = tokenizer.decode(ids, skip_special_tokens=False)
print(decoded_with_special)
decoded_plain = tokenizer.decode(ids, skip_special_tokens=True)
print(decoded_plain)

[CLS] 我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑 [SEP]
我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑


In [10]:
# Encode the sentence at two target lengths and decode each result for inspection:
# a length of 5 forces truncation, a length of 25 forces [PAD] filling on the right.
for max_len in (5, 25):
    ids = tokenizer.encode(sen, max_length=max_len, padding='max_length', truncation=True)
    print(ids)
    print(tokenizer.decode(ids, skip_special_tokens=False))

[101, 2769, 3300, 671, 102]
[CLS] 我 有 一 [SEP]
[101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0]
[CLS] 我 有 一 个 梦 想 ， 能 让 大 家 都 无 忧 无 虑 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [11]:
# Calling the tokenizer directly yields input_ids, token_type_ids and attention_mask
# in one result; the attention mask is 0 on the padded positions.
encode_options = {'max_length': 25, 'padding': 'max_length', 'truncation': True}
inputs = tokenizer(sen, **encode_options)
inputs

{'input_ids': [101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [12]:
# Batch mode: pass a list of sentences and get one padded row per sentence
# for every field of the encoding.
sens = [
    '我有一个梦想',
    '所有人都能幸福',
    '但是感觉实现不了',
]
res = tokenizer(sens, truncation=True, padding='max_length', max_length=25)
print(res)

{'input_ids': [[101, 2769, 3300, 671, 702, 3457, 2682, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2792, 3300, 782, 6963, 5543, 2401, 4886, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 852, 3221, 2697, 6230, 2141, 4385, 679, 749, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [13]:
sen = '我有一个dreaming！'