In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines import SUPPORTED_TASKS

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
for k, v in SUPPORTED_TASKS.items():
    print(k, v)

audio-classification {'impl': <class 'transformers.pipelines.audio_classification.AudioClassificationPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForAudioClassification'>,), 'default': {'model': {'pt': ('superb/wav2vec2-base-superb-ks', '372e048')}}, 'type': 'audio'}
automatic-speech-recognition {'impl': <class 'transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForCTC'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSpeechSeq2Seq'>), 'default': {'model': {'pt': ('facebook/wav2vec2-base-960h', '55bb623')}}, 'type': 'multimodal'}
text-to-audio {'impl': <class 'transformers.pipelines.text_to_audio.TextToAudioPipeline'>, 'tf': (), 'pt': (<class 'transformers.models.auto.modeling_auto.AutoModelForTextToWaveform'>, <class 'transformers.models.auto.modeling_auto.AutoModelForTextToSpectrogram'>), 'default': {'model': {'pt': ('suno

In [5]:
tokenizer = AutoTokenizer.from_pretrained("./models/liam168/c2-roberta-base-finetuned-dianping-chinese")
tokenizer



BertTokenizerFast(name_or_path='./models/liam168/c2-roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
tokenizer.vocab

{'Âπå': 2389,
 '„Å¶„ÇÇ': 9272,
 'Ëä°': 5699,
 '##ÊëÜ': 16087,
 'Áåù': 4340,
 'wi': 8541,
 '##Èöï': 20450,
 'Áã±': 4328,
 'Âºä': 2464,
 'ÁΩ©': 5388,
 'Ëòã': 5981,
 'Áãà': 4314,
 '##Èü∂': 20568,
 '##Á¶ª': 17952,
 'welcome': 12759,
 '##0': 8129,
 '##Èπ≥': 20974,
 'Ôº†': 8044,
 '##ts': 8723,
 '##Âó∂': 14699,
 'Âê∂': 1428,
 'Âö¥': 1713,
 '##ÂΩ™': 15564,
 '##ai': 8982,
 '1920': 9208,
 '##‰Ωü': 13928,
 '##„Çú': 13678,
 'È¶Ü': 7667,
 '##tos': 12920,
 '##·Ö¶': 13473,
 'Áπû': 5254,
 'Áß¶': 4912,
 'Âíï': 1475,
 'Ê≠ß': 3637,
 'Ëé†': 5807,
 '##2007': 10604,
 'show': 8658,
 '##Â∂Ç': 15379,
 'tag': 8801,
 '##ËÇ©': 18561,
 '##force': 10488,
 '##Ê•ö': 16561,
 'Áò¶': 4607,
 '##‚ñ∫': 13612,
 '##Âêä': 14453,
 '##È´¶': 20828,
 '##graphy': 12872,
 'ÂãÅ': 1233,
 '##„É∂': 10569,
 'ÔΩÅ': 8051,
 'Ëá≠': 5634,
 '##Ë∑õ': 19711,
 '##È•Ω': 20718,
 'ÁèÆ': 4409,
 'Ê∞¥': 3717,
 '##ision': 12556,
 '##ha': 8778,
 'Ê≠¢': 3632,
 'Âç¶': 1308,
 '„Å¶„Åç„Åæ„Åô': 11397,
 '##‚â´': 13550,
 'Áõπ': 4686,
 'Ë≠ô': 6353,
 'Âñ∂': 1612,

In [7]:
sen = "ÊàëÊúâ‰∏Ä‰∏™Ê¢¶ÊÉ≥ÔºåËÉΩËÆ©Â§ßÂÆ∂ÈÉΩÊó†ÂøßÊó†Ëôë"
tokens = tokenizer.tokenize(sen)
tokens

['Êàë',
 'Êúâ',
 '‰∏Ä',
 '‰∏™',
 'Ê¢¶',
 'ÊÉ≥',
 'Ôºå',
 'ËÉΩ',
 'ËÆ©',
 'Â§ß',
 'ÂÆ∂',
 'ÈÉΩ',
 'Êó†',
 'Âøß',
 'Êó†',
 'Ëôë']

In [8]:
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101,
 2769,
 3300,
 671,
 702,
 3457,
 2682,
 8024,
 5543,
 6375,
 1920,
 2157,
 6963,
 3187,
 2569,
 3187,
 5991,
 102]

In [9]:
sen = tokenizer.decode(ids, skip_special_tokens=False)
print(sen)
sen = tokenizer.decode(ids, skip_special_tokens=True)
print(sen)

[CLS] Êàë Êúâ ‰∏Ä ‰∏™ Ê¢¶ ÊÉ≥ Ôºå ËÉΩ ËÆ© Â§ß ÂÆ∂ ÈÉΩ Êó† Âøß Êó† Ëôë [SEP]
Êàë Êúâ ‰∏Ä ‰∏™ Ê¢¶ ÊÉ≥ Ôºå ËÉΩ ËÆ© Â§ß ÂÆ∂ ÈÉΩ Êó† Âøß Êó† Ëôë


In [10]:
ids = tokenizer.encode(sen, max_length=5, padding='max_length', truncation=True)
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=False))
ids = tokenizer.encode(sen, max_length=25, padding='max_length', truncation=True)
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=False))

[101, 2769, 3300, 671, 102]
[CLS] Êàë Êúâ ‰∏Ä [SEP]
[101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0]
[CLS] Êàë Êúâ ‰∏Ä ‰∏™ Ê¢¶ ÊÉ≥ Ôºå ËÉΩ ËÆ© Â§ß ÂÆ∂ ÈÉΩ Êó† Âøß Êó† Ëôë [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [11]:
inputs = tokenizer(sen, max_length=25, padding='max_length', truncation=True)
inputs

{'input_ids': [101, 2769, 3300, 671, 702, 3457, 2682, 8024, 5543, 6375, 1920, 2157, 6963, 3187, 2569, 3187, 5991, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [12]:
sens = ['ÊàëÊúâ‰∏Ä‰∏™Ê¢¶ÊÉ≥', 'ÊâÄÊúâ‰∫∫ÈÉΩËÉΩÂπ∏Á¶è', '‰ΩÜÊòØÊÑüËßâÂÆûÁé∞‰∏ç‰∫Ü']
res = tokenizer(sens, max_length=25, padding='max_length', truncation=True)
print(res)

{'input_ids': [[101, 2769, 3300, 671, 702, 3457, 2682, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2792, 3300, 782, 6963, 5543, 2401, 4886, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 852, 3221, 2697, 6230, 2141, 4385, 679, 749, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [13]:
sen = 'ÊàëÊúâ‰∏Ä‰∏™dreamingÔºÅ'