In [1]:
import numpy as np
import pandas as pd

from janome.tokenizer import Tokenizer

from transformers import AutoTokenizer

In [5]:
# List the contents of the tmp directory (IPython "!" shell escape).
!ls tmp

[34mdata[m[m                     hatespeech_test.csv      hatespeech_train_DAI.csv
[34mhatespeech[m[m               hatespeech_test_DAI.csv
hatespeech.zip           hatespeech_train.csv


In [6]:
# Read the train and test splits from tmp/ and show their (rows, columns) shapes.
df_train, df_test = (
    pd.read_csv('tmp/hatespeech_train.csv'),
    pd.read_csv('tmp/hatespeech_test.csv'),
)
(df_train.shape, df_test.shape)

((4656, 5), (600, 4))

In [7]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,id,source,text,label,hold
0,b1b0d5e6c,newsplus,手引きしたのは高い確率で大院君だよなぁ,0,2
1,a5e29c5f1,livejupiter,いや別にワイが困るわけじゃないからええけど対応大変やなぁと思ってな,0,1
2,bd72cb57e,newsplus,そこで家族会すら総スカンを食らった極左を出すあたり頭が逝ってるな\n結局動かぬ証拠だから動か...,0,4
3,66387bae8,news4vip,もともとB'zが好きだったんだが松本のソロアルバムでtake5カバーしててな\nそれで原曲聴...,0,1
4,b80dc3a94,livejupiter,スタンドって本体の内面現すんだよな\n\nいろいろ想像するとおもろいわ\n何でDIOは時止め...,0,2


In [8]:
# Preview the first five rows of the test split.
df_test.head(n=5)

Unnamed: 0,id,source,text,label
0,e3d3e578b,news4vip,まぁ、俺の言ってることは余りあてにしないでくれwwww\n\n必ずいい人は現れるよ,0
1,a728f26db,news4vip,すまないがそれはレンジのほうか？\nそれともトースター？,0
2,c53e66dc9,livejupiter,そら大不況で独り勝ちした金持ちを殺してしかも自分もちゃんと腹切ってるからな\n今で言うならリ...,0
3,fb3b600fb,newsplus,外国や外国人に税金や公的資金を使うのは違法と言ってくれたら、民進党の支持率は爆上げだろうな。,0
4,ef9cd1326,newsplus,中国人は、日本の年金制度にものらないとw,0


### Janome（分ち書き）

In [33]:
# Janome tokenizer created with wakati=True (word-segmentation mode).
# NOTE(review): presumably tokenize() then yields plain surface strings rather
# than Token objects — confirm against the Janome documentation.
t_janome = Tokenizer(wakati=True)
t_janome

<janome.tokenizer.Tokenizer at 0x7fef6c86a190>

In [34]:
# Tokenize every text with Janome, producing one token list per row.
# Iterate over the 'text' column directly rather than fetching rows with
# df['text'][i] for i in range(n): that form is a label lookup, so it only works
# while the frame keeps its default 0..n-1 index, and it re-resolves the column
# on every iteration. list(...) materializes the tokenizer output per row.
train_janome_token_list = [list(t_janome.tokenize(text)) for text in df_train['text']]
test_janome_token_list = [list(t_janome.tokenize(text)) for text in df_test['text']]
len(train_janome_token_list), len(test_janome_token_list)

(4656, 600)

In [35]:
# Concatenate the per-row Janome token lists into one flat token stream per split.
train_janome_token_flatten = [
    token
    for row_tokens in train_janome_token_list
    for token in row_tokens
]
test_janome_token_flatten = [
    token
    for row_tokens in test_janome_token_list
    for token in row_tokens
]
(len(train_janome_token_flatten), len(test_janome_token_flatten))

(131587, 17175)

In [36]:
# Deduplicate the flat Janome token streams.
train_janome_token = [*set(train_janome_token_flatten)]   # unique tokens occurring in train
test_janome_token = [*set(test_janome_token_flatten)]     # unique tokens occurring in test
(len(train_janome_token), len(test_janome_token))

(14261, 3985)

In [50]:
# Unique Janome tokens across train and test combined (total vocabulary).
all_janome_token = list({*train_janome_token, *test_janome_token})
len(all_janome_token)

15227

In [39]:
# Split-exclusive Janome vocabularies, computed as set differences.
only_train_janome_token = [*(set(train_janome_token) - set(test_janome_token))]   # tokens seen only in train
only_test_janome_token = [*(set(test_janome_token) - set(train_janome_token))]    # tokens seen only in test
(len(only_train_janome_token), len(only_test_janome_token))

(11242, 966)

### tohoku-nlp/bert-base-japanese-whole-word-masking（WordPiece）
https://huggingface.co/tohoku-nlp/bert-base-japanese-whole-word-masking

In [41]:
# Load the tokenizer for the 'tohoku-nlp/bert-base-japanese-whole-word-masking'
# model id via AutoTokenizer, then display its repr.
t_tohokuBertBase = AutoTokenizer.from_pretrained('tohoku-nlp/bert-base-japanese-whole-word-masking')
t_tohokuBertBase

BertJapaneseTokenizer(name_or_path='tohoku-nlp/bert-base-japanese-whole-word-masking', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [45]:
# Tokenize every text with the BERT tokenizer, producing one token list per row.
# Iterate over the 'text' column directly rather than fetching rows with
# df['text'][i] for i in range(n): that form is a label lookup, so it only works
# while the frame keeps its default 0..n-1 index, and it re-resolves the column
# on every iteration. list(...) keeps each row's tokens as an independent list.
train_tbertb_token_list = [list(t_tohokuBertBase.tokenize(text)) for text in df_train['text']]
test_tbertb_token_list = [list(t_tohokuBertBase.tokenize(text)) for text in df_test['text']]
len(train_tbertb_token_list), len(test_tbertb_token_list)

(4656, 600)

In [46]:
# Concatenate the per-row BERT token lists into one flat token stream per split.
train_tbertb_token_flatten = [
    token
    for row_tokens in train_tbertb_token_list
    for token in row_tokens
]
test_tbertb_token_flatten = [
    token
    for row_tokens in test_tbertb_token_list
    for token in row_tokens
]
(len(train_tbertb_token_flatten), len(test_tbertb_token_flatten))

(141502, 18449)

In [48]:
# Deduplicate the flat BERT token streams.
train_tbertb_token = [*set(train_tbertb_token_flatten)]   # unique tokens occurring in train
test_tbertb_token = [*set(test_tbertb_token_flatten)]     # unique tokens occurring in test
(len(train_tbertb_token), len(test_tbertb_token))

(11493, 4067)

In [51]:
# Unique BERT tokens across train and test combined (total vocabulary).
all_tbertb_token = list({*train_tbertb_token, *test_tbertb_token})
len(all_tbertb_token)

12029

In [49]:
# Split-exclusive BERT vocabularies, computed as set differences.
only_train_tbertb_token = [*(set(train_tbertb_token) - set(test_tbertb_token))]   # tokens seen only in train
only_test_tbertb_token = [*(set(test_tbertb_token) - set(train_tbertb_token))]    # tokens seen only in test
(len(only_train_tbertb_token), len(only_test_tbertb_token))

(7962, 536)