# Load Cinnamon Dataset


## get tags

In [85]:
import os, sys, glob

import numpy as np
import pandas as pd

def get_tags(cinnamon_path):
    files = glob.glob(f'{cinnamon_path}/train/ca_data/*')

    tags = set()
    for file in files:
        dataframe = pd.read_excel(file, encoding="utf8")
        label_str = filter(lambda i:(type(i) is str), dataframe['Tag'])
        def split(strings):
            out = list()
            for string in strings: 
                out += string.split(";")
            return out
        items = split(label_str)
        tags.update(items)
    return sorted(list(tags))

tags = get_tags('/media/D/ADL2020-SPRING/project/cinnamon')
#tags

## get  sample

In [90]:
import os, sys, glob

import numpy as np
import pandas as pd


def get_samples(cinnamon_path):
    groups = []
    files = glob.glob(f'{cinnamon_path}/ca_data/*')
    for file in files:
        dataframe = pd.read_excel(file, encoding="utf8")

        p_index = dataframe.groupby('Parent Index')
        for g in list(p_index.groups.keys()):
            groups.append(p_index.get_group(g))
    return groups


cinnamon_path = '/media/D/ADL2020-SPRING/project/cinnamon/train'

groups = get_samples(cinnamon_path)
groups


[     Page No                   Text  Index  ...  Is Table  Tag       Value
 1          1      次のとおり一般競争入札に付します。      2  ...       NaN  NaN         NaN
 2          1             平成29年9月22日      3  ...       NaN  公告日  平成29年9月22日
 3          1  独立行政法人石油天然ガス・金属鉱物資源機構      4  ...       NaN  NaN         NaN
 4          1                  契約担当役      5  ...       NaN  NaN         NaN
 5          1      金属・石炭事業支援本部長 池田 肇      6  ...       NaN  NaN         NaN
 6          1  ◎調達機関番号 ５８６ ◎所在地番号 １３      7  ...       NaN  NaN         NaN
 7          1                 １ 調達内容      8  ...       NaN  NaN         NaN
 31         1               ２ 競争参加資格     32  ...       NaN  NaN         NaN
 46         2            ３ 入札書の提出場所等     47  ...       NaN  NaN         NaN
 68         2                  ４ その他     69  ...       NaN  NaN         NaN
 109        3         ５ 契約の公表に係る留意事項    110  ...       NaN  NaN         NaN
 162        5              ６ Summary    163  ...       NaN  NaN         NaN
 
 [12 rows 

In [87]:

# dataframe = pd.read_excel(files[0], encoding="utf8")
# p_index = dataframe.groupby('Parent Index')
# print(p_index.get_group(list(p_index.groups.keys())[1]))
# print(p_index.groups)


## Dataset

In [196]:
import torch
from torch.utils.data import Dataset, DataLoader

########################################################
##################  Cinnamon Dataset  ##################
class Cinnamon_Dataset(Dataset):
    def __init__(self, cinnamon_path, tokenizer):
        def get_tags(cinnamon_path):
            tags = set()
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                dataframe = pd.read_excel(file, encoding="utf8")
                label_str = filter(lambda i:(type(i) is str), dataframe['Tag'])
                def split(strings):
                    out = list()
                    for string in strings: 
                        out += string.split(";")
                    return out
                items = split(label_str)
                tags.update(items)
            return tuple(sorted(list(tags)))
            #return tuple(["[PAD]", "[None]"] + sorted(list(tags)))
        
        def get_samples(cinnamon_path):
            groups = []
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                dataframe = pd.read_excel(file, encoding="utf8")

                p_index = dataframe.groupby('Parent Index')
                for g in list(p_index.groups.keys()):
                    groups.append(p_index.get_group(g))
            return groups
        
        self.tokenizer = tokenizer
        self.samples = get_samples(cinnamon_path)
        self.tags = get_tags(cinnamon_path)

        print(f'\t[Info] Load Cannon_Dataset complete !! len:{self.__len__()}')    
        
    def __len__(self):
        return len(self.samples) 
    
    def __getitem__(self, idx):
        return self.samples[idx]
            
    def collate_fn(self, samples):        
        tokenizer, tags = self.tokenizer, self.tags
            
        CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id
        
        def zero_vec(): 
            return [0]*len(tags)
        
        ## text tokenized, label vectoized
        b_token_ids, b_output = [], []
        for sample in samples:
            token_ids = [CLS]
            output = [zero_vec()]
            for text, tag in zip(sample['Text'],sample['Tag']):
                ids = tokenizer.encode(text)[1:-1] + [SEP]
                label = zero_vec()
                if isinstance(tag, str): 
                    for t in tag.split(';'):
                        label[tags.index(t)] = 1
                token_ids += ids
                output += [label]*(len(ids)-1) + [zero_vec()]
            b_token_ids.append(token_ids)
            b_output.append(output)

        ## pad to same lenght
        max_len = min(max([len(s) for s in b_token_ids]), 512)
        for token_ids, output in zip(b_token_ids, b_output):
            token_ids += [PAD]*(max_len-len(token_ids))
            output += [zero_vec()]*(max_len-len(output))

        return torch.tensor(b_token_ids), torch.tensor(b_output)
        
        
from transformers import BertTokenizer 
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)


train_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/train/', tokenizer)
valid_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/dev/', tokenizer)

train_dataloader = DataLoader(train_dataset,
                             batch_size=8,
                             collate_fn=train_dataset.collate_fn)


	[Info] Load Cannon_Dataset complete !! len:2011
	[Info] Load Cannon_Dataset complete !! len:557


In [None]:
for _input, _output in train_dataloader:
    print(_input.shape, _output.shape)
    input("")

torch.Size([8, 482]) torch.Size([8, 482, 20])

torch.Size([8, 187]) torch.Size([8, 187, 20])

torch.Size([8, 309]) torch.Size([8, 309, 20])

torch.Size([8, 142]) torch.Size([8, 142, 20])

torch.Size([8, 143]) torch.Size([8, 143, 20])

torch.Size([8, 204]) torch.Size([8, 204, 20])

torch.Size([8, 158]) torch.Size([8, 158, 20])


In [195]:
_input, _output = train_dataset.collate_fn([train_dataset[1],train_dataset[18]])
_input.shape, _output.shape

(torch.Size([2, 482]), torch.Size([2, 482, 20]))

In [168]:

pretrained_weights = 'cl-tohoku/bert-base-japanese-whole-word-masking'

from transformers import BertTokenizer 
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

#collator = Cinnamon_Collator(tokenizer)
#collator([132])

In [190]:
samples = [train_dataset[1],train_dataset[18]]
sample = samples[0]
CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id
#print(sample.columns)
#print(sample['Tag'])

# 1個sample
def zero_vec(): 
    return [0]*len(tags)

## text tokenized, label vectoized
b_token_ids, b_output = [], []
for sample in samples:
    token_ids = [CLS]
    output = [zero_vec()]
    for text, tag in zip(sample['Text'],sample['Tag']):
        ids = tokenizer.encode(text)[1:-1] + [SEP]
        label = zero_vec()
        if isinstance(tag, str): 
            for t in tag.split(';'):
                label[tags.index(t)] = 1
        token_ids += ids
        output += [label]*(len(ids)-1) + [zero_vec()]
    b_token_ids.append(token_ids)
    b_output.append(output)
    
## pad to same lenght
max_len = min(max([len(s) for s in b_token_ids]), 512)
for token_ids, output in zip(b_token_ids, b_output):
    token_ids += [PAD]*(max_len-len(token_ids))
    output += [zero_vec()]*(max_len-len(output))
    
return torch.tensor(b_token_ids), torch.tensor(b_output)

len(token_ids),len(output)
b_token_ids, b_output
[len(s) for s in b_token_ids], [len(s) for s in b_output]

SyntaxError: 'return' outside function (<ipython-input-190-8e119e107b9e>, line 32)

In [185]:
token_ids, output

sample, tokenizer.decode(token_ids), output

(    Page No  ...                                  Value
 8         1  ...                                    NaN
 9         1  ...                旧松尾鉱山新中和処 理 施 設で使用する電 気
 11        1  ...                                    NaN
 12        1  ...    平 成 30年;平 成 30年 3月 1日;平 成 31年 2月28日
 13        1  ...  岩手県;旧松尾鉱山新中和処理施設;岩手県八幡平市松尾寄木第１地割字１番地先
 14        1  ...                                    NaN
 15        1  ...                                    NaN
 16        1  ...                                    NaN
 17        1  ...                                    NaN
 18        1  ...                                    NaN
 19        1  ...                                    NaN
 20        1  ...                                    NaN
 21        1  ...                                    NaN
 22        1  ...                                    NaN
 23        1  ...                                    NaN
 24        1  ...                                    NaN
 25        1  ...              

In [110]:
indexs = tokenizer.encode(' （１）独立行政法人石油天然ガス・金属鉱物資源機構の「競争参加者の') + [tokenizer.sep_token_id]
tokenizer.decode(indexs)

'[CLS] [UNK] [UNK] [UNK] 独 立 行 政 法 人 石 油 天 然 カス ・ 金 属 鉱 物 資 源 機 構 の 「 競 争 参 加 者 の [SEP] [SEP]'

In [109]:
tokenizer.sep_token_id, tokenizer.pad_token

(3, '[PAD]')