In [None]:
from inference import post_process

# Scratch inputs for trying out the imported post_process: `text` is a line
# written with full-width characters, `value` mixes full- and half-width digits.
text = '（ＦＡＸ） ０３－６７５８－８０６４'
value = '〒１０5-０００１東京都港区虎'

# 'a' is passed as the second positional argument.
post_process(value, 'a', text)

In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import json, glob, torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

import re
import unicodedata


In [10]:

def get_tags(cinnamon_path):
    """Collect every distinct tag name found under ``<cinnamon_path>/ca_data``.

    Each matched file is read with ``pd.read_excel``. String cells of its
    ``Tag`` column are split on ``;``; every piece has ``＊``, ``*`` and
    whitespace removed and is then NFKC-normalised.

    Args:
        cinnamon_path: directory that contains a ``ca_data`` sub-directory.

    Returns:
        Sorted tuple of unique normalised tag names; ``()`` when no file matches.
    """
    tags = set()
    for file in glob.glob(f'{cinnamon_path}/ca_data/*'):
        # `encoding` is not passed: it has no meaning for Excel workbooks and
        # recent pandas versions reject it as an unexpected keyword.
        dataframe = pd.read_excel(file)
        for cell in dataframe['Tag']:
            if type(cell) is not str:  # empty cells come back as NaN floats
                continue
            for tag in cell.split(";"):
                # Raw string: `\*` and `\s` are regex escapes, not Python ones.
                tags.add(unicodedata.normalize("NFKC", re.sub(r'＊|\*|\s+', '', tag)))
    return tuple(sorted(tags))

# Fixed tuple of tag names, hard-coded here instead of being computed with
# get_tags(). The position of a name in this tuple is used as its label column
# (see TAGS.index(tag) in Cinnamon_Dataset_v2.collate_fn).
tags = ('仕様書交付期限',
         '入札件名',
         '入札書締切日時',
         '入札書送付先',
         '入札書送付先部署/担当者名',
         '公告日',
         '施設名',
         '調達年度',
         '調達終了日',
         '調達開始日',
         '資格申請締切日時',
         '資格申請送付先',
         '資格申請送付先部署/担当者名',
         '質問票締切日時',
         '質問箇所TEL/FAX',
         '質問箇所所属/担当者',
         '都道府県',
         '開札場所',
         '開札日時',
         '需要場所(住所)')

def clean_str(s):
    """Return ``str(s)`` with list markers, full-width periods and spaces removed.

    The fragments are removed one after another in this order: ``イ．``,
    ``ア．``, ``．`` and the ASCII space.
    """
    cleaned = str(s)
    for fragment in ('イ．', 'ア．', '．', ' '):
        cleaned = cleaned.replace(fragment, '')
    return cleaned

def sub_idx_finder(list1, list2, t=None):
    """Find where ``list2`` occurs inside ``list1`` and return the start offset.

    For most tags only an exact, left-to-right match is accepted. Four tags
    get a fuzzy match: a window is accepted when its number of mismatching
    positions is strictly below ``len(list2) / divisor``. Three of them are
    also scanned from the end of ``list1`` towards the start.

    Args:
        list1: sequence to search in (token ids of the text).
        list2: sequence to look for (token ids of the value).
        t: tag name selecting the matching rule; any other value means
            exact forward search.

    Returns:
        Start index of the first accepted window in scan order, or ``None``
        when nothing matches (including when ``list2`` is longer than
        ``list1``). An empty ``list2`` matches at the first scanned offset.
    """
    # tag -> (scan backwards?, tolerance divisor)
    fuzzy_rules = {
        '入札件名': (False, 4),
        '需要場所(住所)': (True, 6),
        '質問箇所所属/担当者': (True, 4),
        '質問箇所TEL/FAX': (True, 3),
    }
    backwards, divisor = fuzzy_rules.get(t, (False, None))

    width = len(list2)
    last_start = len(list1) - width  # negative -> both ranges below are empty
    starts = range(last_start, -1, -1) if backwards else range(last_start + 1)

    for start in starts:
        miss = sum(1 for j in range(width) if list1[start + j] != list2[j])
        if miss == 0 or (divisor is not None and miss < width / divisor):
            return start
    return None

class Cinnamon_Dataset_v2(Dataset):
    """One sample per spreadsheet row, paired with the text of its parent row.

    Every file under ``<cinnamon_path>/ca_data`` is read with ``pd.read_excel``.
    A sample is a dict with keys ``doc``, ``index``, ``text``, ``p_text``,
    ``tags`` and ``values`` taken from the row's ``Index``, ``Text``, ``Tag``
    and ``Value`` columns plus the ``Text`` of the row whose ``Index`` equals
    this row's ``Parent Index``.

    Args:
        cinnamon_path: directory containing the ``ca_data`` folder.
        tokenizer: object providing ``encode`` and the ``cls_token_id``,
            ``sep_token_id`` and ``pad_token_id`` attributes.
        tags: tuple of tag names; when ``None`` it is computed with ``get_tags``.
    """

    def __init__(self, cinnamon_path, tokenizer, tags=None):
        def get_samples(cinnamon_path):
            datas = []
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                # Document id = the path text between 'ca_data/' and '.pdf.xlsx'.
                doc_id = file[file.find('ca_data/')+8:file.find('.pdf.xlsx')]

                # `encoding` is not passed: it has no meaning for Excel
                # workbooks and recent pandas versions reject it.
                dataframe = pd.read_excel(file)
                # Rows without a parent are attached to the row with Index 1.
                dataframe['Parent Index'] = dataframe['Parent Index'].fillna(1)

                for _, item in dataframe.iterrows():
                    # Columns: Page No, Text, Index, Parent Index, Is Title,
                    # Is Table, Tag, Value
                    parent_rows = dataframe['Index'] == item['Parent Index']
                    # .item() raises unless exactly one parent row matches.
                    p_text = dataframe.loc[parent_rows, 'Text'].item()

                    datas.append({'doc': doc_id, 'index': item['Index'],
                                  'text': item['Text'], 'p_text': p_text,
                                  'tags': item['Tag'], 'values': item['Value']})
            return datas

        self.tokenizer = tokenizer
        self.samples = get_samples(cinnamon_path)
        self.tags = get_tags(cinnamon_path) if tags is None else tags

        print(f'\t[Info] Load Cannon_Dataset_v2 complete !! len:{self.__len__()}')

    def __len__(self):
        """Number of rows collected over all files."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the raw sample dict at position ``idx``."""
        return self.samples[idx]

    def collate_fn(self, samples):
        """Tokenise a batch and build per-token multi-hot tag labels.

        Each input is ``[CLS] text [SEP] parent_text [SEP]``. For every
        ``tag``/``value`` pair of a row, the value's tokens are located inside
        the text tokens with ``sub_idx_finder`` and the matching positions get
        a 1 in that tag's column. The mask is 1 on text tokens only.

        Args:
            samples: list of sample dicts as returned by ``__getitem__``.

        Returns:
            ``(ids, labels, masks)`` tensors built from lists padded or
            truncated to ``min(longest sequence in the batch, 512)``.
        """
        tokenizer, TAGS = self.tokenizer, self.tags
        CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

        ## text tokenized, label vectorized
        b_ids, b_labels, b_masks = [], [], []
        for sample in samples:
            # string cleaning
            text = clean_str(sample['text'])
            p_text = clean_str(sample['p_text'])
            values = clean_str(sample['values'])
            tags = sample['tags']
            # A type check instead of `tags is not np.nan`: a NaN that is not
            # the np.nan singleton would otherwise reach re.sub and raise.
            if isinstance(tags, str):
                tags = unicodedata.normalize("NFKC", re.sub(r'＊|\*|\s+', '', tags))

            # text to tokens (encode() output minus its first and last token)
            text_ids = tokenizer.encode(text)[1:-1]
            p_text_ids = tokenizer.encode(p_text)[1:-1]

            # input, output, mask
            ids = [CLS] + text_ids + [SEP] + p_text_ids + [SEP]
            labels = [[0] * len(TAGS) for _ in ids]
            masks = [0] + [1] * len(text_ids) + [0] * (len(ids) - len(text_ids) - 1)

            # assign label
            if isinstance(tags, str):
                # zip() stops at the shorter list when the counts differ.
                for tag, value in zip(tags.split(';'), values.split(';')):
                    tag = unicodedata.normalize("NFKC", re.sub(r'＊|\*|\s+', '', tag))

                    value_ids = tokenizer.encode(value)[1:-1]
                    pivote = sub_idx_finder(text_ids, value_ids, tag)
                    if pivote is not None:
                        tag_column = TAGS.index(tag)
                        for k in range(len(value_ids)):
                            # +1 skips the leading [CLS] token
                            labels[1 + pivote + k][tag_column] = 1
                    else:
                        print("\t[ERROR] pivote not found ")
            b_ids.append(ids)
            b_labels.append(labels)
            b_masks.append(masks)

        ## pad to same length
        max_len = min([max([len(s) for s in b_ids]), 512])
        for i, (ids, labels, masks) in enumerate(zip(b_ids, b_labels, b_masks)):
            ids = ids[:max_len]
            b_ids[i] = ids + [PAD] * (max_len - len(ids))

            labels = labels[:max_len]
            b_labels[i] = labels + [[0] * len(TAGS) for _ in range(max_len - len(labels))]

            masks = masks[:max_len]
            b_masks[i] = masks + [0] * (max_len - len(masks))

        return torch.tensor(b_ids), torch.tensor(b_labels), torch.tensor(b_masks)
    
    


	[Info] Load Cannon_Dataset complete !! len:9850


(tensor([[    2,  1266,  1759,  1711,  5580,  5107,  2988,    35,  3663, 10582,
           4803,  2650,     3, 17175,   170, 29121,     3,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0],
         [    2,  1097,    11,    34,   344,     7,     9,     6,  5686,  1711,
            118,     5,   323,  7460,     5,  1768,     6,  5686,  1711,    13,
              5,   284,     5,  3279,   459,     5,  1768,     7,  2367,     3,
           4478,  1097,     5,  4980,     7, 16745, 24169,  7536,     3],
         [    2,     5,   104,     5,   859,  5292,     5,  3225,    11,  1750,
             16, 17175,    15,    10,   104,    11,  1042,     3,    23,   101,
             24, 20781,   104,     5,  1067,  1559,  3688,    26,    20,    10,
           3571, 10240,    11,  1770,     3,     0,     0,     0,     0],
         [    2,  3462,     5,   859,     6,    36,  6899,

In [None]:
# NOTE(review): `train_dataset` is only assigned further down in this cell, so
# this line relies on a value left over from an earlier execution.
train_dataset[100]


# Scratch test of stripping list markers from a string.
abc = 'イ．．イ．dfイア．ア．afwq  er．adイ．'
# re.sub returns a new string that is discarded here; `abc` itself is unchanged.
re.sub(r'アイ． ','',abc)
abc

from dataset import DataLoader
from train import BertTokenizer, BertJapaneseTokenizer, Model, pretrained_weights, train
from main import parse_args

# parse_args is called with an empty string instead of command-line arguments.
args = parse_args('')

tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

# NOTE(review): `Cinnamon_Dataset` is not among the names imported in this cell;
# the class defined in this notebook is named Cinnamon_Dataset_v2.
train_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/train/', tokenizer,tags)

train_dataloader = DataLoader(train_dataset,
                        batch_size = 4,
                        num_workers = 1,
                        collate_fn=train_dataset.collate_fn,
                        shuffle=False)

# Collate two samples by hand to inspect the resulting tensors.
train_dataset.collate_fn([train_dataset[0],train_dataset[1]])


In [21]:
# Small 2x5 integer tensor used to try out shape unpacking.
a = torch.tensor([[1,2,2,1,3],[2,2,2,2,1]])
#torch.masked_select(a, torch.tensor([[0],[1]]).expand(2,5))
# c is the number of rows (2), d the number of columns (5).
c,d = a.shape

In [37]:
# Build the three-row demo frame in a single constructor call.
# DataFrame.append (used here before) was removed in pandas 2.0, and growing a
# frame row by row copies it on every step.
df = pd.DataFrame(
    [
        {'a': 1, 'c': 1111, '11': 'ddd'},
        {'a': 1, 'c': 1111, '11': 'd1111dd'},
        {'a': 1, 'c': 1111, '11': '33'},
    ],
    columns=['a', 'c', '11'],
)
df

Unnamed: 0,a,c,11
0,1,1111,ddd
1,1,1111,d1111dd
2,1,1111,33


In [31]:
from dataset import DataLoader
from train import BertTokenizer, BertJapaneseTokenizer, Model, pretrained_weights, train
from main import parse_args


# parse_args is called with an empty string instead of command-line arguments.
args = parse_args('')

tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_weights)#, do_lower_case=True)

# NOTE(review): `Cinnamon_Dataset` is not among the names imported in this cell;
# the recorded output of this cell is
# "NameError: name 'Cinnamon_Dataset' is not defined".
train_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/train/', tokenizer,tags)
valid_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/dev/', tokenizer,tags)
# Training batches are shuffled; validation batches keep file order.
train_dataloader = DataLoader(train_dataset,
                        batch_size = 4,
                        num_workers = 8,
                        collate_fn=train_dataset.collate_fn,
                        shuffle=True)
valid_dataloader = DataLoader(valid_dataset,
                             batch_size = 4,
                             num_workers = 8,
                             collate_fn = valid_dataset.collate_fn,
                             shuffle = False)

## train
train(args, train_dataloader, valid_dataloader)


NameError: name 'Cinnamon_Dataset' is not defined

## -----------------------------------------------------------------------------------------------------------------------------


In [None]:
import warnings

warnings.filterwarnings('ignore')

import json, glob, torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

import re
import unicodedata

########################################################
##################  Cinnamon Dataset  ##################
class Cinnamon_Dataset_Testing(Dataset):
    """Inference dataset whose samples are groups of rows sharing a ``Parent Index``.

    Each sample is a dict ``{'doc_id': ..., 'sample': DataFrame}``.

    NOTE(review): a later cell of this notebook defines another class with the
    same name, which replaces this one once that cell has run.
    NOTE(review): ``pd.read_excel`` is called with ``encoding="utf8"``; confirm
    the installed pandas version still accepts that keyword.
    """
    def __init__(self, cinnamon_path, tokenizer):
        def get_tags(cinnamon_path):
            # Collect the sorted, de-duplicated tag names of every file's 'Tag' column.
            tags = set()
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                dataframe = pd.read_excel(file, encoding="utf8")
                # Keep only string cells (empty cells are not str).
                label_str = filter(lambda i:(type(i) is str), dataframe['Tag'])
                def split(strings):
                    # Split every cell on ';', strip '＊', '*' and whitespace, NFKC-normalise.
                    out = list()
                    for string in strings: 
                        out += string.split(";")
                    out = [unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', tag)) for tag in out]
                    return out
                items = split(label_str)
                tags.update(items)
            return tuple(sorted(list(tags)))

        def get_samples(cinnamon_path):
            # Build one sample per 'Parent Index' group of every file.
            groups = []
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                # Document id = the path text between 'ca_data/' and '.pdf.xlsx'.
                doc_id = file[file.find('ca_data/')+8:file.find('.pdf.xlsx')]
                dataframe = pd.read_excel(file, encoding="utf8")
                # Only the first 10 rows are inspected; a value that is not a
                # Python int is overwritten with 0.
                # NOTE(review): this is a chained assignment, and NumPy scalar
                # values presumably fail the isinstance(..., int) test - confirm
                # this is the intended behaviour.
                for i in range(10):
                    if not isinstance(dataframe['Parent Index'][i], int):
                        dataframe['Parent Index'][i] = 0 # fill a NaN parent index with 0
                dataframe['ID'] = dataframe['Index'].apply(lambda x: "{}-{}".format(doc_id,x))
                dataframe['id'] = int(doc_id)

                p_index = dataframe.groupby('Parent Index')
                for g in list(p_index.groups.keys()):
                    groups.append({'doc_id':doc_id,'sample':p_index.get_group(g)})
            return groups

        self.tokenizer = tokenizer
        self.samples = get_samples(cinnamon_path)
        self.tags = get_tags(cinnamon_path)

        print(f'\t[Info] Load Cannon_Dataset complete !! len:{self.__len__()}')    

    def __len__(self):
        """Number of row groups collected over all files."""
        return len(self.samples) 

    def __getitem__(self, idx):
        """Return the ``{'doc_id', 'sample'}`` dict at position ``idx``."""
        return self.samples[idx]

    def collate_fn(self, samples):        
        """Tokenise each group into one sequence: ``[CLS] row [SEP] row [SEP] ...``.

        Returns a 5-tuple: the padded token-id tensor, ``None``, the per-token
        row ``Index`` lists (``-1`` for CLS, SEP and padding), the list of
        document ids, and ``sample`` - the DataFrame of the last group
        processed in the loop, not one per batch element.
        """
        tokenizer, tags = self.tokenizer, self.tags

        CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

        def zero_vec(): 
            # All-zero label vector with one slot per tag.
            return [0]*len(tags)

        def sub_idx_finder(list1, list2):            
            # Fuzzy forward search; only referenced from the disabled labelling
            # code kept in the string statement further down.
            # Returns None implicitly when nothing matches.
            for i in range(len(list1)-len(list2)):
                find = True
                hit, miss = 0, 0
                for j in range(len(list2)):
                    if list1[i+j] != list2[j]: 
                        find = False
                        miss += 1
                    else:
                        hit += 1
                # Accept a window when fewer than a fifth of its positions differ.
                if miss < len(list2)/5:
                    find = True
                if find:
                    return i
            #print('yeh')

        ## text tokenized, label vectorized
        b_doc_id, b_token_ids, b_output, b_token_indexs = [], [], [], []
        for sample in samples:
            doc_id = sample['doc_id']
            # From here on `sample` is the group's DataFrame, not the dict.
            sample = sample['sample']

            token_ids = [CLS]
            output = [zero_vec()]
            token_indexs = [-1]
            for index, text, tag, value in zip(sample['Index'],sample['Text'],sample['Tag'],sample['Value']):
                # Full-width / half-width handling: strip '＊', '*' and whitespace, then NFKC.
                text = str(unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', text)))
                #tag = str(unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', tag))) if tag is not np.nan else tag
                #value = str(unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', value))) if value is not np.nan else value

                ###
                ids = tokenizer.encode(text)[1:-1] + [SEP]
                # NOTE(review): list multiplication repeats one zero vector
                # object; harmless here because labels are never modified.
                labels = [zero_vec()]*(len(ids)-1) + [zero_vec()]
                indexs = [index]*(len(ids)-1) + [-1]
                # Disabled labelling code, kept as a bare string statement (no effect).
                '''
                if isinstance(tag, str):
                    for t,v in zip(tag.split(';'), str(value).split(';')):
                        t = unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', t))
                        v = unicodedata.normalize("NFKC", re.sub('＊|\*|\s+', '', v))
                        
                        ids_v = tokenizer.encode(v)[1:-1]
                        pivote = sub_idx_finder(ids, ids_v)
                        for k in range(len(ids_v)):
                            if pivote is not None:
                                labels[pivote+k][tags.index(t)] = 1
                '''
                token_ids += ids
                output += labels
                token_indexs += indexs
            b_doc_id.append(doc_id)
            b_token_ids.append(token_ids)
            # A placeholder 0 is stored instead of `output`; labels are not returned.
            b_output.append(0)
            b_token_indexs.append(token_indexs)

        ## pad to same length (capped at 512 tokens)
        max_len = min([max([len(s) for s in b_token_ids]), 512])
        for idx,(token_ids, output, token_indexs) in enumerate(zip(b_token_ids, b_output, b_token_indexs)):            
            token_ids = token_ids[:max_len]
            token_ids += [PAD]*(max_len-len(token_ids))
            b_token_ids[idx] = token_ids
            # Disabled label padding, kept as a bare string statement (no effect).
            '''
            output = output[:max_len]
            output += [zero_vec()]*(max_len-len(output))
            b_output[idx] = output
            '''
            token_indexs = token_indexs[:max_len]
            token_indexs += [-1]*(max_len-len(token_indexs))
            b_token_indexs[idx] = token_indexs

        return torch.tensor(b_token_ids), None, b_token_indexs, b_doc_id, sample
    
    

In [None]:
#valid_dataset = Cinnamon_Dataset_Testing('/media/D/ADL2020-SPRING/project/cinnamon/dev/', tokenizer)

In [None]:
def _code(_input):
    def abcd(string):
        coding = []
        for char in string:
            status = unicodedata.east_asian_width(char)
            if status == 'F':
                #print('{0} is full-width.'.format(char))
                coding.append('F')
            elif status == 'H':
                #print('{0} is half-width.'.format(char))
                coding.append('H')
            else:
                #print('{0} is char'.format(char))
                coding.append('C')
        return coding
    if isinstance(_input, list):
        string_ls = _input
        ddd = []
        for s in string_ls:
            ddd.append(abcd(s))
        return ddd
    else:
        return abcd(_input)
        

class Cinnamon_Dataset_Testing(Dataset):
    """Inference dataset: each sample is a run of up to 11 consecutive rows of one file.

    Every file under ``<cinnamon_path>/ca_data`` is read with ``pd.read_excel``
    and gets three extra columns: ``doc_id``, ``ID`` (``"<doc_id>-<Index>"``)
    and ``id`` (``int(doc_id)``). The frame is then cut into slices of 11 rows.

    Args:
        cinnamon_path: directory containing the ``ca_data`` folder.
        tokenizer: object providing ``encode`` and the ``cls_token_id``,
            ``sep_token_id`` and ``pad_token_id`` attributes.
        tags: tuple of tag names; when ``None`` it is collected from the files.
    """

    def __init__(self, cinnamon_path, tokenizer, tags=None):
        def get_tags(cinnamon_path):
            # Sorted, de-duplicated tag names from every file's 'Tag' column.
            found = set()
            for file in glob.glob(f'{cinnamon_path}/ca_data/*'):
                # `encoding` is not passed: it has no meaning for Excel
                # workbooks and recent pandas versions reject it.
                dataframe = pd.read_excel(file)
                for cell in dataframe['Tag']:
                    if type(cell) is not str:  # empty cells are not strings
                        continue
                    for tag in cell.split(";"):
                        found.add(unicodedata.normalize("NFKC", re.sub(r'＊|\*|\s+', '', tag)))
            return tuple(sorted(found))

        def get_samples(cinnamon_path):
            groups = []
            files = glob.glob(f'{cinnamon_path}/ca_data/*')
            for file in files:
                # Document id = the path text between 'ca_data/' and '.pdf.xlsx'.
                doc_id = file[file.find('ca_data/')+8:file.find('.pdf.xlsx')]

                dataframe = pd.read_excel(file)
                dataframe['doc_id'] = [doc_id]*len(dataframe)
                dataframe['ID'] = dataframe['Index'].apply(lambda x: "{}-{}".format(doc_id,x))
                dataframe['id'] = int(doc_id)

                delta = 11  # rows per sample
                for i in range(0, len(dataframe), delta):
                    # .loc slices by label and includes the end label.
                    sample = dataframe.loc[i:i+delta-1]
                    groups.append(sample)

                    # Diagnostic only: report slices longer than 512 tokens.
                    # Cells go through str() (as in collate_fn) so a
                    # non-string cell cannot break the join; the text is
                    # encoded once instead of twice.
                    text = ''.join(map(str, sample['Text']))
                    n_tokens = len(tokenizer.encode(text))
                    if n_tokens > 512:
                        print(n_tokens, text)

            return groups

        self.tokenizer = tokenizer
        self.samples = get_samples(cinnamon_path)
        self.tags = get_tags(cinnamon_path) if tags is None else tags

        print(f'\t[Info] Load Cannon_Dataset_Testing complete !! len:{self.__len__()}')

    def __len__(self):
        """Number of row slices collected over all files."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the DataFrame slice at position ``idx``."""
        return self.samples[idx]

    def collate_fn(self, samples):
        """Tokenise each slice into one sequence ``[CLS] row1 row2 ...`` (no separators).

        Args:
            samples: list of DataFrame slices as returned by ``__getitem__``.

        Returns:
            A 5-tuple: the token-id tensor padded or truncated to
            ``min(longest sequence, 512)``; ``None``; the per-token row
            ``Index`` lists (``-1`` for CLS and padding); the list of document
            ids; and ``samples[0]``. Only the first slice is returned, so the
            last element describes the whole batch only when the batch size is 1.
        """
        tokenizer = self.tokenizer
        CLS, PAD = tokenizer.cls_token_id, tokenizer.pad_token_id

        ## text tokenized
        b_token_ids, b_token_indexs, b_doc_id = [], [], []
        for sample in samples:
            doc_id = list(sample['doc_id'])[0]

            token_ids = [CLS]
            token_indexs = [-1]

            for text, index in zip(sample['Text'], sample['Index']):
                # Drop the list markers before tokenising.
                text = str(text).replace('イ．','').replace('ア．','')

                # encode() output minus its first and last token
                ids = tokenizer.encode(text)[1:-1]

                token_ids += ids
                token_indexs += [index] * len(ids)

            assert len(token_ids)==len(token_indexs)
            b_token_ids.append(token_ids)
            b_token_indexs.append(token_indexs)
            b_doc_id.append(doc_id)

        ## pad to same length
        max_len = min([max([len(s) for s in b_token_ids]), 512])
        for idx, (token_ids, token_indexs) in enumerate(zip(b_token_ids, b_token_indexs)):
            token_ids = token_ids[:max_len]
            b_token_ids[idx] = token_ids + [PAD] * (max_len - len(token_ids))

            token_indexs = token_indexs[:max_len]
            b_token_indexs[idx] = token_indexs + [-1] * (max_len - len(token_indexs))

        return torch.tensor(b_token_ids), None, b_token_indexs, b_doc_id, samples[0]
    
    

# Export submission.csv

In [None]:
from dataset import Cinnamon_Dataset, DataLoader
from train import * 


In [None]:
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

# NOTE(review): `tags` must already exist when this cell runs; in this notebook
# it is assigned in the cell that follows (and in an earlier cell).
valid_dataset = Cinnamon_Dataset_Testing('/media/D/ADL2020-SPRING/project/cinnamon/dev/', tokenizer, tags)
# batch_size=1: collate_fn hands back a single sample frame per batch.
valid_dataloader = DataLoader(valid_dataset,
                             batch_size=1,
                             collate_fn=valid_dataset.collate_fn,
                             shuffle=False)



In [None]:
#train_dataset = Cinnamon_Dataset('/media/D/ADL2020-SPRING/project/cinnamon/train/', tokenizer)
#tags = train_dataset.tags
# Hard-coded tuple of tag names (the commented-out lines above would take it
# from a training dataset instead). The inference loop enumerates this tuple
# and uses each position as the column index into the model output.
tags = ('仕様書交付期限',
         '入札件名',
         '入札書締切日時',
         '入札書送付先',
         '入札書送付先部署/担当者名',
         '公告日',
         '施設名',
         '調達年度',
         '調達終了日',
         '調達開始日',
         '資格申請締切日時',
         '資格申請送付先',
         '資格申請送付先部署/担当者名',
         '質問票締切日時',
         '質問箇所TEL/FAX',
         '質問箇所所属/担当者',
         '都道府県',
         '開札場所',
         '開札日時',
         '需要場所(住所)')
tags

## inference

In [None]:
from model import Model
import torch.nn.functional as F

def fuller(text):
    """Convert ASCII digits, ``(``, ``)`` and ``~`` in ``text`` to full-width.

    Each of those characters is shifted by 65248 code points; every other
    character is copied unchanged.
    """
    widen = frozenset(map(ord, '0123456789()~'))
    return ''.join(
        chr(ord(c) + 65248) if ord(c) in widen else c
        for c in text
    )

def post_process(value, tag, text):
    """Re-spell ``value`` using only characters that actually occur in ``text``.

    Each character of ``value`` is replaced by the first of these candidates
    found in ``text``; when none is found the character is dropped:

    1. the character itself;
    2. its full-width form;
    3. for ASCII letters, the other-case letter in half-width form;
    4. for ASCII letters, the other-case letter in full-width form.

    Args:
        value: decoded prediction string.
        tag: tag name of the prediction; accepted but not used.
        text: source text the value was extracted from.

    Returns:
        The re-spelled string (possibly shorter than ``value``).
    """
    fullwidth_offset = 65248  # U+FF01..U+FF5E are U+0021..U+007E shifted by 0xFEE0
    case_offset = 32          # distance between 'A' and 'a'

    pieces = []
    for c in value:
        if c in text:
            pieces.append(c)
            continue
        # Only printable ASCII has a full-width twin at +65248. Shifting any
        # other character would compare against an unrelated code point, or
        # overflow chr().
        if not '!' <= c <= '~':
            continue

        candidates = [chr(ord(c) + fullwidth_offset)]
        if 'a' <= c <= 'z':    # lower -> upper
            swapped = chr(ord(c) - case_offset)
        elif 'A' <= c <= 'Z':  # upper -> lower
            swapped = chr(ord(c) + case_offset)
        else:
            swapped = None
        if swapped is not None:
            # Half-width first, then full-width. The upper->lower half-width
            # case used to emit the full-width letter by mistake.
            candidates += [swapped, chr(ord(swapped) + fullwidth_offset)]

        for candidate in candidates:
            if candidate in text:
                pieces.append(candidate)
                break

    return ''.join(pieces)
    
    


In [None]:
from collections import Counter
# Scratch check: most_common()[0][0] is the most frequent element (2 in this list).
l = [0,1,2,2,2,1,9,"a","b","b"]
Counter(l).most_common()[0][0]

In [None]:

# Load the checkpoint and switch the model to evaluation mode.
model = Model()
model.load_state_dict(torch.load('./ckpt/epoch_50.pt')['state_dict'])
model.eval()

# Each batch's annotated frame is collected here and concatenated once at the
# end. DataFrame.append (used here before) was removed in pandas 2.0.
predicted_frames = []

# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    for iii, (_input, _label, token_indexs, doc_id, sample) in enumerate(valid_dataloader):
        # `sample` is the DataFrame slice returned by collate_fn; the
        # predictions are written into these three columns.
        sample['Prediction'] = ""
        sample['Tag'] = ""
        sample['Value'] = ""

        _output = model(_input)[0]
        # torch.sigmoid replaces the deprecated F.sigmoid (same values).
        prob = torch.sigmoid(_output)
        # NOTE(review): prob is indexed as [token position, tag]; this assumes
        # model(_input)[0] has that layout for a batch of one - confirm.

        for i, tag in enumerate(tags):
            hit_rows = []   # row Index of every token predicted for this tag
            values = []     # the predicted token ids themselves
            for j in range(prob.size(0)):
                if prob[j, i] > 0.5:
                    values.append(_input[0][j])
                    hit_rows.append(token_indexs[0][j])

            if len(values) > 0:
                # Attribute the value to the row that owns most of its tokens.
                index = Counter(hit_rows).most_common()[0][0]
                row = sample['Index'] == index

                value_str = tokenizer.decode(values, skip_special_tokens=True).replace(" ", "")
                value_str = post_process(value_str, tag, sample.loc[row, 'Text'].item())

                # add a tag&value to <Tag> <Value>; several tags on one row are joined with ';'
                if sample.loc[row, 'Tag'].item() == "":
                    sample.loc[row, 'Tag'] = "{}".format(tag)
                    sample.loc[row, 'Value'] = "{}".format(value_str)
                else:
                    sample.loc[row, 'Tag'] += ";{}".format(tag)
                    sample.loc[row, 'Value'] += ";{}".format(value_str)

        predicted_frames.append(sample)

        print(f'\t[Info] [{iii+1}/{len(valid_dataloader)}]', end='   \r')

# None when the loader produced no batches, as before.
total_dataframe = pd.concat(predicted_frames) if predicted_frames else None

In [None]:

# Rows that received a tag. The result is neither stored nor displayed,
# because this is not the last expression of the cell.
total_dataframe[total_dataframe['Tag']!=""]
# Order the predictions by document id, then by row index.
total_dataframe = total_dataframe.sort_values(by=['id','Index'], ascending=[True,True])


In [None]:

# Drop the bookkeeping columns in a single call, then write the remaining
# columns to the intermediate CSV.
total_dataframe_clean = total_dataframe.drop(
    columns=['Page No', 'Parent Index', 'Is Title', 'Is Table', 'id', 'Index']
)
total_dataframe_clean.to_csv('./result/testout.csv', encoding='utf8')


# Submission metrics

In [None]:
from utils.convert import *
from utils.score import * 

# Turn the intermediate CSV into the submission file. `convert` is expected to
# come from one of the star imports above.
convert('./result/testout.csv','./result/submission.csv')

In [None]:
# Compare the submission file with the dev-set reference file.
score('/media/D/ADL2020-SPRING/project/cinnamon/dev/dev_ref.csv','./result/submission.csv')

In [None]:
# NOTE(review): bare number kept in its own cell - presumably a score copied
# from a previous run; confirm before relying on it.
0.9203583682805949

In [None]:

# Sort by document id and row index, then print every row's "<doc_id>-<Index>" ID.
total_dataframe = total_dataframe.sort_values(by=['id','Index'], ascending=[True,True])
for idx in total_dataframe.ID:
    print(idx)


In [None]:

# Inspect the column dtypes of the prediction frame.
total_dataframe.dtypes


In [None]:
# Code points bounding the lowercase ASCII letters (used by post_process).
ord('a'),ord('z')

In [None]:
# Code points bounding the uppercase ASCII letters (used by post_process).
ord('A'),ord('Z')

In [None]:

# Neither string contains whitespace, so split() yields a single token and each
# set holds the whole string. The half-width and full-width spellings differ,
# so the intersection is empty.
a = set('令和2年5月22日(金)14時00分~'.split())
b = set('令和２年５月２２日（金）１４時００分～'.split())
len(a.intersection(b))


In [None]:
# Encode/decode round trip, to see how the tokenizer changes full-width text.
abc = tokenizer.encode('令和２年５月２２日（金）１４時００分～')
# Spaces are removed from the decoded text.
tokenizer.decode(abc).replace(" ","")

In [None]:
import unicodedata

string = '令和２年５月２２日（金）１４時００分～'

def _code(string):
    coding = []
    for char in string:
        status = unicodedata.east_asian_width(char)
        if status == 'F':
            #print('{0} is full-width.'.format(char))
            coding.append('F')
        elif status == 'H':
            #print('{0} is half-width.'.format(char))
            coding.append('H')
        else:
            #print('{0} is char'.format(char))
            coding.append('C')
    return coding
            
_code(string)
_code("１1１")

In [None]:
# Adding 65248 to the code point of '(' gives its full-width form;
# subtracting it again gives back the half-width character.
full = chr(ord('(')+65248)
half = chr(ord(full)-65248)
full, half

In [None]:
# Code points of the characters that fuller() converts: digits, '(', ')' and '~'.
candidate = (ord('0'),ord('1'),ord('2'),ord('3'),ord('4'),ord('5'),ord('6'),ord('7'),ord('8'),ord('9'),
            ord('('),ord(')'),ord('~'),)
candidate

In [None]:

def fuller(text):
    """Replace ASCII digits, ``(``, ``)`` and ``~`` with their full-width forms.

    The full-width form is the character 65248 code points higher; all other
    characters pass through unchanged.
    """
    full_width = {ord(ch): chr(ord(ch) + 65248) for ch in '0123456789()~'}
    converted = []
    for c in text:
        converted.append(full_width.get(ord(c), c))
    return ''.join(converted)

text_out = fuller('令和2年5月22日(金)14時00分~')
text_out

In [None]:
# Order-insensitive comparison: each string is split on spaces into
# "tag:value" tokens. The two strings list the same tokens in a different
# order, so the size of the intersection counts the tokens they share.
a = set('調達年度:令和2年 調達終了日:令和5年3月31日 調達開始日:令和2年4月1日'.split())
b = set('調達開始日:令和2年4月1日 調達終了日:令和5年3月31日 調達年度:令和2年'.split())
len(a.intersection(b))

In [None]:
# Membership test used by post_process: is the full-width form of '3' in the text?
chr(ord('3')+65248) in "３ａｄｆａ１"