In [1]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter
import json
import re

In [2]:
# Annotation data up to 250124 (presumably a YYMMDD date, i.e. 2025-01-24 — confirm).
# Used further down as the slice bound on the parsed records.
ANNO_IDX = 435

In [3]:
# Load the role / type / function annotation columns.
# NOTE(review): with usecols given, index_col=0 presumably turns the first of the
# selected columns into the index, and the reset_index() below restores it as a
# regular column — confirm against the CSV layout before touching this line.
csv_dataset = pd.read_csv("../data/all_data.csv", encoding="utf-8", usecols=['role', 'type', 'function'], index_col=0)
# Keep only rows whose role is 'Method' or 'Material'; reset_index() gives the
# result a fresh 0-based index, which read_jsonl relies on for its per-row lookup.
choice_data = csv_dataset.query("role == 'Method' | role == 'Material'").reset_index()
# 'func' is the part of 'function' before the first full-width parenthesis
# (the displayed sample maps 'Use（…）' to 'Use').
choice_data['func'] = choice_data['function'].apply(lambda x: x.split('（')[0])
display(choice_data.head())

Unnamed: 0,role,type,function,func
0,Method,Code,Use（引用目的）,Use
1,Material,Knowledge,Use（引用目的）,Use
2,Method,Tool,Use（引用目的）,Use
3,Method,Tool,Use（引用目的）,Use
4,Method,Tool,Use（引用目的）,Use


In [4]:
def read_jsonl(file_path):
    """Parse a span-annotated JSONL file into a list of flat record dicts.

    Every non-blank line must be a JSON object with a ``data`` string and a
    ``label`` list whose items look like ``[start, end, label_name]``.  The
    n-th record (0-based) is paired with row ``n`` of the module-level
    ``choice_data`` table, which supplies ``role``, ``type`` and ``func``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``fullname``,
        ``genericmention``, ``description``, ``citationtag``, ``role``,
        ``type``, ``func``, ``url``, ``section_title``, ``add_info`` and
        ``text``.  ``name`` and ``fullname`` hold the first annotated span or
        ``'N/A'``; the three list-valued fields hold every annotated span or
        ``['N/A']``; ``url`` is the last ``URL`` span of the row or ``'N/A'``;
        ``add_info`` is ``None`` when the row has no footnote/reference
        separator.

    Raises:
        ValueError: If a span carries a label other than the six known ones.
    """
    # Separator that splits the body from the trailing footnote/reference text.
    # NOTE(review): the "Seperator" spelling is kept exactly as in the original
    # pattern; presumably it matches the marker as it appears in the data —
    # verify before "fixing" it.
    pattern = r'\n\n(<Separator_Footnote>|<Seperator_Reference>)\n'
    # Citation tags that are stripped from the text fields.
    pattern2 = r'(\[Cite\]|\[Cite_Ref\])'
    # Span label -> output key for every label whose text is collected.
    span_keys = {
        'Name': 'name',
        'Full name': 'fullname',
        'Generic mention': 'genericmention',
        'Description': 'description',
        'Citation tag': 'citationtag',
    }

    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline) carry no record and must not
        # consume an id, otherwise the pairing with choice_data would shift.
        non_blank_lines = (line for line in file if line.strip())
        for whole_id, line in enumerate(non_blank_lines):
            row_data = json.loads(line.strip())
            raw_text = row_data['data']

            output_data = {
                'id': whole_id,
                'name': [],
                'fullname': [],
                'genericmention': [],
                'description': [],
                'citationtag': [],
                'role': choice_data['role'][whole_id],
                'type': choice_data['type'][whole_id],
                'func': choice_data['func'][whole_id],
            }

            # Reset per row so a record without a URL span can never inherit
            # the span of the previous record.
            url_span = None
            for me_span in row_data['label']:
                label = me_span[-1]
                if label == 'URL':
                    url_span = me_span[:-1]
                elif label in span_keys:
                    output_data[span_keys[label]].append(raw_text[me_span[0]:me_span[1]])
                else:
                    # Fail loudly instead of returning None, which the caller
                    # would only notice as an unrelated TypeError.
                    raise ValueError(
                        f"Unknown span label {label!r} in record {whole_id}: {me_span!r}"
                    )

            # Text and related information used as model input.
            if url_span is None:
                output_data['url'] = 'N/A'
            else:
                output_data['url'] = raw_text[url_span[0]:url_span[1]]

            # Split at the FIRST blank line only: the title is what precedes it
            # and the body is everything after it, even if the title text
            # happens to occur again later in the body.
            section_title, _, body = raw_text.partition('\n\n')
            output_data['section_title'] = section_title
            text = re.split(pattern, body)[0]

            if re.search(pattern, raw_text):
                # Everything after the last separator is the footnote/reference.
                output_data['add_info'] = re.sub(pattern2, '', re.split(pattern, raw_text)[-1])
            else:
                output_data['add_info'] = None
            output_data['text'] = re.sub(pattern2, '', text)

            # Missing metadata is reported as 'N/A'.
            for key in ('name', 'fullname'):
                output_data[key] = output_data[key][0] if output_data[key] else 'N/A'
            for key in ('genericmention', 'description', 'citationtag'):
                if not output_data[key]:
                    output_data[key] = ['N/A']

            data.append(output_data)
    return data

In [5]:
# ANNO_IDX is defined once in the configuration cell near the top of the
# notebook; it is deliberately not redefined here so the two values cannot
# drift apart.
# Parse every record, then keep the first ANNO_IDX of them.
dict_data = read_jsonl('../data/all.jsonl')[:ANNO_IDX]

In [7]:
# Preview the first three parsed records. A bare last expression is rendered
# with the notebook's pretty repr instead of one very long print() line.
dict_data[:3]

[{'id': 0, 'name': 'UUParser', 'fullname': 'N/A', 'genericmention': ['the parser'], 'description': ['a near-SOTA model', 'a variant of the K&G transition-based parser that employs the arc-hybrid transition system from Kuhlmann et al. (2011) extended with a S WAP transition and a Static-Dynamic oracle'], 'citationtag': ['de Lhoneux et al. (2017b)'], 'role': 'Method', 'type': 'Code', 'func': 'Use', 'url': 'https://github.com/mdelhoneux/uuparser-composition', 'section_title': '4 Composition in a K&G Parser', 'add_info': '4 The code can be found at https://github.com/mdelhoneux/uuparser-composition', 'text': 'Parser We use UUParser, a variant of the K&G transition-based parser that employs the arc-hybrid transition system from Kuhlmann et al. (2011) extended with a S WAP transition and a Static-Dynamic oracle, as described in de Lhoneux et al. (2017b) [Cite_Footnote_4] . The S WAP transition is used to allow the construction of non-projective dependency trees (Nivre, 2009). We use default 

In [8]:
# Write the parsed records as pretty-printed JSON. The output directory is
# created first so the cell also works on a checkout where it does not exist yet.
output_path = '../data/few_data_split/input_data.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(dict_data, json_file, indent=2)