In [87]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [88]:
# Load the citation records, restricted to the columns this notebook uses.
# index_col=0 makes the first selected column the index.
full_data = pd.read_csv(
    './data/all_data.csv',
    encoding="utf-8",
    index_col=0,
    usecols=[
        'id', 'url', 'passage-title', 'citation-info', 'citation-paragraph',
        'citation-type', 'role', 'type', 'year',
    ],
).assign(tsunokake_id=lambda frame: frame.index)
## Only rows whose role is Material or Method are used.
has_target_role = full_data['role'].isin(['Method', 'Material'])
choice_data = full_data[has_target_role].reset_index().drop(columns='id')
## Keep the positional index of choice_data as an explicit column.
choice_data = choice_data.assign(choice_id=choice_data.index)
display(choice_data.head())

Unnamed: 0,url,passage-title,citation-info,citation-paragraph,role,type,year,citation-type,tsunokake_id,choice_id
0,https://github.com/mdelhoneux/uuparser-compos...,['4 Composition in a K&G Parser'],4 The code can be found at https://github.com/...,"Parser We use UUParser, a variant of the K&G t...",Method,Code,2019,Footnote,0,0
1,http://hdl.handle.net/11234/1-2364,['5 What Correlates with Difficulty?'],Milan Straka and Jana Strakov. 2017. Tokenizin...,Head-POS Entropy Dehouck and Denis (2018) prop...,Material,Knowledge,2019,Reference,1,1
2,http://sjmielke.com/papers/tokenize/,['D Data selection: Europarl'],31 http://sjmielke.com/papers/tokenize/,"Finally, it should be said that the text in Co...",Method,Tool,2019,Footnote,2,2
3,https://developer.twitter.com/en/docs.html,"['2 Problem Formulation', '2.2 Data']",,• Negative examples: We have col-lected 1% of ...,Method,Tool,2019,Body,5,3
4,https://www.mturk.com/,['5 User study'],Amazon. 2005. MTurk. (https://www.mturk.com/).,To verify whether human evaluators are in agre...,Method,Tool,2019,Reference,6,4


In [89]:
# Distinct years present in choice_data, shown in ascending order.
distinct_years = choice_data['year'].unique()
sorted(distinct_years)

[np.int64(2001),
 np.int64(2002),
 np.int64(2003),
 np.int64(2004),
 np.int64(2005),
 np.int64(2006),
 np.int64(2007),
 np.int64(2008),
 np.int64(2009),
 np.int64(2010),
 np.int64(2011),
 np.int64(2012),
 np.int64(2013),
 np.int64(2014),
 np.int64(2015),
 np.int64(2016),
 np.int64(2017),
 np.int64(2018),
 np.int64(2019),
 np.int64(2020),
 np.int64(2021)]

In [90]:
# Split into train and test sets according to the "year" column.
## The full test set has 317 rows.
## Rows from 2021 (the most recent papers) become the test set.
year_column = choice_data['year']
test_df = choice_data[year_column == 2021]
## Every earlier year is used as training data.
train_df = choice_data[year_column < 2021]
display(test_df.head())
display(train_df.head())

print("train_size:::", train_df.shape[0])
print("test_size:::", test_df.shape[0])

print("train_ids:::", train_df.index)
print("test_ids:::", test_df.index)

Unnamed: 0,url,passage-title,citation-info,citation-paragraph,role,type,year,citation-type,tsunokake_id,choice_id
46,https://github.com/doug919/entity_based_narra...,['1 Introduction'],1 https://github.com/doug919/entity_ based_nar...,The evaluated downstream tasks include two cha...,Material,Knowledge,2021,Footnote,63,46
48,https://github.com/luyaojie/text2event,['1 Introduction'],1 Our source codes are openly available at htt...,We conducted experiments [Cite_Footnote_1] on ...,Method,Code,2021,Footnote,66,48
49,https://archive.org/download/,"['3 Analysis Setup', '3.1 Experiment Procedure']",1 The snapshot is available at https://archive...,Our goal is to analyze the influence in downst...,Material,Dataset,2021,Footnote,67,49
50,http://commoncrawl.org/2016/10/newsdatasetava...,['mixture of corpus 5 used to pre-train BART.'],Sebastian Nagel. 2016. Cc-news. URL: http://we...,"5 Similar to RoBERTa, BART uses the combinatio...",Material,Dataset,2021,Reference,68,50
51,https://github.com/facebookresearch/DPR/blob/...,"['D Reproducibility', 'D.1 Dataset Details']",,We obtain closed-book QA datasets from [Cite] ...,Material,DataSource,2021,Body,69,51


Unnamed: 0,url,passage-title,citation-info,citation-paragraph,role,type,year,citation-type,tsunokake_id,choice_id
0,https://github.com/mdelhoneux/uuparser-compos...,['4 Composition in a K&G Parser'],4 The code can be found at https://github.com/...,"Parser We use UUParser, a variant of the K&G t...",Method,Code,2019,Footnote,0,0
1,http://hdl.handle.net/11234/1-2364,['5 What Correlates with Difficulty?'],Milan Straka and Jana Strakov. 2017. Tokenizin...,Head-POS Entropy Dehouck and Denis (2018) prop...,Material,Knowledge,2019,Reference,1,1
2,http://sjmielke.com/papers/tokenize/,['D Data selection: Europarl'],31 http://sjmielke.com/papers/tokenize/,"Finally, it should be said that the text in Co...",Method,Tool,2019,Footnote,2,2
3,https://developer.twitter.com/en/docs.html,"['2 Problem Formulation', '2.2 Data']",,• Negative examples: We have col-lected 1% of ...,Method,Tool,2019,Body,5,3
4,https://www.mturk.com/,['5 User study'],Amazon. 2005. MTurk. (https://www.mturk.com/).,To verify whether human evaluators are in agre...,Method,Tool,2019,Reference,6,4


train_size::: 1655
test_size::: 317
train_ids::: Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971],
      dtype='int64', length=1655)
test_ids::: Index([  46,   48,   49,   50,   51,   52,   53,   65,   66,   68,
       ...
       1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1957, 1958],
      dtype='int64', length=317)


In [91]:
# Count how many training rows belong to each year.
train_year_counts = Counter(train_df['year'])
print(train_year_counts)

Counter({2020: 320, 2019: 270, 2018: 229, 2016: 143, 2015: 122, 2014: 84, 2017: 83, 2013: 69, 2012: 65, 2010: 46, 2009: 41, 2006: 39, 2007: 33, 2005: 28, 2011: 28, 2008: 24, 2003: 12, 2004: 9, 2002: 7, 2001: 3})


In [92]:
def split_years(lst, index, seed: int = 0):
    """Split the positions of ``lst`` into a year-balanced subset and the rest.

    Positions are grouped by their year value, each group is shuffled, and
    positions are then drawn one per year in round-robin order (years visited
    in order of first appearance) until ``index`` positions have been
    collected or every group is exhausted.

    Args:
        lst: Sequence of year labels; position ``i`` refers to ``lst[i]``.
        index: Target size of the first subset. A value <= 0 yields an empty
            first subset; a value larger than ``len(lst)`` selects everything.
        seed: Passed to ``random.seed`` before shuffling, which reseeds the
            module-level ``random`` generator. Pass ``None`` to leave the
            generator state untouched.

    Returns:
        A tuple ``(lst1_indices, lst2_indices)``: the selected positions in
        the order they were drawn, and all remaining positions in ascending
        order.
    """
    if seed is not None:
        random.seed(seed)

    # Collect the positions belonging to each year.
    year_indices = defaultdict(list)
    for position, year in enumerate(lst):
        year_indices[year].append(position)

    # Shuffle within each year so the draw is not biased by input order.
    for positions in year_indices.values():
        random.shuffle(positions)
    # Reverse each bucket so the front of the shuffled order can be taken
    # with an O(1) pop() from the end instead of an O(n) pop(0).
    for positions in year_indices.values():
        positions.reverse()

    # Round-robin over the years, one position per year per pass.
    lst1_indices = []
    available_years = list(year_indices)
    while len(lst1_indices) < index and available_years:
        # Years that ran out of positions no longer take part in the draw.
        available_years = [year for year in available_years if year_indices[year]]
        for year in available_years:
            lst1_indices.append(year_indices[year].pop())
            if len(lst1_indices) >= index:  # target size reached
                break

    # Everything not selected goes to the second subset; a set keeps the
    # membership test O(1) per position.
    selected = set(lst1_indices)
    lst2_indices = [position for position in range(len(lst)) if position not in selected]

    return lst1_indices, lst2_indices

# ==== 動作確認 ====
# lst = [2001, 2002, 2001, 2003, 2002, 2001, 2003, 2003, 2002, 2001]
# index = 6
# lst1_indices, lst2_indices = split_years(lst, index, seed=0)

# print("lst1:", [lst[i] for i in lst1_indices])
# print("lst2:", [lst[i] for i in lst2_indices])


In [93]:
# Split the training pool so that every year is represented in the development set.
## split_years returns list positions, i.e. row positions within train_df
## (what its index would be after reset_index).
train_years = train_df['year'].to_list()
reset_dev_ids, reset_train_ids = split_years(train_years, len(test_df))
## Map those positions back to the ids of the original choice_data.
choice_id_by_position = train_df['choice_id'].to_list()
dev_ids = [int(choice_id_by_position[position]) for position in reset_dev_ids]
train_ids = [int(choice_id_by_position[position]) for position in reset_train_ids]

hello
2001
hello
2002
hello
2004
hello
2003


In [94]:
# Inspect the resulting ids: development split first, then training split.
for split_ids in (dev_ids, train_ids):
    print(split_ids)

[804, 1116, 547, 16, 1435, 829, 1908, 988, 147, 1656, 736, 1306, 1871, 313, 121, 123, 1257, 562, 312, 1621, 652, 7, 13, 1153, 142, 551, 1859, 977, 375, 64, 1346, 1483, 735, 449, 141, 409, 1258, 289, 1743, 454, 626, 8, 1623, 1354, 18, 754, 89, 1094, 713, 575, 1303, 1680, 1476, 315, 1931, 607, 874, 452, 1744, 684, 1470, 865, 1723, 428, 166, 1002, 722, 884, 516, 1048, 1867, 281, 832, 529, 308, 1968, 1830, 288, 1246, 322, 771, 1626, 589, 1436, 877, 1343, 1407, 1685, 691, 926, 577, 352, 1357, 1010, 1062, 1683, 1498, 1619, 345, 867, 948, 1530, 1485, 1224, 1424, 150, 514, 602, 872, 1565, 351, 1938, 265, 1061, 1827, 1596, 1245, 1072, 774, 12, 1349, 17, 689, 1861, 1814, 908, 1182, 1844, 1563, 85, 530, 1930, 1058, 1351, 451, 1620, 136, 226, 1374, 1582, 1486, 212, 180, 1507, 1840, 609, 382, 419, 1693, 317, 120, 1448, 1526, 1500, 272, 765, 1914, 117, 143, 105, 91, 1023, 43, 1649, 1040, 557, 1907, 1356, 122, 1933, 211, 283, 198, 1514, 1912, 176, 882, 870, 1265, 1614, 888, 688, 420, 1237, 1618, 443,

In [95]:
# Check the year distribution of the development and training splits.
# The ids are looked up positionally in choice_data. Comprehensions keep the
# loop variable local, so the builtin `id` is not overwritten in the notebook
# namespace.
dev_count = [choice_data.iloc[choice_id]['year'] for choice_id in dev_ids]
print("dev:::", Counter(dev_count))
train_count = [choice_data.iloc[choice_id]['year'] for choice_id in train_ids]
print("train:::", Counter(train_count))

dev::: Counter({np.int64(2019): 18, np.int64(2014): 18, np.int64(2005): 18, np.int64(2009): 18, np.int64(2015): 18, np.int64(2011): 18, np.int64(2020): 18, np.int64(2016): 18, np.int64(2013): 18, np.int64(2018): 18, np.int64(2008): 18, np.int64(2017): 18, np.int64(2012): 18, np.int64(2010): 18, np.int64(2007): 17, np.int64(2006): 17, np.int64(2003): 12, np.int64(2004): 9, np.int64(2002): 7, np.int64(2001): 3})
train::: Counter({np.int64(2020): 302, np.int64(2019): 252, np.int64(2018): 211, np.int64(2016): 125, np.int64(2015): 104, np.int64(2014): 66, np.int64(2017): 65, np.int64(2013): 51, np.int64(2012): 47, np.int64(2010): 28, np.int64(2009): 23, np.int64(2006): 22, np.int64(2007): 16, np.int64(2005): 10, np.int64(2011): 10, np.int64(2008): 6})


In [96]:
# Size check. Fail loudly instead of silently printing nothing, so that a
# broken split is never written to disk by the cells that follow.
assert len(dev_ids) == len(test_df), (
    f"dev size {len(dev_ids)} does not match test size {len(test_df)}"
)
assert len(train_ids) + len(dev_ids) + len(test_df) == len(train_df) + len(test_df), (
    "train and dev ids do not add up to the size of the training pool"
)
assert set(dev_ids).isdisjoint(train_ids), "dev and train ids overlap"
print('ok')

ok


In [97]:
# Write each split to its own text file, one choice_id per line.
os.makedirs('./data/full_data_split', exist_ok=True)
split_outputs = [
    ('./data/full_data_split/test_ids.txt', test_df['choice_id'].to_list()),
    ('./data/full_data_split/dev_ids.txt', dev_ids),
    ('./data/full_data_split/train_ids.txt', train_ids),
]
for output_path, split_ids in split_outputs:
    with open(output_path, 'w') as idx_file:
        idx_file.writelines(str(split_id) + '\n' for split_id in split_ids)