In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
# Load the article lookup table; the rendered output shows an 'id' column and
# an 'article' (title) column.
article = pd.read_csv('./data/articles.csv')
article

Unnamed: 0,id,article
0,2,Right to life
1,3,Prohibition of torture
2,4,Prohibition of slavery and forced labour
3,5,Right to liberty and security
4,6,Right to a fair trial
5,7,No punishment without law
6,8,Right to respect for private and family life
7,9,"Freedom of thought, conscience and religion"
8,10,Freedom of expression
9,11,Freedom of assembly and association


In [4]:
# Read the three JSON-lines splits in one pass; the unpacking order matches
# the order of the paths (train, dev, test).
train_data, val_data, test_data = (
    pd.read_json(split_path, lines=True)
    for split_path in ('./data/train.jsonl', './data/dev.jsonl', './data/test.jsonl')
)

In [5]:
# Stack the three splits into a single frame. reset_index() is called without
# drop=True, so each split's own row number is kept as an 'index' column
# alongside the new running index (visible in the rendered output).
data = pd.concat([train_data, val_data, test_data]).reset_index()
data

Unnamed: 0,index,case_id,case_no,title,judgment_date,facts,applicants,defendants,allegedly_violated_articles,violated_articles,court_assessment_references,silver_rationales,gold_rationales
0,0,001-59587,25702/94,CASE OF K. AND T. v. FINLAND,2001-07-12,[11. At the beginning of the events relevant ...,"[K., T.]",[FINLAND],"[13, 8]",[8],"{'8': ['12', '140', '155', '156', '157', '158'...","[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30...",[]
1,1,001-59591,42527/98,CASE OF PRINCE HANS-ADAM II OF LIECHTENSTEIN v...,2001-07-12,[9. The applicant is the monarch of Liechtens...,[PRINCE HANS-ADAM II OF LIECHTENSTEIN],[GERMANY],"[14, P1-1, 6]",[],"{'6': ['12', '15', '24', '25', '26', '27', '28...","[3, 6]",[]
2,2,001-59590,33071/96,CASE OF MALHOUS v. THE CZECH REPUBLIC,2001-07-12,[9. In June 1949 plots of agricultural land o...,[MALHOUS],[CZECH REPUBLIC],[6],[6],"{'6': ['13', '14', '35', '40', '41', '42', '43...","[4, 5]",[]
3,3,001-59588,29032/95,CASE OF FELDEK v. SLOVAKIA,2001-07-12,"[8. In 1991 Mr Dušan Slobodník, a research wo...",[FELDEK],[SLOVAKIA],"[14, 10, 9]",[10],{'10': ['35']},[27],[]
4,4,001-59589,44759/98,CASE OF FERRAZZINI v. ITALY,2001-07-12,"[9. The applicant is an Italian citizen, born...",[FERRAZZINI],[ITALY],"[14, 6]",[],"{'6': ['13', '14', '35', '40', '41', '42', '43...",[4],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,995,001-194239,15989/11,CASE OF MITITELU AND ANTONOVICI v. THE REPUBLI...,2019-07-02,[5. The applicants were born in 1971 and 1976...,"[MITITELU, ANTONOVICI]",[MOLDOVA],"[10, 6]",[6],"{'10': ['21', '24', '31']}",[],[]
10996,996,001-194243,48108/07,CASE OF BEŞLEAGĂ v. THE REPUBLIC OF MOLDOVA AN...,2019-07-02,"[5. The applicant, who was born in 1948, live...",[BEŞLEAGĂ],"[MOLDOVA, RUSSIA]","[10, 5, 13, 3, 6]","[6, 5, 10, 3, 13]","{'10': ['21', '24', '31']}",[],[]
10997,997,001-194241,57468/08,CASE OF PANTELEICIUC v. THE REPUBLIC OF MOLDOV...,2019-07-02,[5. The applicant was born in 1980 and lives ...,[PANTELEICIUC],"[MOLDOVA, RUSSIA]",[6],[6],{'6': ['9']},[4],[]
10998,998,001-194248,25082/06,CASE OF POPOV v. RUSSIA,2019-07-02,[4. The applicant was born in 1972 and is det...,[POPOV],[RUSSIA],[3],[3],"{'3': ['15', '17']}","[11, 13]",[]


In [7]:
# def create_labels(data, articles, type_articles, desc):
#     case_id = []
#     labels = []
#     for i in range(len(data)):
#         case_id.append(data['case_id'][i])
#         label = [0] * len(articles)
#         for a in data[type_articles][i]:
#             for j in range(len(articles)):
#                 if articles['id'][j] == a:
#                     label[j] = 1
#         labels.append(label)
#     pd.concat([pd.DataFrame({'case_id': idx}), pd.DataFrame(labels)], axis= 1).to_csv(f'{desc}_{type_articles}_labels.csv', index= False)
def create_labels(data, articles, type_articles, desc):
    """Write a multi-hot article label vector for every case to a CSV file.

    For each row of ``data`` a list with one 0/1 entry per row of
    ``articles`` is built: entry ``j`` is 1 when the identifier at position
    ``j`` of ``articles['id']`` occurs in that case's ``type_articles`` list.

    Args:
        data: DataFrame with a ``case_id`` column and a ``type_articles``
            column whose cells are iterables of article identifiers.
        articles: DataFrame with an ``id`` column; its row order fixes the
            order of the entries in each label vector.
        type_articles: Name of the column of ``data`` to encode.
        desc: Split name used as the prefix of the output file name.

    Writes:
        ``./data/{desc}_{type_articles}_labels.csv`` with the columns
        ``case_id`` and ``labels`` (each list is stored as its string form).
    """
    # Compare identifiers as strings. Ids loaded from a CSV are parsed as
    # integers, while the per-case lists can hold strings (e.g. 'P1-1'), and
    # ``2 == '2'`` is False -- which would silently leave every label at 0.
    article_ids = [str(article_id) for article_id in articles['id']]

    # Iterate over the column values directly instead of ``data[col][i]`` so
    # the result does not depend on ``data`` having a default 0..n-1 index.
    case_id = list(data['case_id'])
    labels = []
    for case_articles in data[type_articles]:
        present = {str(article_id) for article_id in case_articles}
        labels.append([1 if article_id in present else 0 for article_id in article_ids])

    pd.DataFrame({'case_id': case_id, 'labels': labels}).to_csv(
        f'./data/{desc}_{type_articles}_labels.csv', index=False
    )

# Export label files for every split and for both article columns, in the
# order train -> val -> test.
for split_name, split_frame in (('train', train_data), ('val', val_data), ('test', test_data)):
    for article_column in ('allegedly_violated_articles', 'violated_articles'):
        create_labels(split_frame, article, article_column, split_name)

In [6]:
def _write_case_map(data, column, path):
    """Write one ``(case_id, item)`` row per element of ``data[column]`` to ``path``."""
    # zip() walks the two columns positionally, so the frame's index labels
    # are irrelevant (``data[col][i]`` would need a default 0..n-1 index).
    pairs = [
        (case, item)
        for case, items in zip(data['case_id'], data[column])
        for item in items
    ]
    pd.DataFrame(pairs, columns=['case_id', column]).to_csv(path, index=False)


def create_map(data, desc):
    """Flatten the applicant and defendant lists of each case into CSV maps.

    Args:
        data: DataFrame with a ``case_id`` column and ``applicants`` /
            ``defendants`` columns whose cells are iterables of names.
        desc: Split name used as the prefix of the output file names.

    Writes:
        ``./data/{desc}_applicant_map.csv`` (columns ``case_id``,
        ``applicants``) and ``./data/{desc}_defendant_map.csv`` (columns
        ``case_id``, ``defendants``), one row per (case, name) pair. Cases
        with an empty list contribute no rows.
    """
    _write_case_map(data, 'applicants', f'./data/{desc}_applicant_map.csv')
    _write_case_map(data, 'defendants', f'./data/{desc}_defendant_map.csv')

In [7]:
# create_map(train_data, 'train')
# create_map(val_data, 'val')
# create_map(test_data, 'test')

In [16]:
# Dump each split as a CSV file in the current working directory.
for split_frame, csv_name in (
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [24]:
# NOTE(review): none of the cells visible here writes test_data_mapping.csv --
# presumably it is produced outside this notebook; confirm its origin.
test = pd.read_csv('./data/test_data_mapping.csv')

In [8]:
def _write_unique_values(data, column, path):
    """Write the distinct items of the lists in ``data[column]`` to ``path`` with ids 0..k-1."""
    # dict.fromkeys() removes duplicates while keeping first-seen order and
    # avoids the quadratic ``item not in list`` scan. Items must be hashable.
    unique_items = list(dict.fromkeys(
        item for items in data[column] for item in items
    ))
    pd.DataFrame({
        'id': list(range(len(unique_items))),
        column: unique_items,
    }).to_csv(path, index=False)


def create_list(data, desc):
    """Write id tables of the distinct applicants and defendants of a split.

    Args:
        data: DataFrame with ``applicants`` and ``defendants`` columns whose
            cells are iterables of names.
        desc: Split name used as the prefix of the output file names.

    Writes:
        ``./data/{desc}_applicants.csv`` (columns ``id``, ``applicants``) and
        ``./data/{desc}_defendants.csv`` (columns ``id``, ``defendants``).
        Ids are consecutive integers starting at 0, assigned in the order in
        which each name is first encountered.
    """
    _write_unique_values(data, 'applicants', f'./data/{desc}_applicants.csv')
    _write_unique_values(data, 'defendants', f'./data/{desc}_defendants.csv')

# create_list(train_data, 'train')
# create_list(val_data, 'val')
# create_list(test_data, 'test')

In [2]:
def create_articles(data, label, desc):
    """Flatten a per-case article list column into a (case, article) CSV.

    Args:
        data: DataFrame with a ``case_id`` column and a ``label`` column
            whose cells are iterables of article identifiers.
        label: Name of the column of ``data`` to flatten.
        desc: Split name used as the prefix of the output file name.

    Writes:
        ``./data/{desc}_{label}.csv`` with the columns ``case_id`` and
        ``article_id``, one row per (case, article) pair. Cases with an empty
        list contribute no rows.
    """
    # zip() pairs the columns positionally, so the output does not depend on
    # ``data`` carrying a default 0..n-1 index (``data[col][i]`` would).
    pairs = [
        (case, article_id)
        for case, case_articles in zip(data['case_id'], data[label])
        for article_id in case_articles
    ]
    pd.DataFrame(pairs, columns=['case_id', 'article_id']).to_csv(
        f'./data/{desc}_{label}.csv', index=False
    )


In [7]:
# create_articles(train_data, 'allegedly_violated_articles', 'train')
# create_articles(train_data, 'violated_articles', 'train')
# create_articles(val_data, 'allegedly_violated_articles', 'val')
# create_articles(val_data, 'violated_articles', 'val')
# create_articles(test_data, 'allegedly_violated_articles', 'test')
# create_articles(test_data, 'violated_articles', 'test')
