In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split

### Unifying the csv files:

#### For Training:

In [3]:
kp_df     = pd.read_csv('../KPA_2021_shared_task/kpm_data/key_points_train.csv')
arg_df    = pd.read_csv('../KPA_2021_shared_task/kpm_data/arguments_train.csv')
labels_df = pd.read_csv('../KPA_2021_shared_task/kpm_data/labels_train.csv')

In [4]:
all_df = labels_df.merge(arg_df, how='inner', left_on='arg_id', right_on='arg_id')
all_df = all_df.merge(kp_df[['key_point_id', 'key_point']], how='inner', left_on='key_point_id', right_on='key_point_id')

In [5]:
all_df.head()

Unnamed: 0,arg_id,key_point_id,label,argument,topic,stance,key_point
0,arg_0_0,kp_0_0,0,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person t...
1,arg_0_1,kp_0_0,0,A patient should be able to decide when they h...,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person t...
2,arg_0_2,kp_0_0,0,a person has the right to end their suffering ...,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person t...
3,arg_0_4,kp_0_0,0,a person should have the right to be able to c...,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person t...
4,arg_0_5,kp_0_0,0,a person should have the right to die on their...,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person t...


In [56]:
all_df.to_csv('../data/training_df.csv')

#### For dev:

In [6]:
kp_df     = pd.read_csv('../KPA_2021_shared_task/kpm_data/key_points_dev.csv')
arg_df    = pd.read_csv('../KPA_2021_shared_task/kpm_data/arguments_dev.csv')
labels_df = pd.read_csv('../KPA_2021_shared_task/kpm_data/labels_dev.csv')

all_df = labels_df.merge(arg_df, how='inner', left_on='arg_id', right_on='arg_id')
all_df = all_df.merge(kp_df[['key_point_id', 'key_point']], how='inner', left_on='key_point_id', right_on='key_point_id')

all_df.to_csv('../data/valid_df.csv')

In [7]:
all_df.head()

Unnamed: 0,arg_id,key_point_id,label,argument,topic,stance,key_point
0,arg_4_121,kp_4_5,1,A real education is about giving students the ...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
1,arg_4_122,kp_4_5,1,children express themselves through the clothe...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
2,arg_4_123,kp_4_5,1,"children should be able to dress as they wish,...",We should abandon the use of school uniform,1,School uniform is harming the student's self e...
3,arg_4_124,kp_4_5,1,children should be allowed to express themselves,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
4,arg_4_126,kp_4_5,1,freedom of expression extends to the right to ...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...


In [8]:
len(all_df)

3458

### Split the develpment:

In [60]:
dev_df = pd.read_csv('../data/valid_df.csv')

In [67]:
argument_ids = list(set(dev_df.arg_id.tolist()))
test_arg_ids, valid_arg_ids = train_test_split(argument_ids, test_size=0.7)

In [68]:
dev_df.head()

Unnamed: 0.1,Unnamed: 0,arg_id,key_point_id,label,argument,topic,stance,key_point
0,0,arg_4_121,kp_4_5,1,A real education is about giving students the ...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
1,1,arg_4_122,kp_4_5,1,children express themselves through the clothe...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
2,2,arg_4_123,kp_4_5,1,"children should be able to dress as they wish,...",We should abandon the use of school uniform,1,School uniform is harming the student's self e...
3,3,arg_4_124,kp_4_5,1,children should be allowed to express themselves,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
4,4,arg_4_126,kp_4_5,1,freedom of expression extends to the right to ...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...


In [69]:
len(test_arg_ids)

279

In [70]:
len(valid_arg_ids)

653

In [71]:
test_df  = dev_df[dev_df.arg_id.isin(test_arg_ids)]
valid_df = dev_df[dev_df.arg_id.isin(valid_arg_ids)]

In [72]:
print('Valid:')
print(len(valid_df))
print(valid_df.arg_id.nunique())
print(valid_df.key_point_id.nunique())

Valid:
2400
653
36


In [73]:
print('Test:')
print(len(test_df))
print(test_df.arg_id.nunique())
print(test_df.key_point_id.nunique())

Test:
1058
279
36


In [75]:
valid_df.to_csv('../data/our_valid.csv')
test_df.to_csv('../data/our_test.csv')

### Matching our split to the format of IBM:

In [86]:
valid_key_points_df = valid_df[['key_point_id', 'key_point', 'topic', 'stance']].drop_duplicates()
valid_arguments_df = valid_df[['arg_id', 'argument', 'topic', 'stance']].drop_duplicates()
valid_labels_df = valid_df[['arg_id', 'key_point_id', 'label']]

valid_key_points_df.to_csv('../data/key_points_our_valid.csv')
valid_arguments_df.to_csv('../data/arguments_our_valid.csv')
valid_labels_df.to_csv('../data/labels_our_valid.csv')

In [89]:
test_key_points_df = test_df[['key_point_id', 'key_point', 'topic', 'stance']].drop_duplicates()
test_arguments_df = test_df[['arg_id', 'argument', 'topic', 'stance']].drop_duplicates()
test_labels_df = test_df[['arg_id', 'key_point_id', 'label']]

test_key_points_df.to_csv('../data/key_points_our_test.csv')
test_arguments_df.to_csv('../data/arguments_our_test.csv')
test_labels_df.to_csv('../data/labels_our_test.csv')

In [76]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,arg_id,key_point_id,label,argument,topic,stance,key_point
2,2,arg_4_123,kp_4_5,1,"children should be able to dress as they wish,...",We should abandon the use of school uniform,1,School uniform is harming the student's self e...
4,4,arg_4_126,kp_4_5,1,freedom of expression extends to the right to ...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
6,6,arg_4_130,kp_4_5,0,Kids having to wear school uniforms is not fai...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
9,9,arg_4_133,kp_4_5,0,making kids wear uniforms that are all the sam...,We should abandon the use of school uniform,1,School uniform is harming the student's self e...
12,12,arg_4_137,kp_4_5,0,"Most students do not want school uniforms, the...",We should abandon the use of school uniform,1,School uniform is harming the student's self e...


In [77]:
kp_df.columns

Index(['key_point_id', 'key_point', 'topic', 'stance'], dtype='object')

In [78]:
arg_df.columns

Index(['arg_id', 'argument', 'topic', 'stance'], dtype='object')

In [85]:
labels_df.columns

Index(['arg_id', 'key_point_id', 'label'], dtype='object')