### This file combines arguments from the UKPConvArg1 corpus and the GAQCorpus to create the corpus we use in our annotation study.

In [None]:
import glob
import pandas as pd
import numpy as np

from os import listdir
from collections import Counter
from os.path import isfile, join
from sklearn.model_selection import StratifiedKFold
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.pipeline import Sentencizer

pd.set_option('display.max_colwidth', None)

In [None]:
# Root of the local data checkout; every input and output path below hangs off it.
data_dir = '../../data/'
# Directory holding the UKPConvArg1 pair files (one file per topic/stance).
# data_dir already ends with '/', so join() simply appends the sub-path.
ukp2_dir = join(data_dir, 'emnlp2016-empirical-convincingness/data/CSV-format/')

In [None]:
# Load every UKPConvArg1 pair file into a single frame, tagging each row with
# the issue and stance encoded in its file name.
#
# glob.glob() returns paths in arbitrary (filesystem) order. Later cells walk
# df_ukp2 row by row and keep the first text seen per id, and the batches are
# drawn with a fixed random_state, so the file order must be fixed for the
# resulting corpus to be reproducible across machines.
rel_files_ukp2 = sorted(glob.glob(ukp2_dir+'*'))
dfs = []
for file in rel_files_ukp2:
    # The data directory also ships a license file; it is not a pair file.
    if 'LICENSE.txt' in file:
        continue
    tmp_df = pd.read_csv(file, sep='\t', names=['pair_id','gold_label','more_conv_arg','less_conv_arg'])
    # Strip the directory and the '.csv' / '.xml' suffixes once; what remains
    # is '<issue>_<stance>[...]'.
    name_parts = file.split('/')[-1].split('.csv')[0].split('.xml')[0].split('_')
    tmp_df['issue'] = name_parts[0]
    tmp_df['stance'] = name_parts[1]
    dfs.append(tmp_df)
df_ukp2 = pd.concat(dfs)

In [None]:
# pair_id is '<first id>_<second id>': split it once, then fan the two halves
# out into their own columns.
pair_id_parts = df_ukp2['pair_id'].map(lambda pair_id: pair_id.split('_'))
df_ukp2['more_conv_id'] = pair_id_parts.map(lambda parts: parts[0])
df_ukp2['less_conv_id'] = pair_id_parts.map(lambda parts: parts[1])

In [None]:
# Recover one argument text per argument id from the pairwise rows.
#   real_ids:    argument id -> list of candidate texts. An id starts with the
#                two texts of the row that introduced it and is narrowed to a
#                single text when a later row shares one of those texts.
#   real_issues: argument id -> issue of the row that introduced the id.
real_ids = {}
real_issues = {}
for i, row in df_ukp2.iterrows():
    # NOTE: this is one elif chain, so at most one branch runs per row; a row
    # that introduces its first id does not also register its second id.
    if row['more_conv_id'] not in real_ids:
        # First sighting of the row's first id: both texts are candidates.
        real_ids[row['more_conv_id']] = [row['more_conv_arg'],row['less_conv_arg']]
        real_issues[row['more_conv_id']] = row['issue']
    elif row['less_conv_id'] not in real_ids:
        # First sighting of the row's second id: both texts are candidates.
        real_ids[row['less_conv_id']] = [row['more_conv_arg'],row['less_conv_arg']]
        real_issues[row['less_conv_id']] = row['issue']
    elif len(real_ids[row['more_conv_id']]) == 2:
        # First id still ambiguous: keep whichever text of this row is already
        # among its candidates (the first-column text is checked first).
        if row['more_conv_arg'] in real_ids[row['more_conv_id']]:
            real_ids[row['more_conv_id']] = [row['more_conv_arg']]
        elif row['less_conv_arg'] in real_ids[row['more_conv_id']]:
            real_ids[row['more_conv_id']] = [row['less_conv_arg']]
    elif len(real_ids[row['less_conv_id']]) == 2:
        # Second id still ambiguous: same narrowing as above.
        if row['more_conv_arg'] in real_ids[row['less_conv_id']]:
            real_ids[row['less_conv_id']] = [row['more_conv_arg']]
        elif row['less_conv_arg'] in real_ids[row['less_conv_id']]:
            real_ids[row['less_conv_id']] = [row['less_conv_arg']]
# NOTE(review): ids that are never narrowed keep two candidates, and the cell
# that builds df_ukp takes element [0]. For an id introduced through the
# less_conv_id branch that element is the row's first-column text — confirm
# this cannot pick the wrong argument.

In [None]:
# One row per recovered UKP argument id: the id, the first candidate text kept
# for it, and the issue recorded when the id was first seen.
ukp_ids = list(real_ids.keys())
df_ukp = pd.DataFrame(data={
    '#id': ukp_ids,
    'argument': [real_ids[arg_id][0] for arg_id in ukp_ids],
    'issue': [real_issues[arg_id] for arg_id in ukp_ids],
})

In [None]:
### Sanity check: count the distinct UKP ids that also occur in the Dagstuhl corpus
dagstuhl_path = data_dir+'dagstuhl-15512-argquality-corpus-v2/dagstuhl-15512-argquality-corpus-annotated.csv'
df_dagstuhl = pd.read_csv(dagstuhl_path, sep='\t', encoding='1254')
# Boolean mask over df_ukp rows whose id appears in the Dagstuhl '#id' column.
in_dagstuhl = df_ukp['#id'].isin(df_dagstuhl['#id'])
len(set(df_ukp.loc[in_dagstuhl, '#id']))

In [None]:
# Load the three GAQCorpus split files (Q&A forums, debate forums, review forums).
gaq_paths = (
    data_dir+'GAQCorpus_split/qa_forums_mixtrain_overlaptest_crowdtest.csv',
    data_dir+'GAQCorpus_split/debate_forums_mixtrain_overlaptest_crowdtest.csv',
    data_dir+'GAQCorpus_split/review_forums_mixtrain_overlaptest_crowdtest.csv',
)
df_qa, df_debate, df_review = (pd.read_csv(gaq_path) for gaq_path in gaq_paths)

In [None]:
# Flatten the four sources into parallel lists, always in the order
# UKP, Q&A forums, debate forums, review forums.
ids = df_ukp['#id'].tolist()
sources = df_ukp['argument'].tolist()
issues = df_ukp['issue'].tolist()
for gaq_df in (df_qa, df_debate, df_review):
    ids.extend(gaq_df['id'].tolist())
    sources.extend(gaq_df['text'].tolist())
    issues.extend(gaq_df['title'].tolist())

# Placeholder batch number for every row; overwritten when the folds are made.
batch = [1] * len(ids)
# Source-type code per row: 0 = UKP, 1 = Q&A, 2 = debate, 3 = review.
types = [0] * len(df_ukp) + [1] * len(df_qa) + [2] * len(df_debate) + [3] * len(df_review)

In [None]:
# Assemble the combined corpus from the parallel lists built above.
corpus_columns = {
    'id': ids,
    'source': sources,
    'issue': issues,
    'batch': batch,
    'types': types,
}
corpus_df = pd.DataFrame(data=corpus_columns)

In [None]:
### Split into 14 batches for the annotation study
# Folds are stratified on the source-type code in `types`; only the test part
# of each split is used, so every row lands in exactly one batch.
cv = StratifiedKFold(n_splits=14, random_state=42, shuffle=True)
dfs = []
for fold, (_, test_idxs) in enumerate(cv.split(corpus_df, corpus_df.types)):
    # Build the batch as a new frame (assign/drop return copies) instead of
    # assigning into and dropping in place on an .iloc slice of corpus_df,
    # which is the chained-assignment pattern pandas warns about.
    fold_df = corpus_df.iloc[test_idxs].assign(batch=fold).drop(columns=['types'])
    fold_df.to_csv(data_dir+'appropriateness-corpus/annotation_dataset_{}.csv'.format(fold), index=False)
    dfs.append(fold_df)

In [None]:
### Create full annotation study dataset
# Stack the per-batch frames back into one frame, in batch order.
full_df = pd.concat(dfs)
# The commented-out lines below are alternative export variants (renumbered
# ids, escaped newlines, other output names).
#full_df['id'] = list(range(len(full_df)))
#full_df['source'] = full_df['source'].apply(lambda x : x.replace('\n', '\\n'))
#full_df.to_csv(data_dir+'appropriateness-corpus/annotation_dataset.csv', index=False)
#full_df.to_csv(data_dir+'appropriateness-corpus/annotation_dataset_types.csv', index=False)
# NOTE(review): the batch frames had their 'types' column dropped before being
# collected, so this file has no 'types' column despite its name — confirm
# that is intended.
full_df.to_csv(data_dir+'appropriateness-corpus/annotation_dataset_types_sourceids.csv', index=False)

### Compute corpus statistics:

In [None]:
# Number of distinct argument ids recovered from the UKP pair files.
len(real_ids)

In [None]:
# Number of distinct texts among the first candidates kept per id; a value
# below len(real_ids) means several ids share the same text.
len({candidates[0] for candidates in real_ids.values()})

In [None]:
# Number of rows in the Q&A-forum split.
df_qa.shape[0]

In [None]:
# Number of rows in the debate-forum split.
df_debate.shape[0]

In [None]:
# Number of rows in the review-forum split.
df_review.shape[0]

In [None]:
# Number of arguments per issue, most frequent issue first.
corpus_df['issue'].value_counts()

In [None]:
# Number of distinct issues in the combined corpus.
distinct_issues = corpus_df['issue'].unique()
len(distinct_issues)

In [None]:
# Histogram of issue sizes: maps "arguments per issue" to the number of issues
# that have exactly that many arguments.
arguments_per_issue = corpus_df['issue'].value_counts().tolist()
Counter(arguments_per_issue)

In [None]:
# Mean argument length in characters.
char_counts = [len(text) for text in corpus_df['source'].tolist()]
np.mean(char_counts)

In [None]:
# Blank English pipeline used only for the sentence and token statistics below.
nlp = English()
# Tokenizer built from the vocab alone.
# NOTE(review): per the spaCy docs such a tokenizer has no punctuation rules
# and splits on whitespace only — confirm that is the intended token definition.
tokenizer = Tokenizer(nlp.vocab)
# NOTE(review): this standalone Sentencizer is not used in the visible cells;
# the pipeline gets its own component from add_pipe below.
sentencizer = Sentencizer()
# Rule-based sentence splitting so that `doc.sents` is available.
nlp.add_pipe('sentencizer')

In [None]:
# Mean number of sentences per argument, as split by the pipeline's sentencizer.
sentence_counts = [len(list(nlp(text).sents)) for text in corpus_df['source'].tolist()]
np.mean(sentence_counts)

In [None]:
# Mean number of tokens per argument, as produced by `tokenizer`.
token_counts = [len(tokenizer(text)) for text in corpus_df['source'].tolist()]
np.mean(token_counts)