### This file preprocesses the data and creates the cross-validation split (Section 5 of the paper)

In [6]:
import os
import sys
import random
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit, MultilabelStratifiedKFold

In [7]:
#random_states = [random.randint(0, 2**32) for i in range(5)]

In [8]:
# Fixed seeds, kept as literals so the generated folds are reproducible.
# NOTE(review): presumably drawn once with the commented-out
# random.randint(0, 2**32) expression in the cell above -- confirm.
random_states = [
    2746317213,
    1181241943,
    958682846,
    3163119785,
    1812140441,
]

In [9]:
def seed_everything(seed: int):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [10]:
# Seed the global RNGs once, up front; the splitters created later in this
# notebook are additionally given explicit random_state values.
seed_everything(seed=42)

In [11]:
# Label columns of the corpus; together they form the multilabel target
# matrix that the cross-validation splits are stratified on.
DIMS = [
    "Inappropriateness",
    "Toxic Emotions", "Excessive Intensity", "Emotional Deception",
    "Missing Commitment", "Missing Seriousness", "Missing Openness",
    "Missing Intelligibility", "Unclear Meaning", "Missing Relevance", "Confusing Reasoning",
    "Other Reasons", "Detrimental Orthography", "Reason Unclassified",
]

In [12]:
# Build repeated, multilabel-stratified TRAIN / VALID / TEST assignments and
# store them as extra columns of the corpus.
#
# For every seed in `random_states` a 5-fold split is drawn, so each fold holds
# out 1/5 of the rows as TEST. The remaining 4/5 are split once more with
# test_size=0.125, i.e. 0.125 * 0.8 = 10% of all rows become VALID and the
# other ~70% become TRAIN. The result is one column per (seed, fold) pair
# named 'fold{j}.{i}'.
data_dir = '../../data/'
df = pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_conservative.csv')

# X only has to provide the number of samples to the splitters; the
# stratification itself is driven by the label matrix y.
X = df.post_id.values
y = df[DIMS].values

for j, random_state in enumerate(random_states):
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    for i, (train_valid_index, test_index) in enumerate(mskf.split(X, y)):
        # Labels are assigned by row position rather than through a dict keyed
        # by post_id, so rows sharing a post_id can never overwrite each other.
        split_labels = np.empty(len(df), dtype=object)
        split_labels[test_index] = 'TEST'

        # The inner split uses a fixed seed (42) for every outer seed and fold.
        msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
        X_train_valid, y_train_valid = X[train_valid_index], y[train_valid_index]
        for train_pos, valid_pos in msss.split(X_train_valid, y_train_valid):
            # train_pos / valid_pos index into the train+valid subset, so they
            # are mapped back to row positions of the full frame.
            split_labels[train_valid_index[train_pos]] = 'TRAIN'
            split_labels[train_valid_index[valid_pos]] = 'VALID'

        df['fold{}.{}'.format(j, i)] = split_labels

# Model input text: the issue followed by the post, separated by one space.
df['arg_issue'] = df[['issue','post_text']].apply(lambda x: ' '.join(x), axis = 1)

# NOTE(review): to_csv is called without index=False, so the row index is
# written as an additional unnamed column; left unchanged in case downstream
# readers rely on that layout.
df.to_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_conservative_w_folds.csv')

In [13]:
# Preview the first rows to spot-check the added fold columns and arg_issue.
df.head()

Unnamed: 0,post_id,source_dataset,issue,post_text,Inappropriateness,Toxic Emotions,Excessive Intensity,Emotional Deception,Missing Commitment,Missing Seriousness,...,fold3.1,fold3.2,fold3.3,fold3.4,fold4.0,fold4.1,fold4.2,fold4.3,fold4.4,arg_issue
0,0,0,Is the school uniform a good or bad idea:,students should wear what they like and feel f...,1,0,0,0,0,False,...,TRAIN,TRAIN,TRAIN,TEST,TRAIN,TRAIN,TRAIN,TRAIN,TEST,Is the school uniform a good or bad idea: stud...
1,1,0,Is the school uniform a good or bad idea:,"people cant be forced to wear school uniforms,...",1,0,0,0,1,False,...,TRAIN,TRAIN,VALID,TRAIN,TRAIN,TEST,TRAIN,VALID,TRAIN,Is the school uniform a good or bad idea: peop...
2,2,0,Firefox vs internet explorer:,"That form of argument degrades this forum, and...",1,1,0,1,1,True,...,TEST,TRAIN,TRAIN,TRAIN,TRAIN,TRAIN,TRAIN,TEST,VALID,Firefox vs internet explorer: That form of arg...
3,3,0,If your spouse committed murder and he or she ...,I wouldnt turn her in becuase she is my wife. ...,0,0,0,0,0,False,...,TRAIN,TRAIN,TRAIN,TRAIN,TRAIN,TEST,TRAIN,TRAIN,TRAIN,If your spouse committed murder and he or she ...
4,4,0,If your spouse committed murder and he or she ...,No I wouldn't turn in my spouse. Just because ...,1,0,0,0,1,False,...,TRAIN,TRAIN,VALID,TRAIN,TRAIN,TRAIN,TRAIN,TRAIN,TEST,If your spouse committed murder and he or she ...
