# CyberWallE at SemEval-2020 Task 11
(V. Blaschke, M. Korniyenko & S. Tureski, 2020)

This file contains the main script for the system for subtask 1 (span identification) and the base model for task 2 (technique classification), as well as the feature ablation configurations.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
%tensorflow_version 1.x

# model.py

In [0]:
import pandas as pd
import numpy as np
from itertools import takewhile
import zipfile
import urllib.request
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, \
    TimeDistributed, Activation
from keras.layers.merge import Concatenate
from keras.models import Sequential, Model
from sklearn import svm, preprocessing
from sklearn.linear_model import LinearRegression
from keras.layers import Activation, Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding, Conv1D
from xgboost import XGBClassifier

########################
# Processing the input #
########################


# Helper method for prepare_data
def get_comments(filename, url=True):
    """Return the leading '#' comment lines of an input file.

    Arguments:
    filename -- a URL (if url is true) or a local file path
    url -- whether filename should be fetched via urllib

    Returns the list of consecutive lines at the very start of the file that
    begin with '#', line endings included. Reading stops at the first line
    that is not a comment, so later '#' lines are not collected.
    """
    if not url:
        with open(filename, 'r', encoding='utf8') as f:
            return list(takewhile(lambda s: s.startswith('#'), f))

    header = []
    with urllib.request.urlopen(filename) as response:
        for raw_line in response:
            # The response yields bytes; only comment lines are decoded.
            if not raw_line.startswith(b'#'):
                break
            header.append(raw_line.decode("utf-8"))
    return header


# Helper method for prepare_data
def get_cols(input_df, col):
    """Group the values of column `col` by sentence.

    Returns a one-column dataframe indexed by 'sent_id' in which each cell
    holds the list of `col` values belonging to that sentence.
    """
    per_sentence = input_df.groupby('sent_id')[col]
    return per_sentence.apply(list).to_frame()


# Helper method for prepare_data
def add_sent_lens(input_df, col='token'):
    """Add an 'n_toks' column holding the length of each entry of `col`.

    The dataframe is modified in place and also returned.
    """
    lengths = input_df[col].apply(len)
    input_df['n_toks'] = lengths
    return input_df


# Helper method for prepare_data
def get_features(input_df, feature_cols):
    """Build the sentence-level dataframe used as encoder input.

    Starts from the per-sentence token lists (plus their lengths in
    'n_toks') and joins one list-valued column per entry of feature_cols,
    matching rows on 'sent_id'.
    """
    merged = add_sent_lens(get_cols(input_df, 'token'))
    for name in feature_cols:
        feature_frame = get_cols(input_df, name)
        merged = merged.merge(feature_frame,
                              left_on='sent_id', right_on='sent_id')
    return merged


# Helper method for encode_x_bert
def bert_embeddings_for_sent(bert_tokens, row, feature_header, embedding_matrix,
                             embed_dim, sent_idx, uncased):
    """Write one sentence's BERT embeddings and features into the matrix.

    Arguments:
    bert_tokens -- list of (BERT token, embedding) pairs for the sentence
    row -- the sentence's dataframe row (needs .token, .n_toks and one
           attribute per entry of feature_header)
    feature_header -- names of the additional per-token feature columns
    embedding_matrix -- the matrix to fill; row sent_idx - 1 is modified
                        in place
    embed_dim -- length of a single BERT embedding
    sent_idx -- 1-based index of the sentence
    uncased -- whether the words are lower-cased before the comparison

    Each word receives the embedding of the first BERT token it starts
    with; BERT tokens that do not match the current word are skipped.
    """
    if len(bert_tokens) < len(row.token):
        # No need to worry if this prints something about '\ufeff'
        print('BERT', [i[0] for i in bert_tokens])
        print('X', row.token)

    word_idx = 0
    for tok, embed in bert_tokens:
        if word_idx == row.n_toks:
            break  # every word of the sentence has been handled
        word = str(row.token[word_idx])
        if word == '\ufeff':  # Prints a warning, but is dealt with.
            word_idx += 1
            continue
        if uncased:
            word = word.lower()
        if not (tok == word or word.startswith(tok)):
            # Typically a BERT word continuation piece
            # (e.g. per ##pet ##uate): nothing to store for it.
            continue
        # startswith: use the embedding of the first subtoken
        target = embedding_matrix[sent_idx - 1][word_idx]
        target[:embed_dim] = embed
        for offset, feature in enumerate(feature_header):
            target[embed_dim + offset] = getattr(row, feature)[word_idx]
        word_idx += 1


# Task 1: Token embeddings
def encode_x_bert(x, bert_file, feature_header, max_seq_len, embed_dim=768,
                  uncased=True):
    """Encode token-level input with BERT embeddings read from a file.

    Arguments:
    x -- sentence-level dataframe (one row per sentence)
    bert_file -- path of a tab-separated file whose lines hold
                 sentence index, layer, token and embedding
    feature_header -- names of the additional per-token feature columns
    max_seq_len -- second dimension of the returned matrix
    embed_dim -- length of a single BERT embedding
    uncased -- whether words are lower-cased when matched to BERT tokens

    Returns an array of shape
    (len(x), max_seq_len, embed_dim + len(feature_header)).
    """
    # TODO this currently assumes that the BERT file only contains information
    # about a single layer. extend this to multiple layers?
    vec_len = embed_dim + len(feature_header)
    embedding_matrix = np.zeros([len(x), max_seq_len, vec_len])
    rows = x.itertuples()
    pending = []     # (token, embedding) pairs of the sentence being read
    current_idx = 1  # sentence index the pending pairs belong to
    with open(bert_file, encoding='utf8') as f:
        for line in f:
            cells = line.split('\t')
            # The layer column is parsed but not used any further.
            sent_idx, _layer = int(cells[0]), int(cells[1])
            embedding = np.fromstring(cells[3][1:-1], sep=',')

            if sent_idx != current_idx:
                # A new sentence starts: store the one that just ended.
                if sent_idx % 1000 == 0:
                    print("BERT embeddings for sentence", sent_idx)
                row = next(rows)
                assert row.Index == current_idx
                bert_embeddings_for_sent(pending, row, feature_header,
                                         embedding_matrix, embed_dim,
                                         current_idx, uncased)
                pending = []

            pending.append((cells[2], embedding))
            current_idx = sent_idx

    # The last sentence is not followed by an index change; store it now.
    bert_embeddings_for_sent(pending, next(rows), feature_header,
                             embedding_matrix, embed_dim, current_idx,
                             uncased)
    return embedding_matrix


# Task 2: Sequence embeddings
def encode_x_seq(x, bert_file, feature_header, embed_dim=768, uncased=True,
                 n_bert_layers=1):
    """Encode sequence-level input with BERT embeddings read from a file.

    Arguments:
    x -- dataframe with one row per sequence, in the same order as the
         sequences in bert_file
    bert_file -- path of a tab-separated file; the fourth cell of each line
                 holds the embedding. Each sequence occupies n_bert_layers
                 consecutive lines.
    feature_header -- names of the additional (scalar) feature columns
    embed_dim -- length of a single BERT embedding
    uncased -- unused; kept so existing callers keep working
    n_bert_layers -- number of consecutive lines (layers) per sequence

    Returns an array of shape
    (len(x), embed_dim * n_bert_layers + len(feature_header)): the layer
    embeddings are concatenated, followed by the feature values.
    """
    bert_width = embed_dim * n_bert_layers
    embedding_matrix = np.zeros([len(x), bert_width + len(feature_header)])
    sequences = x.itertuples()
    with open(bert_file, encoding='utf8') as f:
        # Each pass of the outer loop consumes all lines of one sequence;
        # the additional layer lines are pulled from the same file iterator.
        for idx, line in enumerate(f):
            row = next(sequences)
            for bert_layer in range(n_bert_layers):
                if bert_layer > 0:
                    line = next(f)
                cells = line.split('\t')
                embedding = np.fromstring(cells[3][1:-1], sep=',')
                start = embed_dim * bert_layer
                embedding_matrix[idx][start:start + embed_dim] = embedding
            for i, feature in enumerate(feature_header):
                embedding_matrix[idx][bert_width + i] = getattr(row, feature)
    return embedding_matrix


def encode_x(x, word2embedding, feature_header, max_seq_len,
             embed_dim, uncased):
    """Encode the input data with static word embeddings.

    Arguments:
    x -- a Pandas dataframe (one row per sentence, indexed from 1)
    word2embedding -- a dict(str -> np.array) from tokens to embeddings
    feature_header -- dataframe names of additional feature columns
    max_seq_len -- the maximum number of tokens per sentence in x
    embed_dim -- the array length of the vectors in word2embedding
    uncased -- whether tokens are lower-cased before the lookup

    Returns an array of shape
    (len(x), max_seq_len, embed_dim + len(feature_header)). Tokens missing
    from word2embedding get a random vector.
    """
    width = embed_dim + len(feature_header)
    embedding_matrix = np.zeros([len(x), max_seq_len, width])
    for row in x.itertuples():
        sent_idx = row.Index - 1
        for tok_idx in range(row.n_toks):
            raw = str(row.token[tok_idx])
            word = raw.lower() if uncased else raw
            # The random fallback is drawn for every token, known or not.
            fallback = np.random.randn(embed_dim)
            vec = embedding_matrix[sent_idx][tok_idx]
            vec[:embed_dim] = word2embedding.get(word, fallback)
            for offset, feature in enumerate(feature_header):
                vec[embed_dim + offset] = getattr(row, feature)[tok_idx]
    return embedding_matrix


def encode_y(y, label2idx, max_seq_len, n_classes):
    """Encode labels (or per-label weights) as a numpy array.

    Arguments:
    y -- for max_seq_len > 1: a dataframe indexed from 1 whose 'label'
         column holds one list of labels per sentence; otherwise a Series
         of labels whose index values are used as array positions
    label2idx -- dict from label to its encoding (a scalar, or a list of
                 length n_classes)
    max_seq_len -- the maximum number of tokens per sentence
    n_classes -- length of a single label encoding (1 for scalars)

    Returns an array of shape (len(y), max_seq_len, n_classes); the class
    axis is dropped if n_classes == 1, and the sequence axis as well if
    additionally max_seq_len <= 1.
    """
    if n_classes != 1:
        labels = np.zeros([len(y), max_seq_len, n_classes])
    elif max_seq_len > 1:
        labels = np.zeros([len(y), max_seq_len])
    else:
        labels = np.zeros(len(y))

    if max_seq_len > 1:
        for row in y.itertuples():
            sent_idx = row.Index - 1
            for tok_idx, label in enumerate(row.label):
                labels[sent_idx][tok_idx] = label2idx[label]
    else:
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that exists in old and new versions alike.
        for position, label in y.items():
            labels[position] = label2idx[label]
    return labels


def prepare_data(config, word2embedding, phase):
    """Read and encode the data of one phase ('train', 'dev' or 'test').

    Arguments:
    config -- the configuration object (input locations, feature
              selection, encoding options)
    word2embedding -- dict from tokens to embeddings; only used if
                      config.USE_BERT is false
    phase -- 'train', 'dev' or 'test'

    Returns a tuple (df, x_raw, x_enc, y, sample_weight, comments,
    feature_cols). y and sample_weight are None unless phase is 'train'.
    """
    # We're getting the comments this way so we can:
    # - add them to the output
    # - parse lines that actually contain '#' as token
    if phase == 'train':
        infile = config.TRAIN_URL
    elif phase == 'dev':
        infile = config.DEV_URL
    elif phase == 'test':
        infile = config.TEST_URL
    comments = get_comments(infile, config.ONLINE_SOURCES)
    df = pd.read_csv(infile, sep='\t', skiprows=len(comments), quoting=3,
                     encoding='utf8')

    # Columns that are part of the file format rather than features.
    std_cols = (['document_id', 'sent_id', 'token_start',
                 'token_end', 'token', 'label']
                if config.TOKEN_LVL else
                ['document_id', 'span_start', 'span_end', 'text', 'label'])
    # If config.FEATURES is None, the features are determined from the
    # file header; otherwise only the listed columns are used.
    feature_cols = [
        col for col in df.columns
        if col not in config.EXCLUDE_FEATURES
        and (col not in std_cols if config.FEATURES is None
             else col in config.FEATURES)
    ]

    x_raw = get_features(df, feature_cols) if config.TOKEN_LVL else df

    if not config.USE_BERT:
        x_enc = encode_x(x_raw, word2embedding, feature_cols,
                         config.MAX_SEQ_LEN, config.EMBED_DIM,
                         config.UNCASED)
    else:
        if phase == 'train':
            bert_file = config.TRAIN_BERT
        elif phase == 'dev':
            bert_file = config.DEV_BERT
        elif phase == 'test':
            bert_file = config.TEST_BERT
        if config.TOKEN_LVL:
            x_enc = encode_x_bert(x_raw, bert_file, feature_cols,
                                  config.MAX_SEQ_LEN, config.EMBED_DIM,
                                  config.UNCASED)
        else:
            x_enc = encode_x_seq(x_raw, bert_file, feature_cols,
                                 config.EMBED_DIM, config.UNCASED,
                                 config.N_BERT_LAYERS)

    print(x_enc.shape)

    if phase != 'train':
        # Labels and weights are only encoded for the training data.
        return df, x_raw, x_enc, None, None, comments, feature_cols

    sample_weight = None
    if config.TOKEN_LVL:
        y_raw = get_cols(df, 'label')
        if config.N_CLASSES == 3:
            label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
        elif config.N_CLASSES == 2:
            label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
        y = encode_y(y_raw, label2idx, config.MAX_SEQ_LEN, config.N_CLASSES)
        sample_weight = encode_y(y_raw, config.CLASS_WEIGHTS,
                                 config.MAX_SEQ_LEN, n_classes=1)
    else:
        y = df.label
        if config.CLASS_WEIGHTS:
            sample_weight = encode_y(y, config.CLASS_WEIGHTS,
                                     config.MAX_SEQ_LEN, n_classes=1)

    return df, x_raw, x_enc, y, sample_weight, comments, feature_cols


def load_zipped_embeddings(infile):
    """Read word embeddings from the first file inside a zip archive.

    Every line is expected to hold a word followed by the whitespace-
    separated values of its vector. Returns a dict from word to a float32
    numpy array. Progress is printed every 100000 lines.
    """
    word2embedding = {}
    with zipfile.ZipFile(infile) as archive:
        first_member = archive.filelist[0].filename
        with archive.open(first_member, 'r') as f_in:
            for n_read, raw_line in enumerate(f_in, start=1):
                values = raw_line.decode().rstrip().split()
                word2embedding[values[0]] = np.asarray(values[1:],
                                                       dtype='float32')
                if n_read % 100000 == 0:
                    print("Read " + str(n_read) + " embeddings")
    return word2embedding


def get_data(config, word2embedding=None):
    """Load the embeddings (if needed) and encode train, dev and test data.

    Arguments:
    config -- the configuration object
    word2embedding -- optional dict from tokens to embeddings. If it is
                      empty or None and config.USE_BERT is false, the
                      embeddings are read from config.EMBEDDING_PATH
                      (a zip archive or a plain text file).

    Returns a Data object. Its test members are None if config.TEST_URL is
    not set.
    """
    if (not word2embedding) and (not config.USE_BERT):
        if config.EMBEDDING_PATH[-4:] == '.zip':
            word2embedding = load_zipped_embeddings(config.EMBEDDING_PATH)
        else:
            word2embedding = {}
            # Context manager: the file is closed even if a line cannot be
            # parsed. The encoding is explicit like everywhere else in this
            # file instead of depending on the platform default.
            with open(config.EMBEDDING_PATH, encoding='utf8') as f:
                for line in f:
                    values = line.rstrip().split()
                    word2embedding[values[0]] = np.asarray(values[1:],
                                                           dtype='float32')

    _, _, train_x, train_y, sample_weight, comments, features = prepare_data(
        config, word2embedding, phase='train')
    dev_df, dev_raw, dev_x, _, _, _, _ = prepare_data(config, word2embedding,
                                                      phase='dev')
    if config.TEST_URL:
        test_df, test_raw, test_x, _, _, _, _ = prepare_data(config,
                                                             word2embedding,
                                                             phase='test')
    else:
        test_df, test_raw, test_x = None, None, None
    return Data(train_x, train_y, dev_df, dev_raw, dev_x, test_df, test_raw,
                test_x, sample_weight, comments, features)


class Data:
    """Container for the encoded data of one run, with save/load support.

    The members can either be passed directly or, if `path` is given, be
    loaded from files previously written by save().
    """

    # Members stored via np.save / np.load ('<name>.npy').
    _REQUIRED_ARRAYS = ('train_x', 'train_y', 'dev_x')
    _OPTIONAL_ARRAYS = ('test_x', 'sample_weight')
    # Members stored via DataFrame.to_csv / pd.read_csv ('<name>').
    _REQUIRED_FRAMES = ('dev_raw', 'dev_df')
    _OPTIONAL_FRAMES = ('test_raw', 'test_df')

    def __init__(self,
                 # If initializing on the fly:
                 train_x=None, train_y=None,
                 dev_df=None, dev_raw=None, dev_x=None,
                 test_df=None, test_raw=None, test_x=None,
                 sample_weight=None, comments=None, features=None,
                 # If initializing from files:
                 path=None):
        self.train_x = train_x
        self.train_y = train_y
        self.sample_weight = sample_weight
        self.comments = comments
        self.features = features
        self.dev_df = dev_df
        self.dev_raw = dev_raw
        self.dev_x = dev_x
        self.test_df = test_df
        self.test_raw = test_raw
        self.test_x = test_x
        if path:
            self.load(path)

    @staticmethod
    def _discard(target):
        """Delete a file left over from an earlier save(), if present."""
        import os  # local import: keeps this class self-contained
        if os.path.exists(target):
            os.remove(target)

    def save(self, path='gdrive/My Drive/colab_projects/data/data/'):
        """Write all members to files below the prefix `path`.

        Members that are None (e.g. the test data when no test set is
        configured) are not written, and a stale file of the same name is
        removed so that load() does not pick up outdated data.
        """
        for name in self._REQUIRED_ARRAYS + self._OPTIONAL_ARRAYS:
            value = getattr(self, name)
            target = path + name + '.npy'
            if value is None:
                self._discard(target)
            else:
                # NOTE(review): object arrays (e.g. a Series of string
                # labels) are pickled by np.save and cannot be read back
                # by the plain np.load call in load() -- confirm whether
                # that round trip is needed.
                np.save(target, value)
        for name in self._REQUIRED_FRAMES + self._OPTIONAL_FRAMES:
            frame = getattr(self, name)
            target = path + name
            if frame is None:
                self._discard(target)
            else:
                frame.to_csv(target)
        with open(path + 'comments.txt', 'w', encoding='utf8') as f:
            for comment in self.comments or []:
                f.write(comment + '\n')
        with open(path + 'features.txt', 'w', encoding='utf8') as f:
            for feature in self.features or []:
                f.write(feature + '\n')

    def load(self, path='gdrive/My Drive/colab_projects/data/data/'):
        """Read the members from the files written by save().

        The training and development files must exist. The test data and
        the sample weights are optional and set to None if their file is
        missing.
        """
        import os  # local import: keeps this class self-contained

        for name in self._REQUIRED_ARRAYS:
            setattr(self, name, np.load(path + name + '.npy'))
        for name in self._OPTIONAL_ARRAYS:
            target = path + name + '.npy'
            setattr(self, name,
                    np.load(target) if os.path.exists(target) else None)

        # index_col=0: to_csv() stores the index as the first column.
        # Restoring it keeps the row index of the saved frame (code that
        # computes `row.Index - 1` depends on it) and avoids an extra
        # 'Unnamed: 0' column.
        for name in self._REQUIRED_FRAMES:
            setattr(self, name, pd.read_csv(path + name, index_col=0))
        for name in self._OPTIONAL_FRAMES:
            target = path + name
            setattr(self, name,
                    pd.read_csv(target, index_col=0)
                    if os.path.exists(target) else None)

        self.comments = self._read_lines(path + 'comments.txt')
        self.features = self._read_lines(path + 'features.txt')

    @staticmethod
    def _read_lines(filename):
        """Return the stripped, non-empty lines of a UTF-8 text file."""
        with open(filename, 'r', encoding='utf8') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]


######################
# Creating the model #
######################

def get_svm(train_x, train_y):
    """Fit and return a one-vs-one SVM classifier on the training data."""
    classifier = svm.SVC(decision_function_shape='ovo')
    # fit() returns the fitted estimator itself.
    return classifier.fit(train_x, train_y)


def get_xgb(train_x, train_y):
    """Fit and return an XGBoost classifier with default settings."""
    classifier = XGBClassifier()
    classifier.fit(train_x, train_y)
    return classifier


def get_ffnn(config, train_x, train_y, sample_weight, single_layer=False):
    """Train a feed-forward network on sequence-level input.

    Arguments:
    config -- the configuration object (layer sizes, loss, optimizer, ...)
    train_x -- 2D input array
    train_y -- pandas Series of labels; one-hot encoded internally
    sample_weight -- optional per-sample weights passed to fit()
    single_layer -- if true, the input is connected directly to the
                    softmax output; otherwise a hidden ReLU layer with
                    dropout is inserted

    Returns the trained model, the training history and the fitted label
    encoder.
    """
    y_encoder = preprocessing.OneHotEncoder()
    train_y_enc = y_encoder.fit_transform(train_y.to_numpy().reshape(-1, 1))

    n_inputs = train_x.shape[1]
    model = Sequential()
    if not single_layer:
        model.add(Dense(config.HIDDEN, activation='relu',
                        input_dim=n_inputs))
        model.add(Dropout(config.DROPOUT))
        model.add(Dense(config.N_CLASSES))
    else:
        model.add(Dense(config.N_CLASSES, input_dim=n_inputs))
    model.add(Activation('softmax'))

    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC])
    history = model.fit(train_x, train_y_enc,
                        epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight,
                        verbose=1)
    return model, history, y_encoder


def get_bilstm(config, train_x, train_y, sample_weight):
    """Train a bidirectional LSTM that labels every token of a sequence.

    The network consists of a bidirectional CuDNNLSTM returning the full
    sequence, dropout, and a per-timestep softmax layer. Sample weights
    are applied per timestep ('temporal' mode).

    Returns the trained model and the training history.
    """
    model = Sequential()
    recurrent = Bidirectional(
        CuDNNLSTM(config.LSTM_UNITS, return_sequences=True),
        input_shape=train_x.shape[1:])
    model.add(recurrent)
    model.add(Dropout(config.DROPOUT))
    token_classifier = TimeDistributed(
        Dense(config.N_CLASSES, activation='softmax'))
    model.add(token_classifier)

    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC], sample_weight_mode='temporal')
    history = model.fit(train_x, train_y,
                        epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight,
                        verbose=1)
    return model, history


def get_cnn(config, train_x, train_y, sample_weight):
    """Train a stacked 1D-convolution classifier on sequence-level input.

    Arguments:
    config -- the configuration object (VOCAB_SIZE, DROPOUT, N_CLASSES,
              loss/optimizer/metric, epochs, batch size)
    train_x -- 2D integer input array; its second dimension is the input
               length
    train_y -- pandas Series of labels; one-hot encoded internally
    sample_weight -- optional per-sample weights passed to fit()

    Returns the trained model, the training history and the fitted label
    encoder.
    """
    embedding_layer = Embedding(config.VOCAB_SIZE,
                                768,
                                input_length=train_x.shape[1],
                                trainable=False)
    y_encoder = preprocessing.OneHotEncoder()
    train_y_enc = y_encoder.fit_transform(train_y.to_numpy().reshape(-1, 1))

    sequence_input = Input(shape=(train_x.shape[1],), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Dropout(config.DROPOUT)(embedded_sequences)
    # The first convolution consumes the dropout output. It used to be
    # applied to embedded_sequences directly, which silently discarded the
    # dropout layer above.
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dropout(config.DROPOUT)(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(config.N_CLASSES, activation='softmax')(x)
    model = Model(sequence_input, preds)
    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC])

    history = model.fit(train_x, train_y_enc, epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight, verbose=2)

    return model, history, y_encoder


def get_kimcnn(config, train_x, train_y, sample_weight):
    '''Train a CNN with parallel convolutions of several filter sizes.

    Inspired by Alexander Rakhlin's keras implementation of Yoon Kim's paper
    "Convolutional Neural Networks for Sentence Classification"
    Repository: https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras
    Kim's paper: http://arxiv.org/pdf/1408.5882v2.pdf

    Arguments:
    config -- the configuration object (VOCAB_SIZE, DROPOUT_PROB,
              FILTER_SIZES, NUM_FILTERS, HIDDEN_DIMS, N_CLASSES,
              loss/optimizer/metric, epochs, batch size)
    train_x -- 2D integer input array; its second dimension is the input
               length
    train_y -- pandas Series of labels; one-hot encoded internally
    sample_weight -- optional per-sample weights passed to fit()

    Returns the trained model, the training history and the fitted label
    encoder.
    '''
    embedding_layer = Embedding(config.VOCAB_SIZE,
                                768,
                                input_length=train_x.shape[1],
                                trainable=False)
    y_encoder = preprocessing.OneHotEncoder()
    train_y_enc = y_encoder.fit_transform(train_y.to_numpy().reshape(-1, 1))
    sequence_input = Input(shape=(train_x.shape[1],), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    z = Dropout(config.DROPOUT_PROB[0])(embedded_sequences)

    # One convolution + pooling branch per filter size.
    conv_blocks = []
    for sz in config.FILTER_SIZES:
        conv = Convolution1D(filters=config.NUM_FILTERS,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    # Concatenate needs at least two inputs, so a single branch is used
    # as it is.
    if len(conv_blocks) > 1:
        z = Concatenate()(conv_blocks)
    else:
        z = conv_blocks[0]

    z = Dropout(config.DROPOUT_PROB[1])(z)
    z = Dense(config.HIDDEN_DIMS, activation="relu")(z)
    model_output = Dense(config.N_CLASSES, activation='softmax')(z)

    model = Model(sequence_input, model_output)

    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC])

    history = model.fit(train_x, train_y_enc, epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight, verbose=2)

    return model, history, y_encoder



###############
# Predictions #
###############


def get_bio_predictions(model, x, x_raw, n_classes):
    """Predict one label ("O", "I" or "B") per token.

    Arguments:
    model -- an object whose predict(x) returns n_classes scores for every
             token position of x
    x -- the encoded input; its first two dimensions are sentences and
         token positions
    x_raw -- sentence-level dataframe indexed from 1 with an 'n_toks'
             column
    n_classes -- number of scores per token

    Returns a flat list with the labels of the first n_toks positions of
    every sentence, in the row order of x_raw. Class 0 is "O", class 1 is
    "I" and any other class is "B".
    """
    scores = model.predict(x)
    class_ids = (scores.reshape(-1, n_classes)
                 .argmax(axis=1)
                 .reshape(x.shape[:2]))
    idx2label = {0: "O", 1: "I"}
    labels = []
    for row in x_raw.itertuples():
        sent_idx = row.Index - 1
        labels.extend(idx2label.get(class_ids[sent_idx][tok_idx], "B")
                      for tok_idx in range(row.n_toks))
    return labels


def si_predictions_to_spans(label_df):
    """Merge token-level label predictions into character spans.

    Arguments:
    label_df -- dataframe with one row per token and the columns
                'document_id', 'token_start', 'token_end' and 'label_pred'
                ("O", "I" or "B")

    Returns a list of (document_id, span_start, span_end) tuples in the
    order in which the spans end. An empty dataframe yields an empty list.
    """
    spans = []
    prev_label = 'O'
    prev_span_start = '-1'
    prev_span_end = '-1'
    prev_article = ''

    for row in label_df.itertuples():
        article = row.document_id
        span_start = row.token_start
        span_end = row.token_end
        label = row.label_pred

        span, prev_span_start = update_predicted_span(article, label,
                                                      span_start, span_end,
                                                      prev_article, prev_label,
                                                      prev_span_start,
                                                      prev_span_end)
        if span is not None:
            spans.append(span)

        prev_article = article
        prev_label = label
        prev_span_end = span_end

    # Make sure we get the last prediction: a span that is still open after
    # the final token is closed here. (Feeding the last label into
    # update_predicted_span again would not close a span ending in 'I',
    # because I-I counts as a continuation.)
    if prev_label != 'O':
        spans.append((prev_article, prev_span_start, prev_span_end))
    return spans


# Helper method for si_predictions_to_spans
def update_predicted_span(article, label, span_start, span_end, prev_article,
                          prev_label, prev_span_start, prev_span_end):
    """Process one token while merging labels into spans.

    Arguments:
    article, label, span_start, span_end -- document ID, predicted label
        and character offsets of the current token (span_end is unused)
    prev_article, prev_label, prev_span_end -- the same information for the
        previous token
    prev_span_start -- the start offset of the span tracked so far

    Returns a tuple (span, cur_span_start): span is the
    (document_id, start, end) tuple of a span that ended with the previous
    token, or None; cur_span_start is the start offset to keep tracking.
    """
    span = None
    cur_span_start = prev_span_start
    # Ending a span: I-O, B-O, I-B, B-B, new article
    if prev_label != 'O' and (label != 'I' or prev_article != article):
        span = (prev_article, prev_span_start, prev_span_end)

    # Starting a new span: O-B, O-I, I-B, B-B, new article
    if label == 'B' or (label == 'I' and prev_label == 'O') \
            or prev_article != article:
        # Update the start of the current label span
        cur_span_start = span_start
    return span, cur_span_start


def print_spans(spans, file_prefix, file_stem, file_suffix):
    """Write the spans to '<prefix>spans_<stem>_<suffix>.txt'.

    Each span is written on its own line as three tab-separated fields
    (its first three entries).
    """
    outfile = file_prefix + 'spans_' + file_stem + '_' + file_suffix + '.txt'
    with open(outfile, mode='w') as f:
        for span in spans:
            fields = (str(span[0]), str(span[1]), str(span[2]))
            f.write('\t'.join(fields) + '\n')


def predict_si(config, model, history, dev_df, dev_raw, dev_x, comments,
               file_prefix, file_stem, file_suffix, features,
               predict_spans=True):
    """Predict token labels, write a log file and (optionally) the spans.

    Arguments:
    config -- the configuration object
    model, history -- the trained model and its training history
    dev_df -- token-level dataframe the predictions are appended to
    dev_raw -- sentence-level dataframe matching dev_x
    dev_x -- the encoded input
    comments -- the comment lines of the input file (written to the log)
    file_prefix, file_stem, file_suffix -- parts of the output file names
    features -- names of the additional features (written to the log)
    predict_spans -- whether the span file is written as well

    Returns dev_df with an additional 'label_pred' column.
    """
    y_hat = get_bio_predictions(model, dev_x, dev_raw, config.N_CLASSES)
    result_df = pd.concat([dev_df, pd.DataFrame(y_hat, columns=['label_pred'])],
                          axis=1, sort=False)

    logfile = file_prefix + 'log_' + file_stem + '_' + file_suffix + '.txt'

    with open(logfile, mode='w') as f:
        f.write('DATA PREPROCESSING\n\n')
        for comment in comments:
            # One log line per comma-separated field of the comment.
            # (Previously the entire comment was written once per field.)
            for field in comment.replace('#', '').split(','):
                f.write(field.strip() + '\n')
        f.write('Additional features:' + str(features) + '\n')
        f.write('\n\nCONFIG\n\n')
        f.write(config.pretty_str())
        f.write('\n\nMODEL HISTORY\n\n')
        f.write('Loss ' + config.LOSS + '\n')
        f.write(str(history.history['loss']) + '\n')
        f.write(config.METRIC + '\n')
        f.write(str(history.history[config.METRIC]) + '\n')
        f.write('\n\nMODEL SUMMARY\n\n')
        model.summary(print_fn=lambda x: f.write(x + '\n'))

    if predict_spans:
        spans = si_predictions_to_spans(result_df)
        print_spans(spans, file_prefix, file_stem, file_suffix)

    return result_df


def predict_tc(config, model, history, dev_df, dev_x, comments, file_prefix,
               file_stem, file_suffix, features, y_encoder=None,
               print_log=True):
    """Predict one label per span and write the predictions to file.

    Arguments:
    config -- the configuration object
    model -- the trained model
    history -- the training history, or None if the model has none
    dev_df -- span-level dataframe the predictions are appended to
    dev_x -- the encoded input
    comments -- the comment lines of the input file (written to the log)
    file_prefix, file_stem, file_suffix -- parts of the output file names
    features -- names of the additional features (written to the log)
    y_encoder -- the label encoder used for training, or None if the model
                 predicts the labels directly
    print_log -- whether the log file is written

    If y_encoder is given and config.PRED_ALTS is true, an additional file
    with the runner-up label and both scores is written.

    Returns the dataframe written by print_tc for the main predictions.
    """
    if print_log:
        logfile = file_prefix + 'log_' + file_stem + '_' + file_suffix + '.txt'
        with open(logfile, mode='w') as f:
            f.write('DATA PREPROCESSING\n\n')
            for comment in comments:
                # One log line per comma-separated field of the comment.
                # (Previously the entire comment was written once per
                # field.)
                for field in comment.replace('#', '').split(','):
                    f.write(field.strip() + '\n')
            f.write('Additional features:' + str(features) + '\n')
            f.write('\n\nCONFIG\n\n')
            f.write(config.pretty_str())
            if history:
                f.write('\n\nMODEL HISTORY\n\n')
                f.write('Loss ' + config.LOSS + '\n')
                f.write(str(history.history['loss']) + '\n')
                f.write(config.METRIC + '\n')
                f.write(str(history.history[config.METRIC]) + '\n')
                f.write('\n\nMODEL SUMMARY\n\n')
                model.summary(print_fn=lambda x: f.write(x + '\n'))

    y_hat = model.predict(dev_x)
    if y_encoder:
        if config.PRED_ALTS:
            # Get the runner-up predictions
            y_hat_alt = []
            y_score = []
            y_score_alt = []
            # The width of the one-hot vector follows the model output
            # instead of a fixed number of classes.
            n_labels = y_hat.shape[1]
            for span_idx in range(y_hat.shape[0]):
                ranking = np.argsort(y_hat[span_idx])
                top_idx = ranking[-1]
                alt_idx = ranking[-2]
                # One-hot row for the runner-up, so the encoder can decode it
                alt = np.eye(N=1, M=n_labels, k=alt_idx)
                y_hat_alt.append(y_encoder.inverse_transform(alt)[0][0])
                y_score.append(y_hat[span_idx][top_idx])
                y_score_alt.append(y_hat[span_idx][alt_idx])

        # Decode the predictions
        y_hat = y_encoder.inverse_transform(y_hat)

        if config.PRED_ALTS:
            out_df = pd.concat([dev_df,
                                pd.DataFrame(data={'label_alt': y_hat_alt,
                                                   'pred_score': y_score,
                                                   'alt_score': y_score_alt})],
                               axis=1, sort=False)
            print_tc(y_hat, out_df, file_prefix, 'alt_' + file_stem,
                     file_suffix, cols=['document_id', 'label_pred',
                                        'span_start', 'span_end', 'pred_score',
                                        'label_alt', 'alt_score'],
                     print_header=True)
    return print_tc(y_hat, dev_df, file_prefix, file_stem, file_suffix)


def print_tc(y_hat, dev_df, file_prefix, file_stem, file_suffix,
             cols=['document_id', 'label_pred', 'span_start', 'span_end'],
             print_header=False):
    """Write the predicted labels to '<prefix>labels_<stem>_<suffix>.txt'.

    The predictions are appended to dev_df as column 'label_pred'; the
    columns listed in `cols` are then written as tab-separated values
    without the index, and with a header line only if print_header is
    true.

    Returns the dataframe that was written.
    """
    predictions = pd.DataFrame(y_hat, columns=['label_pred'])
    combined = pd.concat([dev_df, predictions], axis=1, sort=False)
    result_df = combined[cols]
    outfile = file_prefix + 'labels_' + file_stem + '_' + file_suffix + '.txt'
    result_df.to_csv(outfile, sep='\t', index=False, header=print_header)
    return result_df



###########################
# Putting it all together #
###########################


def run(config, file_stem, file_suffix, verbose=True, predict_spans=True,
        data=None, word2embedding=None, file_prefix=''):
    """Encode the data (if necessary), train a model and write predictions.

    Arguments:
    config -- the configuration object
    file_stem, file_suffix, file_prefix -- parts of the output file names
    verbose -- whether progress messages are printed
    predict_spans -- task 1 only: whether the span file is written
    data -- an already encoded Data object; if missing, the data is loaded
            from config.DATA_PATH (config.LOAD_DATA) or encoded from
            scratch
    word2embedding -- optional dict from tokens to embeddings, passed on
                      to get_data

    Returns a tuple (data, labels_dev, labels_test); labels_test is None
    for task 1 or if config.TEST_URL is not set.

    Raises ValueError if config.MODEL is not a known model name (task 2).
    """
    if verbose:
        print('Running with config:')
        print(config.pretty_str())
    if not data:
        if config.LOAD_DATA:
            print('Loading data from files')
            data = Data(path=config.DATA_PATH)
        else:
            if verbose:
                print('Encoding the data')
            data = get_data(config, word2embedding)
            if config.SAVE_DATA:
                # Save to the location the data is later loaded from.
                data.save(config.DATA_PATH)

    if verbose:
        print('Additional features:', data.features)
        print('Building the model')
    history = None
    y_encoder = None
    if config.TOKEN_LVL:
        model, history = get_bilstm(config, data.train_x, data.train_y,
                                    data.sample_weight)
    elif config.MODEL == 'SVM':
        model = get_svm(data.train_x, data.train_y)
    elif config.MODEL.startswith('FFNN'):
        model, history, y_encoder = get_ffnn(
            config, data.train_x, data.train_y, data.sample_weight,
            single_layer=config.MODEL == 'FFNN-single')
    elif config.MODEL == 'LSTM':
        model, history = get_bilstm(config, data.train_x, data.train_y,
                                    data.sample_weight)
    elif config.MODEL == 'CNN':
        model, history, y_encoder = get_cnn(config, data.train_x,
                                            data.train_y,
                                            data.sample_weight)
    elif config.MODEL == 'KIMCNN':
        model, history, y_encoder = get_kimcnn(config, data.train_x,
                                               data.train_y,
                                               data.sample_weight)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # `model` variable further down.
        raise ValueError('Unknown model type: ' + str(config.MODEL))

    if verbose:
        print('Predicting the test data labels/spans')
    labels_test = None
    if config.TOKEN_LVL:
        labels_dev = predict_si(config, model, history, data.dev_df,
                                data.dev_raw, data.dev_x, data.comments,
                                file_prefix, file_stem, file_suffix,
                                data.features, predict_spans)
    else:
        labels_dev = predict_tc(config, model, history, data.dev_df, data.dev_x,
                                data.comments, file_prefix, 'dev_' + file_stem,
                                file_suffix, data.features, y_encoder)
        if config.TEST_URL:
            # The log was already written for the dev predictions.
            labels_test = predict_tc(config, model, history, data.test_df,
                                     data.test_x, data.comments, file_prefix,
                                     'test_' + file_stem, file_suffix,
                                     data.features, y_encoder, print_log=False)
    if verbose:
        print('Done!\n\n')

    return data, labels_dev, labels_test


# grid_search.py

In [0]:
# from model import run, si_predictions_to_spans, print_spans
from collections import Counter
import time


class Config:
    """Settings for one experiment run (data sources, model, training).

    The defaults depend on TOKEN_LVL: True selects the span identification
    setup (task 1), False the technique classification setup (task 2).
    Any attribute can be overridden through the dict passed to the
    constructor.
    """

    def __init__(self, args=None):
        """Creates a default configuration.

        Keyword arguments:
        args -- a dict(str -> ?) containing values diverging from the default
        """
        overrides = args or {}

        # Encoding the data:
        # TOKEN_LVL is read from the overrides up front because it decides
        # which set of task-specific defaults is applied below.
        # True if task 1, False if task 2.
        self.TOKEN_LVL = overrides.get('TOKEN_LVL', True)

        self.ONLINE_SOURCES = True  # Input is given via URLs, not local files.

        self.UNCASED = True  # If true, words are turned into lower case.
        self.FEATURES = None  # If None, the features are determined from the
                              # input file.
        self.EXCLUDE_FEATURES = []
        self.SAVE_DATA = False  # If true, the following two values can be used
                                # for re-using the data next time.
        # In case the training & dev data were saved and can be reused:
        self.DATA_PATH = 'gdrive/My Drive/colab_projects/data/data/'
        self.LOAD_DATA = False

        # Building the model:
        self.MODEL = 'LSTM'
        self.BATCH_SIZE = 128
        self.LSTM_UNITS = 512
        self.DROPOUT = 0.25
        self.OPTIMIZER = 'adam'
        self.METRIC = 'categorical_accuracy'
        self.LOSS = 'categorical_crossentropy'

        # Making predictions:
        self.MAJORITY_VOTING = True

        # Task-specific options
        if self.TOKEN_LVL:
            # Task 1: Span identification
            # For using train+dev and test, see the end of this file.
            self.N_CLASSES = 2
            self.MAX_SEQ_LEN = 35
            self.EMBED_DIM = 300
            self.EPOCHS = 10
            self.CLASS_WEIGHTS = {'O': 1.0, 'I': 6.5, 'B': 6.5}
            self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-train.tsv?token=AD7GEDMEHQSUS34AOSIHGF26Q4WYK'
            self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-dev.tsv?token=AD7GEDI3J6KMIKA6XXTKT6S6Q4WYI'
            self.TEST_URL = ''
            self.EMBEDDING_PATH = 'gdrive/My Drive/colab_projects/data/glove.42B.300d.zip'
            self.USE_BERT = False
            self.TRAIN_BERT = 'gdrive/My Drive/colab_projects/data/train_bert-base-uncased.tsv'
            self.DEV_BERT = 'gdrive/My Drive/colab_projects/data/dev_bert-base-uncased.tsv'
            self.TEST_BERT = ''
        else:
            # Task 2: Technique classification
            # Options: 'SVM', 'FFNN', 'FFNN-single', 'LSTM', 'CNN', 'KIMCNN'
            self.MODEL = 'FFNN'
            self.HIDDEN = 128
            self.EPOCHS = 15
            self.CLASS_WEIGHTS = None
            self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-train.tsv?token=AD7GEDOONNZLYERAUKC4E5K6OFDNY'
            self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-dev.tsv?token=AD7GEDLMCCTZVHH5IHN7H4K6OFDN4'
            self.TEST_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-test.tsv?token=AD7GEDMR4D6KMDMTUVI534C6OFDNY'
            self.USE_BERT = True  # Currently, we don't have an alternative to this.
            self.EMBED_DIM = 768
            self.N_BERT_LAYERS = 1
            self.TRAIN_BERT = 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased.tsv'
            self.DEV_BERT = 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased.tsv'
            self.TEST_BERT = 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-uncased.tsv'
            self.N_CLASSES = 14
            self.MAX_SEQ_LEN = -1  # Value is irrelevant (fixed-size input)
            self.VOCAB_SIZE = 30000
            self.FILTER_SIZES = (3, 5, 7)
            self.NUM_FILTERS = 10
            self.DROPOUT_PROB = (0.5, 0.5)
            self.HIDDEN_DIMS = 50
            # Runners-up to the softmax winner:
            self.PRED_ALTS = False

        # NOTE(review): FLATTEN is derived from the *default* MODEL; a MODEL
        # passed via args is applied afterwards and does not update it.
        # Confirm whether that is intended before changing the order.
        self.FLATTEN = (not self.TOKEN_LVL) or (self.MODEL != 'LSTM')

        # Caller-supplied values win over every default set above.
        for key, value in overrides.items():
            setattr(self, key, value)

    def pretty_str(self):
        """Returns the main settings as one 'name: value' line per setting.

        The task 2 specific settings (model, BERT layers, predict
        alternatives) are only included when TOKEN_LVL is false.
        """
        # All values are read from this instance (not from a module-level
        # `config`), so the summary always describes the object it is
        # called on.
        entries = [
            ('max seq len', self.MAX_SEQ_LEN),
            ('embedding depth', self.EMBED_DIM),
            ('BERT embeddings', self.USE_BERT),
            ('TRAIN_BERT', self.TRAIN_BERT),
            ('DEV_BERT', self.DEV_BERT),
            ('number of labels', self.N_CLASSES),
            ('batch size', self.BATCH_SIZE),
            ('epochs', self.EPOCHS),
            ('class weights', self.CLASS_WEIGHTS),
            ('LSTM units', self.LSTM_UNITS),
            ('dropout rate', self.DROPOUT),
            ('optimizer', self.OPTIMIZER),
            ('metric', self.METRIC),
            ('loss', self.LOSS),
        ]
        if not self.TOKEN_LVL:
            entries += [
                ('model', self.MODEL),
                ('BERT layers', self.N_BERT_LAYERS),
                ('predict alternatives', self.PRED_ALTS),
            ]
        # str() keeps this working when an override is not a plain string.
        return ''.join(name + ': ' + str(value) + '\n'
                       for name, value in entries)


def get_majority_vote(votes, print_near_ties=False):
    """Returns the most frequent entry in votes.

    Keyword arguments:
    votes -- the labels to choose from (must be hashable and non-empty,
             otherwise an IndexError is raised)
    print_near_ties -- if true, the full (label, count) ranking is printed
                       whenever the two most frequent labels are fewer than
                       two votes apart
    """
    # Labels ordered by descending count; labels with equal counts keep the
    # order in which they first appear in votes.
    ranking = Counter(votes).most_common()
    if print_near_ties and len(ranking) > 1 \
            and ranking[0][1] - ranking[1][1] < 2:
        print(ranking)
    # Task 1: For our data, preferring specific labels in tie situations
    # doesn't make a difference.
    # Task 2: Ties are extremely rare.
    return ranking[0][0]


def print_majority_votes(config, predictions, label_cols, file_prefix,
                         file_stem, df, print_near_ties=False):
    """Determines the majority label per row and hands it to the printers.

    Keyword arguments:
    config -- the configuration; only TOKEN_LVL is read here
    predictions -- a dataframe with one column per entry in label_cols; a
                   'label_pred' column holding the winning labels is added
                   to it in place
    label_cols -- the names of the columns that contain the votes
    file_prefix, file_stem -- forwarded unchanged to the printing helpers
    df -- forwarded to print_tc (only used if TOKEN_LVL is false)
    print_near_ties -- forwarded to get_majority_vote
    """
    winners = [
        get_majority_vote([getattr(row, col) for col in label_cols],
                          print_near_ties)
        for row in predictions.itertuples()
    ]
    predictions['label_pred'] = winners

    if not config.TOKEN_LVL:
        # Task 2: the labels are passed on directly.
        print_tc(winners, df, file_prefix, file_stem, 'majority')
        return

    # Task 1: the token-level labels are converted into spans first.
    spans = si_predictions_to_spans(predictions)
    print_spans(spans, file_prefix, file_stem, 'majority')


def run_config(config, file_prefix, data=None, repetitions=5, verbose=True):
    """Runs a configuration several times and optionally majority-votes.

    Keyword arguments:
    config -- the configuration; MAJORITY_VOTING is read here, everything
              else is passed on to run()
    file_prefix -- forwarded to run() and print_majority_votes()
    data -- the data object of a previous call (or None); it is passed to
            the first run() call and replaced by whatever run() returns
    repetitions -- the number of times run() is called
    verbose -- if true, progress messages are printed

    Returns the data object returned by the last run() call (or the data
    argument if repetitions is 0) and the timestamp string that is used as
    the file stem of this call.
    """
    now = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    predictions_dev = None
    predictions_test = None
    label_cols = []
    for i in range(repetitions):
        if verbose:
            print("Iteration " + str(i + 1) + " of " + str(repetitions))
        data, labels_dev, labels_test = run(config, data=data, verbose=verbose,
                                            file_prefix=file_prefix,
                                            file_stem=now, file_suffix=str(i))
        if not config.MAJORITY_VOTING:
            continue

        # Collect the predictions of each run in one column per run.
        if predictions_dev is None:
            predictions_dev = labels_dev.rename(
                columns={'label_pred': 'label_0'})
            if labels_test is not None:
                predictions_test = labels_test.rename(
                    columns={'label_pred': 'label_0'})
        else:
            predictions_dev.insert(loc=len(predictions_dev.columns),
                                   column='label_' + str(i),
                                   value=labels_dev.label_pred)
            if labels_test is not None:
                predictions_test.insert(loc=len(predictions_test.columns),
                                        column='label_' + str(i),
                                        value=labels_test.label_pred)
        label_cols.append('label_' + str(i))

    # The checks use the collected frames rather than the loop variables so
    # that nothing is dereferenced if the loop never ran (repetitions == 0).
    if config.MAJORITY_VOTING and predictions_dev is not None:
        if verbose:
            print('Majority voting (dev)')
        print_majority_votes(config, predictions_dev, label_cols,
                             file_prefix, 'dev_' + now, data.dev_df, False)
        if predictions_test is not None:
            if verbose:
                print('Majority voting (test)')
            print_majority_votes(config, predictions_test, label_cols,
                                 file_prefix, 'test_' + now, data.test_df,
                                 False)

    # Return data in case the next config only changes model features
    return data, now


# Settings for the run_config call at the end of this cell.
file_prefix = '/content/gdrive/My Drive/colab_projects/semeval-predictions/'
data = None  # Nothing to re-use yet; run_config returns the data it used.
repetitions = 5

# Short-hands for the features:
f_none = []
f_all = None  # Determined from the input file header
f_rep = ['repetitions']
f_len = ['length']
f_senti = ['highest_pos', 'highest_neg']
f_arglex = [
    'authority', 'causation', 'contrast', 'emphasis', 'generalization',
    'inconsistency', 'necessity', 'possibility', 'priority', 'structure',
    'wants',
]
f_question = ['question']
f_emotion = ['fear', 'sadness', 'joy', 'anger', 'disgust']
# Named entity classes (small and large selection):
f_ne_2 = ['NORP', 'GPE']
f_ne_6 = ['ORG', 'NORP', 'GPE', 'PERSON', 'CARDINAL', 'DATE']
f_america = ['america']
f_america_simple = ['america_simple']
f_reductio = ['reductio']
# Part-of-speech tags:
f_pos = [
    'ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
    'PRON', 'PROPN', 'PUNCT', 'SYM', 'VERB', 'X',
]

### You can change config values by passing a dictionary to the constructor.

### Hyperparameter tuning (example):
# for epochs in [5, 15, 20, 25]:
#     config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 'EPOCHS': epochs})
#     data, _ = run_config(config, file_prefix, data)

##################################
# Subtask 1: Span identification #
##################################

## GloVe-100, no features
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': []})
## GloVe-300, no features
# config = Config({'FEATURES': []})
## BERT (base-cased), no features:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 'UNCASED': False,
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/train_bert-base-cased.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/dev_bert-base-cased.tsv',
#                  'FEATURES': []})
## BERT (base-uncased), no features:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 'FEATURES': []})

## GloVe-100, SentiWordNet:
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': ['positive', 'negative']})
## GloVe-100, Arguing Lexicon:
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': ['arglex_any']})
## GloVe-100, POS tags:
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': f_pos})
## GloVe-100, SentiWordNet + Arguing Lexicon:
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': ['positive', 'negative', 'arglex_any']})
## GloVe-100, SentiWordNet + Arguing Lexicon + POS:
# config = Config({'EMBEDDING_PATH': 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt',
#                  'EMBED_DIM': 100,
#                  'FEATURES': ['positive', 'negative', 'arglex_any'] + f_pos})

## BERT (base-uncased), SentiWordNet:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': ['positive', 'negative']})
## BERT (base-uncased), Arguing Lexicon:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': ['arglex_any']})
## BERT (base-uncased), POS tags:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': f_pos})
## BERT (base-uncased), SentiWordNet + Arguing Lexicon:
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': ['positive', 'negative', 'arglex_any']})
## BERT (base-uncased), SentiWordNet + Arguing Lexicon + POS:
## (FINAL MODEL before post-processing)
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': ['positive', 'negative', 'arglex_any'] + f_pos})
### For predictions on the final test set (task 1):
# config = Config({'USE_BERT': True, 'EMBED_DIM': 768, 
#                  'FEATURES': ['positive', 'negative', 'arglex_any'] + f_pos,
#                  'TRAIN_URL': 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-train%2Bdev.tsv?token=AD7GEDJ7GSTS3RSP5ZSXLZ26LP4BS',
#                  'DEV_URL': 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-test.tsv?token=AD7GEDM7A3GFIAEZHHESFO26LP4BQ',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/test_bert-base-uncased.tsv'
#                  })

#######################################
# Subtask 2: Technique classification #
#######################################

## bert-base-uncased embeddings of [CLS] & the first 10 tokens + CNN
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'CNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased_10.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased_10.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-uncased_10.tsv',
#                  'EMBED_DIM': 768 * 11})
## bert-base-uncased embeddings of [CLS] & the first 10 tokens + KimCNN
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'KIMCNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased_10.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased_10.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-uncased_10.tsv',
#                  'EMBED_DIM': 768 * 11})
## bert-base-uncased embeddings of [CLS] & the first 10 tokens + MLP
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased_10.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased_10.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-uncased_10.tsv',
#                  'EMBED_DIM': 768 * 11})

## bert-base-uncased embeddings of [CLS] + rep & bert-base-uncased & pre-softmax + MLP
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN',
#                  'TRAIN_BERT': '/content/gdrive/My Drive/colab_projects/data/full_bert_train.tsv',
#                  'DEV_BERT': '/content/gdrive/My Drive/colab_projects/data/full_bert_dev.tsv',
#                  'EMBED_DIM': 768 + 14})
## rep + bert-base-cased + pre-softmax + MLP
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-cased_pre-softmax.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-cased_pre-softmax.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-cased_pre-softmax.tsv',
#                  'EMBED_DIM': 14, 'UNCASED': False})
## bert-base-uncased + pre-softmax + MLP
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased_no-rep.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased_no-rep.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_bert-base-uncased_no-rep.tsv',
#                  'EMBED_DIM': 14})

## rep + bert-base-uncased + linear classifier
## -> see bert_sequence_classification.ipynb
## rep + bert-base-uncased + pre-softmax + XGBoost
## TODO
## bert-base-uncased + pre-softmax + single-layer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN-single',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + SVM
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'SVM',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': [], 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})

## bert-base-uncased + pre-softmax + america + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_america, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + bag-of-words + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_reductio, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + emotion + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_emotion, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + sequence length + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_len, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + repetition count + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_rep, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + two named entity classes (NE-2) + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_ne_2, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + six named entity classes (NE-6) + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_ne_6, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + arguing lexicon (AL) + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_arglex, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + question mark feature (Q) + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_question, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + NE-6 + AL + Q + multilayer perceptron
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
#                  'FEATURES': f_ne_6 + f_arglex + f_question, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})
## bert-base-uncased + pre-softmax + NE-2 + AL + Q + multilayer perceptron
## (base model: FINAL MODEL before post-processing)
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 'PRED_ALTS': True,
#                  'FEATURES': f_ne_2 + f_arglex + f_question, 'MODEL': 'FFNN',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_20200308-221011_2.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_20200308-221011_2.tsv',
#                  'TEST_BERT': 'gdrive/My Drive/colab_projects/data/tc_test_20200308-221011_2.tsv',
#                  'EMBED_DIM': 14})

## Unused configurations (preliminary experiments)
## Large BERT embeddings ([CLS])
# 'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-large-uncased.tsv',
# 'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-large-uncased.tsv',
# 'EMBED_DIM': 1024,
## SentiWordNet feature, America-simple feature
# 'FEATURES': f_senti
# 'FEATURES': f_america_simple
## Other feature combinations
# 'FEATURES': f_rep + f_len + f_question + f_senti + f_arglex
# 'FEATURES': f_rep + f_len + f_question + f_senti + f_arglex + f_emotion
# 'FEATURES': f_rep + f_len + f_question + f_senti + f_arglex + f_emotion + f_ne_2 + f_america_simple
# 'FEATURES': f_question + f_ne_2
# 'FEATURES': f_senti + f_arglex
# 'FEATURES': f_ne_2 + f_emotion + f_question
# 'FEATURES': f_ne_2 + f_arglex + f_question + f_emotion
## Class-weighting
#  'CLASS_WEIGHTS': {'Loaded_Language': 1,
#                    'Name_Calling,Labeling': 1,
#                    'Repetition': 2,
#                    'Doubt': 2,
#                    'Exaggeration,Minimisation': 2,
#                    'Appeal_to_fear-prejudice': 2,
#                    'Flag-Waving': 1,
#                    'Causal_Oversimplification': 1,
#                    'Appeal_to_Authority': 1,
#                    'Slogans': 1,
#                    'Black-and-White_Fallacy': 1,
#                    'Whataboutism,Straw_Men,Red_Herring': 1,
#                    'Thought-terminating_Cliches': 1,
#                    'Bandwagon,Reductio_ad_hitlerum': 1},


# `config` is not assigned anywhere in the active code of this cell; one of
# the configurations above has to be uncommented before this line runs.
data, now = run_config(config, file_prefix, data=data,
                       repetitions=repetitions)

In [0]:
# Task 2 only: print the label distribution of every prediction file that
# belongs to the timestamp `now`.
if not config.TOKEN_LVL:
    # The test phase is only included if a test URL is configured.
    phases = ['dev'] + (['test'] if config.TEST_URL else [])

    # One suffix per repetition, plus 'majority' if majority voting is on.
    runs = list(range(repetitions))
    if config.MAJORITY_VOTING:
        runs += ['majority']

    for sfx in runs:
        for phase in phases:
            f = file_prefix + 'labels_' + phase + '_' + now + '_' \
                + str(sfx) + '.txt'
            # Only the second column of the tab-separated file is read.
            df = pd.read_csv(f, sep='\t', usecols=[1], names=['label'])
            counts = df['label'].value_counts()
            df = counts.rename_axis('labels').reset_index(name='counts')
            # Share of each label among all predictions in this file.
            df['%'] = df['counts'] / df['counts'].sum()
            print(f)
            print(df)
            print('\n')