# RATIO 2019 - Benchmarking Workshop

In [1]:
import datetime
import time

import gensim
import nltk
import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm

In [2]:
# import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/koerner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/koerner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [4]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [5]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

Time for [read cross]: 0:00:03.170606
Time for [read within]: 0:00:03.445253


In [6]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if "abortion" in title:
        row['tag'] = 'abortion'
    elif "gay marriage"  in title:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross]: 0:03:00.559536
Time for [tag within]: 0:02:57.208549


### Get an overview about each dataset

In [7]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [None]:
with Timer("overview cross"):
    get_overview(cross_traindev_df)

In [None]:
with Timer("overview within"):
    get_overview(within_traindev_df)

## Train model - Baseline

### train dev set - 70% 30%

In [8]:
def get_train_test_sets(df):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=1,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

### lemmatizing

In [9]:
def get_wordnet_pos(treebank_tag):
    """return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    '''lemmatize words (with POS information) and then stem'''
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def do_segmentation(text):
    '''do sentence segmentation, tokenization (with lemmatization&stemming)'''
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return lemma


def preprocess(text):
    '''concat lemmatized words together again'''
    lemma = do_segmentation(text)
    return ' '.join(lemma)

### Extracting n grams lemma for argument1 and argument2

In [10]:
def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train Doc2Vec model and vectorize argument1 and argument2

In [11]:
def make_d2v_docs(row):
    words1 = do_segmentation(row['argument1'])
    words2 = do_segmentation(row['argument2'])

    row['argument1_doc'] = TaggedDocument(words=words1,
                                          tags=[row['argument1_id']])
    row['argument2_doc'] = TaggedDocument(words=words2,
                                          tags=[row['argument2_id']])

    row['argument1_lemmas'] = ' '.join(words1)
    row['argument2_lemmas'] = ' '.join(words2)


class DatasetIter:
    def __init__(self, ds):
        self.ds = ds

    def _make_taggeddocs(self, row):
        yield row['argument1_doc']
        yield row['argument2_doc']

    def __iter__(self):
        for row in self.ds.values:
            for doc in self._make_taggeddocs(row):
                yield doc

In [13]:
def train_model(X_train, X_dev):
    # model_dbow = Doc2Vec(dm=0,
    #                      vector_size=300,
    #                      negative=5,
    #                      hs=0,
    #                      min_count=2,
    #                      sample=0,
    #                      workers=2)
    # with Timer("d2v dbow vocab"):
    #     model_dbow.build_vocab(tqdm(DatasetIter(X_train)))
    #
    # %%time
    # with Timer("d2v dbow train"):
    #     for epoch in range(30):
    #         model_dbow.train(utils.shuffle(tqdm(X_train)), total_examples=len(X_train), epochs=1)
    #         model_dbow.alpha -= 0.002
    #         model_dbow.min_alpha = model_dbow.alpha

    with Timer("doc2vec dbow"):
        # columns=['argument1_lemmas', 'argument2_lemmas']
        # pd.concat([X_train[columns], X_dev[columns]])
        model_dbow = Doc2Vec(DatasetIter(X_train),
                             dm=0,
                             vector_size=300,
                             negative=5,
                             hs=0,
                             min_count=2,
                             sample=0,
                             workers=2)
        
    return model_dbow

In [None]:
# unused
def vec_for_learning(model, df):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [14]:
def make_vectors(X_train, X_dev, model):
    def make_d2v_vecs(row):
        vec1 = model.infer_vector(row['argument1_doc'].words, steps=20)
        vec2 = model.infer_vector(row['argument2_doc'].words, steps=20)

        row['argument1_vec'] = vec1
        row['argument2_vec'] = vec2

    X_train.apply(make_d2v_vecs, axis=1)
    X_dev.apply(make_d2v_vecs, axis=1)

In [15]:
def make_vector_comparison_diff(X_train, X_dev):
    def ret_vec_diff(row):
        return row['argument1_vec'] - row['argument2_vec']

    X_train_diff = X_train.apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison_concat(X_train, X_dev):
    def ret_vec_concat(row):
        return np.concatenate((row['argument1_vec'], row['argument2_vec']))

    X_train_diff = X_train.apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison(X_train, X_dev):
    return make_vector_comparison_diff(X_train, X_dev)

### Train model and evaluate

In [16]:
def train_test_svm(X_train, y_train, X_test):
    scaler = StandardScaler(copy=True, with_mean=False)
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)

    svclassifier = SVC(kernel='linear')
    svclassifier.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    y_pred = svclassifier.predict(X_test)

    return y_pred


def report_training_results(y_test, y_pred):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2))  #
    print()

    print('Report:')
    print(classification_report(y_test, y_pred))
    f1_dic = {}

    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

### Cross topic - Training and evaluating model 

In [17]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

Time for [1 - test/train]: 0:00:00.031638


In [None]:
# 2. tokenize (make doc2vec docs + lemma string)
with Timer("2 - tokenize"):
    with Timer("2.1 - tokenize train"):
        X_train = X_train.apply(make_d2v_docs, axis=1)
    with Timer("2.2 - tokenize dev"):
        X_dev = X_dev.apply(make_d2v_docs, axis=1)

In [None]:
# 3. train doc2vec model
with Timer("3 - doc2vec model"):
    model_dbow = train_model(X_train, X_dev)

In [None]:
# 4. vectorize arguments
with Timer("4 - vectorize arguments"):
    make_vectors(X_train, X_dev, model_dbow)

# 5. combine two argument vectors into a single one
# - diff / concat / ...
with Timer("5 - vector comparison of arguments"):
    X_train_diff, X_dev_diff = make_vector_comparison(X_train, X_dev)

X_train_ = X_train_diff
X_dev_ = X_dev_diff

In [None]:
# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)

In [None]:
# old

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)