# RATIO 2019 - Benchmarking Workshop

In [1]:
import datetime
import logging
import time

import gensim
import nltk
import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
# from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn import utils

from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
# from tqdm.autonotebook import tqdm

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [None]:
# import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [2]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [3]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [4]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [5]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

Time for [read cross]: 0:00:00.930528
Time for [read within]: 0:00:01.014369


In [6]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if "abortion" in title:
        row['tag'] = 'abortion'
    elif "gay marriage"  in title:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross traindev]: 0:00:35.189107
Time for [tag cross test]: 0:00:19.834640
Time for [tag within traindev]: 0:00:37.309147
Time for [tag within test]: 0:00:18.014437


### Get an overview about each dataset

In [7]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [8]:
# with Timer("overview cross"):
#     get_overview(cross_traindev_df)

In [9]:
# with Timer("overview within"):
#     get_overview(within_traindev_df)

## Train model - Baseline

### train dev set - 70% 30%

In [10]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

### lemmatizing

In [11]:
def get_wordnet_pos(treebank_tag):
    """return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    '''lemmatize words (with POS information) and then stem'''
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def do_segmentation(text):
    '''do sentence segmentation, tokenization (with lemmatization&stemming)'''
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return lemma


def preprocess(text):
    '''concat lemmatized words together again'''
    lemma = do_segmentation(text)
    return ' '.join(lemma)

### Extracting n grams lemma for argument1 and argument2

In [12]:
def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train Doc2Vec model and vectorize argument1 and argument2

In [13]:
def make_d2v_docs(row):
    words1 = do_segmentation(row['argument1'])
    words2 = do_segmentation(row['argument2'])

    row['argument1_doc'] = TaggedDocument(words=words1,
                                          tags=[row['argument1_id']])
    row['argument2_doc'] = TaggedDocument(words=words2,
                                          tags=[row['argument2_id']])

    row['argument1_lemmas'] = ' '.join(words1)
    row['argument2_lemmas'] = ' '.join(words2)

    return row


class DatasetIter:
    def __init__(self, ds, shuffle=True):
        self.ds = ds
        self.shuffle = shuffle

    def _make_taggeddocs(self, row):
        yield row['argument1_doc']
        yield row['argument2_doc']

    def __iter__(self):
        if self.shuffle:
            self.ds = self.ds.sample(frac=1)

        for _, row in self.ds.iterrows():
            for doc in self._make_taggeddocs(row):
                yield doc


# https://github.com/RaRe-Technologies/gensim/blob/2024be9053094fbb2e765b9a06b9dc580f55c505/gensim/test/test_doc2vec.py#L501
class ConcatenatedDoc2Vec(object):
    """
    Concatenation of multiple models for reproducing the Paragraph Vectors paper.
    Models must have exactly-matching vocabulary and document IDs. (Models should
    be trained separately; this wrapper just returns concatenated results.)
    """

    def __init__(self, models):
        self.models = models
        if hasattr(models[0], 'docvecs'):
            self.docvecs = ConcatenatedDocvecs([model.docvecs for model in models])

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

    def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5):
        return np.concatenate([model.infer_vector(document, alpha, min_alpha, steps) for model in self.models])

    def train(self, *ignore_args, **ignore_kwargs):
        pass  # train subcomponents individually


class ConcatenatedDocvecs(object):
    def __init__(self, models):
        self.models = models

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

In [27]:
def train_model(X_train, X_dev, workers=2, epochs=30):
    with Timer("doc2vec dbow"):
        # columns=['argument1_lemmas', 'argument2_lemmas']
        # pd.concat([X_train[columns], X_dev[columns]])
        alpha = 0.025  # https://radimrehurek.com/gensim/models/base_any2vec.html#gensim.models.base_any2vec.BaseWordEmbeddingsModel
        # %%time
        model_dbow = Doc2Vec(DatasetIter(X_train, shuffle=True),
                             dm=0,
                             vector_size=300,
                             negative=5,
                             hs=0,
                             min_count=2,
                             sample=0,
                             workers=workers,
                             epochs=epochs,
                             alpha=alpha,
                             min_alpha=alpha - (epochs * 0.002))
        
    with Timer("doc2vec dmm"):
        model_dmm = Doc2Vec(DatasetIter(X_train, shuffle=True),
                            dm=1,
                            dm_mean=1,
                            vector_size=300,
                            window=10,
                            negative=5,
                            min_count=1,
                            workers=workers,
                            epochs=epochs,
                            alpha=0.065,
                            min_alpha=0.065 - (epochs * 0.002))
        
    return model_dbow, model_dmm

In [15]:
# unused
def vec_for_learning(model, df):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [16]:
def make_vectors(X_train, X_dev, model):
    def make_d2v_vecs(row):
        vec1 = model.infer_vector(row['argument1_doc'].words, steps=20)
        vec2 = model.infer_vector(row['argument2_doc'].words, steps=20)

        row['argument1_vec'] = vec1
        row['argument2_vec'] = vec2
        
        return row

    X_train = X_train.progress_apply(make_d2v_vecs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_vecs, axis=1)
    
    return X_train, X_dev

In [32]:
def make_vector_comparison_diff(X_train, X_dev):
    def ret_vec_diff(row):
        return row['argument1_vec'] - row['argument2_vec']

    X_train_diff = X_train.progress_apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.progress_apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison_concat(X_train, X_dev):
    def ret_vec_concat(row):
        return np.concatenate((row['argument1_vec'], row['argument2_vec']))

    X_train_concat = X_train.progress_apply(ret_vec_concat, axis=1)
    X_dev_concat = X_dev.progress_apply(ret_vec_concat, axis=1)

    return X_train_concat, X_dev_concat


def make_vector_comparison(X_train, X_dev, mode="diff"):
    if mode == "concat":
        X_train, X_dev = make_vector_comparison_concat(X_train, X_dev)
    else:
        X_train, X_dev = make_vector_comparison_diff(X_train, X_dev)

    # array of array to 2d array
    X_train = np.array(list(X_train.values))
    X_dev = np.array(list(X_dev.values))

    return X_train, X_dev

### Train model and evaluate

In [18]:
def train_test_svm(X_train, y_train, X_test):
    with Timer("StandardScaler fit"):
        scaler = StandardScaler(copy=True, with_mean=False)
        scaler.fit(X_train)

    with Timer("StandardScaler transform"):
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # ------------------

    with Timer("SVC (linear) fit"):
        # svclassifier = SVC(kernel='linear')
        svclassifier = LinearSVC()        
        svclassifier.fit(X_train, y_train)

    with Timer("SVC predict"):
        y_pred = svclassifier.predict(X_test)

    return y_pred


def train_test_logreg(X_train, y_train, X_test):
    with Timer("LogisticRegression fit"):
        logreg = LogisticRegression(n_jobs=1, C=1e5)
        logreg.fit(X_train, y_train)
    
    with Timer("LogisticRegression predict"):
        y_pred = logreg.predict(X_test)
    
    return y_pred


def report_training_results(y_test, y_pred):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2), '\n')  #

    print('Report:')
    print(classification_report(y_test, y_pred))
    f1_dic = {}

    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

### Cross topic - Training and evaluating model 

In [19]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

Time for [1 - test/train]: 0:00:00.013205


In [20]:
# 2. tokenize (make doc2vec docs + lemma string)
# tqdm.pandas()
with Timer("2 - tokenize"):
    X_train = X_train.progress_apply(make_d2v_docs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_docs, axis=1)

100%|██████████| 42733/42733 [14:09<00:00, 50.30it/s]
100%|██████████| 18315/18315 [06:15<00:00, 48.83it/s]

Time for [2 - tokenize]: 0:20:24.600272





In [21]:
# 3. train doc2vec model
with Timer("3 - doc2vec model"):
    model_dbow, model_dmm = train_model(X_train, X_dev, workers=3, epochs=15)
    
    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    
    model_concat = ConcatenatedDoc2Vec([model_dbow, model_dmm])  # todo: same vocab?

Time for [doc2vec dbow]: 0:03:40.805410
Time for [doc2vec dbow]: 0:04:19.307701
Time for [3 - doc2vec model]: 0:08:00.113433


In [22]:
# 4. vectorize arguments
with Timer("4 - vectorize arguments"):
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dbow)
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dmm)
    X_train, X_dev = make_vectors(X_train, X_dev, model_concat)

100%|██████████| 42733/42733 [09:16<00:00, 76.85it/s] 
100%|██████████| 18315/18315 [04:03<00:00, 75.28it/s] 

Time for [4 - vectorize arguments]: 0:13:19.368227





In [33]:
# 5. combine two argument vectors into a single one
# - diff / concat / ...
with Timer("5 - vector comparison of arguments"):
    X_train_diff, X_dev_diff = make_vector_comparison(X_train, X_dev, mode="concat")

X_train_ = X_train_diff
X_dev_ = X_dev_diff

100%|██████████| 42733/42733 [00:03<00:00, 13296.97it/s]
100%|██████████| 18315/18315 [00:01<00:00, 13273.71it/s]


Time for [5 - vector comparison of arguments]: 0:00:04.786261


In [34]:
# 6. train
with Timer("6 - SVM (train -> predict)"):
    y_pred_svm = train_test_svm(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    report_training_results(y_dev, y_pred_svm)

Time for [StandardScaler fit]: 0:00:00.756135
Time for [StandardScaler transform]: 0:00:00.287616


  y = column_or_1d(y, warn=True)


Time for [SVC (linear) fit]: 0:02:05.663633
Time for [SVC predict]: 0:00:00.093925
Time for [6 - SVM (train -> predict)]: 0:02:06.823927
Confusion Matrix:
[[3672 5264]
 [2982 6397]]

Accuracy:  0.55 

Report:
              precision    recall  f1-score   support

       False       0.55      0.41      0.47      8936
        True       0.55      0.68      0.61      9379

    accuracy                           0.55     18315
   macro avg       0.55      0.55      0.54     18315
weighted avg       0.55      0.55      0.54     18315

Time for [7 - report]: 0:00:00.045930




In [35]:
# 6. train
with Timer("6 - LogReg (train -> predict)"):
    y_pred_logreg = train_test_logreg(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    report_training_results(y_dev, y_pred_logreg)

  y = column_or_1d(y, warn=True)


Time for [LogisticRegression fit]: 0:00:32.509970
Time for [LogisticRegression predict]: 0:00:00.094768
Time for [6 - LogReg (train -> predict)]: 0:00:32.604980
Confusion Matrix:
[[5010 3926]
 [3875 5504]]

Accuracy:  0.57 

Report:
              precision    recall  f1-score   support

       False       0.56      0.56      0.56      8936
        True       0.58      0.59      0.59      9379

    accuracy                           0.57     18315
   macro avg       0.57      0.57      0.57     18315
weighted avg       0.57      0.57      0.57     18315

Time for [7 - report]: 0:00:00.049424


In [None]:
# old
return

asdf

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)