# RATIO 2019 - Benchmarking Workshop

In [1]:
import datetime
import logging
import time

import csv
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
# from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn import utils

from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
# from tqdm.autonotebook import tqdm

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [3]:
# import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/ekoerner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ekoerner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ekoerner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [5]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [6]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [7]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

# within has "is_same_side" as string (boolean after latest update)
# cross has "is_same_side" as boolean (auto cast?)

with Timer("read cross"):
    # cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    # cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'),
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL,
                                    encoding='utf-8',
                                    escapechar='\\',
                                    doublequote=False,
                                    index_col='id')
    cross_test_df = pd.read_csv(data_cross_path.format('test'),
                                quotechar='"',
                                quoting=csv.QUOTE_ALL,
                                encoding='utf-8',
                                escapechar='\\',
                                doublequote=False,
                                index_col='id')

with Timer("read within"):
    # within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    # within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)
    within_traindev_df = pd.read_csv(data_within_path.format('training'),
                                     quotechar='"',
                                     quoting=csv.QUOTE_ALL,
                                     encoding='utf-8',
                                     escapechar='\\',
                                     doublequote=False,
                                     index_col='id')
    within_test_df = pd.read_csv(data_within_path.format('test'),
                                 quotechar='"',
                                 quoting=csv.QUOTE_ALL,
                                 encoding='utf-8',
                                 escapechar='\\',
                                 doublequote=False,
                                 index_col='id')

Time for [read cross]: 0:00:00.948239
Time for [read within]: 0:00:01.072199


In [8]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if "abortion" in title:
        row['tag'] = 'abortion'
    elif "gay marriage"  in title:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross traindev]: 0:00:36.762735
Time for [tag cross test]: 0:00:20.746263
Time for [tag within traindev]: 0:00:39.041026
Time for [tag within test]: 0:00:20.569204


### Get an overview about each dataset

In [9]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [10]:
# with Timer("overview cross"):
#     get_overview(cross_traindev_df)

In [11]:
# with Timer("overview within"):
#     get_overview(within_traindev_df)

### Filter to only tagged input?

In [12]:
# within_traindev_df = within_traindev_df[(within_traindev_df['tag'] == 'gay marriage')]

In [13]:
# cross_traindev_df = cross_traindev_df[(cross_traindev_df['tag'] == 'gay marriage') | (cross_traindev_df['tag'] == 'abortion')]

## Train model - Baseline

### train dev set - 70% 30%

In [14]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

### lemmatizing

In [15]:
def get_wordnet_pos(treebank_tag):
    """return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    '''lemmatize words (with POS information) and then stem'''
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def do_segmentation(text):
    '''do sentence segmentation, tokenization (with lemmatization&stemming)'''
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return lemma


def preprocess(text):
    '''concat lemmatized words together again'''
    lemma = do_segmentation(text)
    return ' '.join(lemma)

### Extracting n grams lemma for argument1 and argument2

In [16]:
def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train Doc2Vec model and vectorize argument1 and argument2

In [17]:
def make_d2v_docs(row):
    words1 = do_segmentation(row['argument1'])
    words2 = do_segmentation(row['argument2'])

    row['argument1_doc'] = TaggedDocument(words=words1,
                                          tags=[row['argument1_id']])
    row['argument2_doc'] = TaggedDocument(words=words2,
                                          tags=[row['argument2_id']])

    row['argument1_lemmas'] = ' '.join(words1)
    row['argument2_lemmas'] = ' '.join(words2)

    return row


class DatasetIter:
    def __init__(self, ds, shuffle=True):
        self.ds = ds
        self.shuffle = shuffle

    def _make_taggeddocs(self, row):
        yield row['argument1_doc']
        yield row['argument2_doc']

    def __iter__(self):
        if self.shuffle:
            self.ds = self.ds.sample(frac=1)

        for _, row in self.ds.iterrows():
            for doc in self._make_taggeddocs(row):
                yield doc


# https://github.com/RaRe-Technologies/gensim/blob/2024be9053094fbb2e765b9a06b9dc580f55c505/gensim/test/test_doc2vec.py#L501
class ConcatenatedDoc2Vec(object):
    """
    Concatenation of multiple models for reproducing the Paragraph Vectors paper.
    Models must have exactly-matching vocabulary and document IDs. (Models should
    be trained separately; this wrapper just returns concatenated results.)
    """

    def __init__(self, models):
        self.models = models
        if hasattr(models[0], 'docvecs'):
            self.docvecs = ConcatenatedDocvecs([model.docvecs for model in models])

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

    def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5):
        return np.concatenate([model.infer_vector(document, alpha, min_alpha, steps) for model in self.models])

    def train(self, *ignore_args, **ignore_kwargs):
        pass  # train subcomponents individually


class ConcatenatedDocvecs(object):
    def __init__(self, models):
        self.models = models

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

In [18]:
def train_model(X_train, X_dev, workers=2, epochs=30):
    with Timer("doc2vec dbow"):
        # columns=['argument1_lemmas', 'argument2_lemmas']
        # pd.concat([X_train[columns], X_dev[columns]])
        alpha = 0.025  # https://radimrehurek.com/gensim/models/base_any2vec.html#gensim.models.base_any2vec.BaseWordEmbeddingsModel
        # %%time
        model_dbow = Doc2Vec(DatasetIter(X_train, shuffle=True),
                             dm=0,
                             vector_size=300,
                             negative=5,
                             hs=0,
                             min_count=2,
                             sample=0,
                             workers=workers,
                             epochs=epochs,
                             alpha=alpha,
                             min_alpha=alpha - (epochs * 0.002))
        
    with Timer("doc2vec dmm"):
        model_dmm = Doc2Vec(DatasetIter(X_train, shuffle=True),
                            dm=1,
                            dm_mean=1,
                            vector_size=300,
                            window=10,
                            negative=5,
                            min_count=1,
                            workers=workers,
                            epochs=epochs,
                            alpha=0.065,
                            min_alpha=0.065 - (epochs * 0.002))
        
    return model_dbow, model_dmm

In [20]:
def make_vectors(X_train, X_dev, model):
    def make_d2v_vecs(row):
        vec1 = model.infer_vector(row['argument1_doc'].words, steps=20)
        vec2 = model.infer_vector(row['argument2_doc'].words, steps=20)

        row['argument1_vec'] = vec1
        row['argument2_vec'] = vec2
        
        return row

    X_train = X_train.progress_apply(make_d2v_vecs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_vecs, axis=1)
    
    return X_train, X_dev

In [21]:
def make_vector_comparison_diff(X_train, X_dev):
    def ret_vec_diff(row):
        return row['argument1_vec'] - row['argument2_vec']

    X_train_diff = X_train.progress_apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.progress_apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison_concat(X_train, X_dev):
    def ret_vec_concat(row):
        return np.concatenate((row['argument1_vec'], row['argument2_vec']))

    X_train_concat = X_train.progress_apply(ret_vec_concat, axis=1)
    X_dev_concat = X_dev.progress_apply(ret_vec_concat, axis=1)

    return X_train_concat, X_dev_concat


def make_vector_comparison(X_train, X_dev, mode="diff"):
    if mode == "concat":
        X_train, X_dev = make_vector_comparison_concat(X_train, X_dev)
    else:
        X_train, X_dev = make_vector_comparison_diff(X_train, X_dev)

    # array of array to 2d array
    X_train = np.array(list(X_train.values))
    X_dev = np.array(list(X_dev.values))

    return X_train, X_dev

### Train model and evaluate

In [22]:
def train_test_svm(X_train, y_train, X_test):
    with Timer("StandardScaler fit"):
        scaler = StandardScaler(copy=True, with_mean=False)
        scaler.fit(X_train)

    with Timer("StandardScaler transform"):
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # ------------------

    with Timer("SVC (linear) fit"):
        # svclassifier = SVC(kernel='linear')
        svclassifier = LinearSVC()        
        svclassifier.fit(X_train, y_train)

    with Timer("SVC predict"):
        y_pred = svclassifier.predict(X_test)

    return y_pred


def train_test_logreg(X_train, y_train, X_test):
    with Timer("LogisticRegression fit"):
        logreg = LogisticRegression(n_jobs=1, C=1e5)
        logreg.fit(X_train, y_train)
    
    with Timer("LogisticRegression predict"):
        y_pred = logreg.predict(X_test)
    
    return y_pred


def train_test_sgd(X_train, y_train, X_test):
    with Timer("SGDClassifier fit"):
        sgdcla = SGDClassifier()
        sgdcla.fit(X_train, y_train)
    
    with Timer("SGDClassifier predict"):
        y_pred = sgdcla.predict(X_test)
    
    return y_pred


def heatconmat(y_test, y_pred):
    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                yticklabels=sorted(y_test.unique()))
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    if heatmap:
        heatconmat(y_test['is_same_side'], y_pred)
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2), '\n')  #

    print('Report{}:'.format("" if not name else " for [{}]".format(name)))
    print(classification_report(y_test, y_pred))

    f1_dic = {}
    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

### Cross topic - Training and evaluating model 

In [23]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

Time for [1 - test/train]: 0:00:00.018567


In [24]:
# 2. tokenize (make doc2vec docs + lemma string)
# tqdm.pandas()
with Timer("2 - tokenize"):
    X_train = X_train.progress_apply(make_d2v_docs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_docs, axis=1)

100%|██████████| 42733/42733 [15:12<00:00, 46.81it/s]
100%|██████████| 18315/18315 [06:17<00:00, 48.49it/s]

Time for [2 - tokenize]: 0:21:30.520007





In [29]:
with Timer("2a - pickle"):
    X_train.to_pickle("data/X_train.cross_td.p")
    X_dev.to_pickle("data/X_dev.cross_td.p")

Time for [2a - pickle]: 0:00:06.731561


In [26]:
with Timer("2b - unpickle"):
    X_train = pd.read_pickle("data/X_train.cross_td.p")
    X_dev = pd.read_pickle("data/X_dev.cross_td.p")

Time for [2b - unpickle]: 0:00:02.655215


In [27]:
# 3. train doc2vec model
with Timer("3 - doc2vec model"):
    model_dbow, model_dmm = train_model(X_train, X_dev, workers=3, epochs=30)

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    model_concat = ConcatenatedDoc2Vec([model_dbow, model_dmm])

2019-06-26 17:12:49,850 : INFO : collecting all words and their counts
2019-06-26 17:12:49,863 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-06-26 17:12:50,374 : INFO : PROGRESS: at example #10000, processed 751115 words (1472751/s), 21065 word types, 2130 tags
2019-06-26 17:12:50,896 : INFO : PROGRESS: at example #20000, processed 1548613 words (1530059/s), 29037 word types, 3486 tags
2019-06-26 17:12:51,414 : INFO : PROGRESS: at example #30000, processed 2316013 words (1484035/s), 32922 word types, 4530 tags
2019-06-26 17:12:51,936 : INFO : PROGRESS: at example #40000, processed 3097340 words (1498464/s), 35373 word types, 5319 tags
2019-06-26 17:12:52,455 : INFO : PROGRESS: at example #50000, processed 3864183 words (1480769/s), 36689 word types, 5923 tags
2019-06-26 17:12:52,972 : INFO : PROGRESS: at example #60000, processed 4617395 words (1460185/s), 37384 word types, 6408 tags
2019-06-26 17:12:53,492 : INFO : PROGRESS: at example #70000, pr

2019-06-26 17:13:40,891 : INFO : EPOCH 4 - PROGRESS: at 41.48% examples, 451395 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:41,893 : INFO : EPOCH 4 - PROGRESS: at 50.66% examples, 475130 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:42,928 : INFO : EPOCH 4 - PROGRESS: at 57.93% examples, 476021 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:43,943 : INFO : EPOCH 4 - PROGRESS: at 64.06% examples, 469942 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:13:44,954 : INFO : EPOCH 4 - PROGRESS: at 69.50% examples, 459381 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:13:45,977 : INFO : EPOCH 4 - PROGRESS: at 77.59% examples, 464600 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:46,990 : INFO : EPOCH 4 - PROGRESS: at 86.64% examples, 475075 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:48,005 : INFO : EPOCH 4 - PROGRESS: at 96.38% examples, 486445 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:13:48,327 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-

2019-06-26 17:14:41,741 : INFO : EPOCH 8 - PROGRESS: at 80.67% examples, 484863 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:14:42,752 : INFO : EPOCH 8 - PROGRESS: at 86.63% examples, 476546 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:14:43,762 : INFO : EPOCH 8 - PROGRESS: at 94.17% examples, 477076 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:14:44,519 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:14:44,528 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:14:44,529 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:14:44,530 : INFO : EPOCH - 8 : training on 6557240 raw words (6641536 effective words) took 13.9s, 478732 effective words/s
2019-06-26 17:14:45,550 : INFO : EPOCH 9 - PROGRESS: at 5.48% examples, 352800 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:14:46,580 : INFO : EPOCH 9 - PROGRESS: at 11.68% examples, 365646 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:14:47,583 :

2019-06-26 17:15:42,490 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:15:42,497 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:15:42,504 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:15:42,505 : INFO : EPOCH - 12 : training on 6557240 raw words (6641536 effective words) took 15.0s, 443309 effective words/s
2019-06-26 17:15:43,511 : INFO : EPOCH 13 - PROGRESS: at 6.66% examples, 437075 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:15:44,524 : INFO : EPOCH 13 - PROGRESS: at 15.13% examples, 490496 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:15:45,537 : INFO : EPOCH 13 - PROGRESS: at 22.24% examples, 475176 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:15:46,542 : INFO : EPOCH 13 - PROGRESS: at 28.13% examples, 455892 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:15:47,559 : INFO : EPOCH 13 - PROGRESS: at 35.47% examples, 462675 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:15:48

2019-06-26 17:16:41,129 : INFO : EPOCH 17 - PROGRESS: at 7.11% examples, 472018 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:16:42,139 : INFO : EPOCH 17 - PROGRESS: at 15.28% examples, 498204 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:16:43,149 : INFO : EPOCH 17 - PROGRESS: at 22.67% examples, 497086 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:16:44,157 : INFO : EPOCH 17 - PROGRESS: at 29.04% examples, 474620 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:16:45,160 : INFO : EPOCH 17 - PROGRESS: at 35.78% examples, 467431 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:16:46,165 : INFO : EPOCH 17 - PROGRESS: at 43.05% examples, 472070 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:16:47,173 : INFO : EPOCH 17 - PROGRESS: at 50.85% examples, 476776 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:16:48,196 : INFO : EPOCH 17 - PROGRESS: at 57.31% examples, 470557 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:16:49,211 : INFO : EPOCH 17 - PROGRESS: at 64.14% examples, 467465 words/s,

2019-06-26 17:17:42,892 : INFO : EPOCH 21 - PROGRESS: at 47.14% examples, 442174 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:17:43,897 : INFO : EPOCH 21 - PROGRESS: at 53.24% examples, 437873 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:17:44,914 : INFO : EPOCH 21 - PROGRESS: at 61.81% examples, 451401 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:17:45,939 : INFO : EPOCH 21 - PROGRESS: at 69.94% examples, 457004 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:17:46,952 : INFO : EPOCH 21 - PROGRESS: at 77.10% examples, 458357 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:17:47,970 : INFO : EPOCH 21 - PROGRESS: at 83.98% examples, 458352 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:17:48,995 : INFO : EPOCH 21 - PROGRESS: at 90.61% examples, 455348 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:17:49,980 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:17:49,987 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:17:49,

2019-06-26 17:18:42,989 : INFO : EPOCH 25 - PROGRESS: at 89.55% examples, 481642 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:18:44,009 : INFO : EPOCH 25 - PROGRESS: at 96.90% examples, 483042 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:18:44,345 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:18:44,354 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:18:44,362 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:18:44,363 : INFO : EPOCH - 25 : training on 6557240 raw words (6641536 effective words) took 13.6s, 487542 effective words/s
2019-06-26 17:18:45,375 : INFO : EPOCH 26 - PROGRESS: at 7.50% examples, 475396 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:18:46,390 : INFO : EPOCH 26 - PROGRESS: at 14.87% examples, 493153 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:18:47,411 : INFO : EPOCH 26 - PROGRESS: at 22.05% examples, 488341 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:18:48

2019-06-26 17:19:41,071 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:19:41,072 : INFO : EPOCH - 29 : training on 6557240 raw words (6641536 effective words) took 13.3s, 499569 effective words/s
2019-06-26 17:19:42,086 : INFO : EPOCH 30 - PROGRESS: at 5.73% examples, 393628 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:19:43,098 : INFO : EPOCH 30 - PROGRESS: at 13.88% examples, 458239 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:19:44,120 : INFO : EPOCH 30 - PROGRESS: at 22.64% examples, 497400 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:19:45,134 : INFO : EPOCH 30 - PROGRESS: at 31.29% examples, 511094 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:19:46,151 : INFO : EPOCH 30 - PROGRESS: at 38.14% examples, 497297 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:19:47,172 : INFO : EPOCH 30 - PROGRESS: at 45.08% examples, 487817 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:19:48,193 : INFO : EPOCH 30 - PROGRESS: at 52.45% examples, 486466 words/

Time for [doc2vec dbow]: 0:07:05.432712


2019-06-26 17:19:55,841 : INFO : PROGRESS: at example #10000, processed 769107 words (1429876/s), 20976 word types, 2141 tags
2019-06-26 17:19:56,340 : INFO : PROGRESS: at example #20000, processed 1525244 words (1517898/s), 28526 word types, 3501 tags
2019-06-26 17:19:56,836 : INFO : PROGRESS: at example #30000, processed 2289251 words (1542602/s), 32810 word types, 4549 tags
2019-06-26 17:19:57,334 : INFO : PROGRESS: at example #40000, processed 3067423 words (1567663/s), 35082 word types, 5341 tags
2019-06-26 17:19:57,837 : INFO : PROGRESS: at example #50000, processed 3843979 words (1544550/s), 36582 word types, 5946 tags
2019-06-26 17:19:58,335 : INFO : PROGRESS: at example #60000, processed 4626312 words (1573650/s), 37319 word types, 6408 tags
2019-06-26 17:19:58,829 : INFO : PROGRESS: at example #70000, processed 5366029 words (1501829/s), 37702 word types, 6758 tags
2019-06-26 17:19:59,333 : INFO : PROGRESS: at example #80000, processed 6121569 words (1502776/s), 37993 word ty

2019-06-26 17:20:50,359 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:20:50,360 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:20:50,361 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:20:50,362 : INFO : EPOCH - 3 : training on 6557240 raw words (5679245 effective words) took 17.4s, 325685 effective words/s
2019-06-26 17:20:51,365 : INFO : EPOCH 4 - PROGRESS: at 5.98% examples, 342070 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:20:52,375 : INFO : EPOCH 4 - PROGRESS: at 11.52% examples, 322969 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:20:53,402 : INFO : EPOCH 4 - PROGRESS: at 18.61% examples, 348292 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:20:54,455 : INFO : EPOCH 4 - PROGRESS: at 25.08% examples, 348412 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:20:55,465 : INFO : EPOCH 4 - PROGRESS: at 30.44% examples, 338280 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:20:56,468 :

2019-06-26 17:21:51,716 : INFO : EPOCH 7 - PROGRESS: at 70.25% examples, 355921 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:21:52,724 : INFO : EPOCH 7 - PROGRESS: at 75.57% examples, 351679 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:21:53,739 : INFO : EPOCH 7 - PROGRESS: at 81.36% examples, 350550 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:21:54,770 : INFO : EPOCH 7 - PROGRESS: at 89.08% examples, 356482 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:21:55,792 : INFO : EPOCH 7 - PROGRESS: at 96.99% examples, 361746 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:21:56,068 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:21:56,077 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:21:56,082 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:21:56,083 : INFO : EPOCH - 7 : training on 6557240 raw words (5679751 effective words) took 15.5s, 365800 effective words/s
2019-06-26 17:21:57,090 

2019-06-26 17:22:52,460 : INFO : EPOCH 11 - PROGRESS: at 38.81% examples, 306469 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:22:53,466 : INFO : EPOCH 11 - PROGRESS: at 44.82% examples, 311654 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:22:54,466 : INFO : EPOCH 11 - PROGRESS: at 51.21% examples, 318744 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:22:55,484 : INFO : EPOCH 11 - PROGRESS: at 56.55% examples, 316354 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:22:56,514 : INFO : EPOCH 11 - PROGRESS: at 62.98% examples, 320205 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:22:57,541 : INFO : EPOCH 11 - PROGRESS: at 68.58% examples, 318973 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:22:58,557 : INFO : EPOCH 11 - PROGRESS: at 76.37% examples, 328594 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:22:59,570 : INFO : EPOCH 11 - PROGRESS: at 82.33% examples, 329173 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:23:00,605 : INFO : EPOCH 11 - PROGRESS: at 89.86% examples, 334707 words/s

2019-06-26 17:23:53,852 : INFO : EPOCH 15 - PROGRESS: at 18.25% examples, 339183 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:23:54,860 : INFO : EPOCH 15 - PROGRESS: at 23.73% examples, 330764 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:23:55,864 : INFO : EPOCH 15 - PROGRESS: at 28.74% examples, 322476 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:23:56,872 : INFO : EPOCH 15 - PROGRESS: at 34.27% examples, 319487 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:23:57,912 : INFO : EPOCH 15 - PROGRESS: at 39.59% examples, 313426 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:23:58,925 : INFO : EPOCH 15 - PROGRESS: at 46.90% examples, 326704 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:23:59,926 : INFO : EPOCH 15 - PROGRESS: at 54.88% examples, 339556 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:24:00,939 : INFO : EPOCH 15 - PROGRESS: at 61.66% examples, 343599 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:24:01,948 : INFO : EPOCH 15 - PROGRESS: at 68.93% examples, 350846 words/s

2019-06-26 17:24:54,616 : INFO : EPOCH 19 - PROGRESS: at 5.04% examples, 280435 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:24:55,639 : INFO : EPOCH 19 - PROGRESS: at 10.61% examples, 303106 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:24:56,659 : INFO : EPOCH 19 - PROGRESS: at 17.50% examples, 324574 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:24:57,672 : INFO : EPOCH 19 - PROGRESS: at 22.49% examples, 315083 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:24:58,697 : INFO : EPOCH 19 - PROGRESS: at 27.76% examples, 311767 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:24:59,703 : INFO : EPOCH 19 - PROGRESS: at 35.20% examples, 330331 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:25:00,710 : INFO : EPOCH 19 - PROGRESS: at 40.19% examples, 324144 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:25:01,736 : INFO : EPOCH 19 - PROGRESS: at 45.97% examples, 324269 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:25:02,747 : INFO : EPOCH 19 - PROGRESS: at 51.92% examples, 326412 words/s,

2019-06-26 17:25:58,709 : INFO : EPOCH 22 - PROGRESS: at 91.05% examples, 341647 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:25:59,725 : INFO : EPOCH 22 - PROGRESS: at 98.11% examples, 343951 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:25:59,891 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 17:25:59,914 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 17:25:59,918 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 17:25:59,918 : INFO : EPOCH - 22 : training on 6557240 raw words (5679481 effective words) took 16.4s, 346267 effective words/s
2019-06-26 17:26:00,924 : INFO : EPOCH 23 - PROGRESS: at 5.75% examples, 322423 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:26:01,944 : INFO : EPOCH 23 - PROGRESS: at 11.15% examples, 311543 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:26:02,972 : INFO : EPOCH 23 - PROGRESS: at 16.69% examples, 309890 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:26:03

2019-06-26 17:26:59,158 : INFO : EPOCH 26 - PROGRESS: at 45.60% examples, 320654 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:00,166 : INFO : EPOCH 26 - PROGRESS: at 53.80% examples, 334824 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:01,176 : INFO : EPOCH 26 - PROGRESS: at 60.51% examples, 340136 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:02,177 : INFO : EPOCH 26 - PROGRESS: at 65.72% examples, 337079 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:03,188 : INFO : EPOCH 26 - PROGRESS: at 71.12% examples, 333560 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:27:04,195 : INFO : EPOCH 26 - PROGRESS: at 77.70% examples, 335237 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:05,212 : INFO : EPOCH 26 - PROGRESS: at 84.13% examples, 336502 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:27:06,216 : INFO : EPOCH 26 - PROGRESS: at 90.16% examples, 336700 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:27:07,218 : INFO : EPOCH 26 - PROGRESS: at 98.54% examples, 344829 words/s

2019-06-26 17:27:59,623 : INFO : EPOCH 30 - PROGRESS: at 30.63% examples, 341465 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:28:00,660 : INFO : EPOCH 30 - PROGRESS: at 36.33% examples, 335268 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:28:01,670 : INFO : EPOCH 30 - PROGRESS: at 42.43% examples, 335638 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:28:02,691 : INFO : EPOCH 30 - PROGRESS: at 48.97% examples, 336479 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:28:03,705 : INFO : EPOCH 30 - PROGRESS: at 54.31% examples, 332739 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:28:04,736 : INFO : EPOCH 30 - PROGRESS: at 60.85% examples, 335990 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:28:05,743 : INFO : EPOCH 30 - PROGRESS: at 66.54% examples, 335502 words/s, in_qsize 5, out_qsize 0
2019-06-26 17:28:06,757 : INFO : EPOCH 30 - PROGRESS: at 70.94% examples, 328615 words/s, in_qsize 6, out_qsize 0
2019-06-26 17:28:07,765 : INFO : EPOCH 30 - PROGRESS: at 75.96% examples, 327370 words/s

Time for [doc2vec dmm]: 0:08:16.949758
Time for [3 - doc2vec model]: 0:15:22.382699


In [28]:
# 4. vectorize arguments
with Timer("4 - vectorize arguments"):
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dbow)
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dmm)
    X_train, X_dev = make_vectors(X_train, X_dev, model_concat)

100%|██████████| 42733/42733 [09:11<00:00, 77.49it/s] 
100%|██████████| 18315/18315 [04:04<00:00, 82.95it/s] 

Time for [4 - vectorize arguments]: 0:13:17.009946





In [30]:
# 5. combine two argument vectors into a single one
# - diff / concat / ...
with Timer("5 - vector comparison of arguments"):
    X_train_diff, X_dev_diff = make_vector_comparison(X_train, X_dev, mode="concat")

X_train_ = X_train_diff
X_dev_ = X_dev_diff

100%|██████████| 42733/42733 [00:03<00:00, 13011.80it/s]
100%|██████████| 18315/18315 [00:01<00:00, 13006.89it/s]


Time for [5 - vector comparison of arguments]: 0:00:04.853041


In [31]:
# 6. train
with Timer("6 - SVM (train -> predict)"):
    y_pred_svm = train_test_svm(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_svm, name="SVM", heatmap=False))

Time for [StandardScaler fit]: 0:00:00.770995
Time for [StandardScaler transform]: 0:00:00.295924


  y = column_or_1d(y, warn=True)


Time for [SVC (linear) fit]: 0:02:06.679089
Time for [SVC predict]: 0:00:00.094965
Time for [6 - SVM (train -> predict)]: 0:02:07.860045
Confusion Matrix:
[[5061 3875]
 [4041 5338]]

Accuracy:  0.57 

Report for [SVM]:
              precision    recall  f1-score   support

       False       0.56      0.57      0.56      8936
        True       0.58      0.57      0.57      9379

    accuracy                           0.57     18315
   macro avg       0.57      0.57      0.57     18315
weighted avg       0.57      0.57      0.57     18315

{'macro': 0.57, 'micro': 0.57}
Time for [7 - report]: 0:00:00.048020




In [32]:
# 6. train
with Timer("6 - LogReg (train -> predict)"):
    y_pred_logreg = train_test_logreg(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_logreg, name="LogisticRegression", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [LogisticRegression fit]: 0:00:24.710920
Time for [LogisticRegression predict]: 0:00:00.091657
Time for [6 - LogReg (train -> predict)]: 0:00:24.802823
Confusion Matrix:
[[5047 3889]
 [3974 5405]]

Accuracy:  0.57 

Report for [LogisticRegression]:
              precision    recall  f1-score   support

       False       0.56      0.56      0.56      8936
        True       0.58      0.58      0.58      9379

    accuracy                           0.57     18315
   macro avg       0.57      0.57      0.57     18315
weighted avg       0.57      0.57      0.57     18315

{'macro': 0.57, 'micro': 0.57}
Time for [7 - report]: 0:00:00.049836


In [33]:
# 6. train
with Timer("6 - SGDClassifier (train -> predict)"):
    y_pred_sgdcla = train_test_sgd(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_sgdcla, name="SGDClassifier", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [SGDClassifier fit]: 0:00:15.890665
Time for [SGDClassifier predict]: 0:00:00.092300
Time for [6 - SGDClassifier (train -> predict)]: 0:00:15.983215
Confusion Matrix:
[[4644 4292]
 [3525 5854]]

Accuracy:  0.57 

Report for [SGDClassifier]:
              precision    recall  f1-score   support

       False       0.57      0.52      0.54      8936
        True       0.58      0.62      0.60      9379

    accuracy                           0.57     18315
   macro avg       0.57      0.57      0.57     18315
weighted avg       0.57      0.57      0.57     18315

{'macro': 0.57, 'micro': 0.57}
Time for [7 - report]: 0:00:00.049750


In [None]:
# old
return

asdf

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)