# RATIO 2019 - Benchmarking Workshop

In [56]:
import datetime
import logging
import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
# from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn import utils

from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
# from tqdm.autonotebook import tqdm

In [38]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [39]:
# import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/ekoerner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ekoerner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ekoerner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [3]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [4]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [5]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

Time for [read cross]: 0:00:00.930528
Time for [read within]: 0:00:01.014369


In [6]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if "abortion" in title:
        row['tag'] = 'abortion'
    elif "gay marriage"  in title:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross traindev]: 0:00:35.189107
Time for [tag cross test]: 0:00:19.834640
Time for [tag within traindev]: 0:00:37.309147
Time for [tag within test]: 0:00:18.014437


### Get an overview about each dataset

In [7]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [8]:
# with Timer("overview cross"):
#     get_overview(cross_traindev_df)

In [9]:
# with Timer("overview within"):
#     get_overview(within_traindev_df)

## Train model - Baseline

### train dev set - 70% 30%

In [10]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

### lemmatizing

In [11]:
def get_wordnet_pos(treebank_tag):
    """return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    '''lemmatize words (with POS information) and then stem'''
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def do_segmentation(text):
    '''do sentence segmentation, tokenization (with lemmatization&stemming)'''
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return lemma


def preprocess(text):
    '''concat lemmatized words together again'''
    lemma = do_segmentation(text)
    return ' '.join(lemma)

### Extracting n grams lemma for argument1 and argument2

In [12]:
def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train Doc2Vec model and vectorize argument1 and argument2

In [13]:
def make_d2v_docs(row):
    words1 = do_segmentation(row['argument1'])
    words2 = do_segmentation(row['argument2'])

    row['argument1_doc'] = TaggedDocument(words=words1,
                                          tags=[row['argument1_id']])
    row['argument2_doc'] = TaggedDocument(words=words2,
                                          tags=[row['argument2_id']])

    row['argument1_lemmas'] = ' '.join(words1)
    row['argument2_lemmas'] = ' '.join(words2)

    return row


class DatasetIter:
    def __init__(self, ds, shuffle=True):
        self.ds = ds
        self.shuffle = shuffle

    def _make_taggeddocs(self, row):
        yield row['argument1_doc']
        yield row['argument2_doc']

    def __iter__(self):
        if self.shuffle:
            self.ds = self.ds.sample(frac=1)

        for _, row in self.ds.iterrows():
            for doc in self._make_taggeddocs(row):
                yield doc


# https://github.com/RaRe-Technologies/gensim/blob/2024be9053094fbb2e765b9a06b9dc580f55c505/gensim/test/test_doc2vec.py#L501
class ConcatenatedDoc2Vec(object):
    """
    Concatenation of multiple models for reproducing the Paragraph Vectors paper.
    Models must have exactly-matching vocabulary and document IDs. (Models should
    be trained separately; this wrapper just returns concatenated results.)
    """

    def __init__(self, models):
        self.models = models
        if hasattr(models[0], 'docvecs'):
            self.docvecs = ConcatenatedDocvecs([model.docvecs for model in models])

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

    def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5):
        return np.concatenate([model.infer_vector(document, alpha, min_alpha, steps) for model in self.models])

    def train(self, *ignore_args, **ignore_kwargs):
        pass  # train subcomponents individually


class ConcatenatedDocvecs(object):
    def __init__(self, models):
        self.models = models

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

In [27]:
def train_model(X_train, X_dev, workers=2, epochs=30):
    with Timer("doc2vec dbow"):
        # columns=['argument1_lemmas', 'argument2_lemmas']
        # pd.concat([X_train[columns], X_dev[columns]])
        alpha = 0.025  # https://radimrehurek.com/gensim/models/base_any2vec.html#gensim.models.base_any2vec.BaseWordEmbeddingsModel
        # %%time
        model_dbow = Doc2Vec(DatasetIter(X_train, shuffle=True),
                             dm=0,
                             vector_size=300,
                             negative=5,
                             hs=0,
                             min_count=2,
                             sample=0,
                             workers=workers,
                             epochs=epochs,
                             alpha=alpha,
                             min_alpha=alpha - (epochs * 0.002))
        
    with Timer("doc2vec dmm"):
        model_dmm = Doc2Vec(DatasetIter(X_train, shuffle=True),
                            dm=1,
                            dm_mean=1,
                            vector_size=300,
                            window=10,
                            negative=5,
                            min_count=1,
                            workers=workers,
                            epochs=epochs,
                            alpha=0.065,
                            min_alpha=0.065 - (epochs * 0.002))
        
    return model_dbow, model_dmm

In [15]:
# unused
def vec_for_learning(model, df):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [16]:
def make_vectors(X_train, X_dev, model):
    def make_d2v_vecs(row):
        vec1 = model.infer_vector(row['argument1_doc'].words, steps=20)
        vec2 = model.infer_vector(row['argument2_doc'].words, steps=20)

        row['argument1_vec'] = vec1
        row['argument2_vec'] = vec2
        
        return row

    X_train = X_train.progress_apply(make_d2v_vecs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_vecs, axis=1)
    
    return X_train, X_dev

In [32]:
def make_vector_comparison_diff(X_train, X_dev):
    def ret_vec_diff(row):
        return row['argument1_vec'] - row['argument2_vec']

    X_train_diff = X_train.progress_apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.progress_apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison_concat(X_train, X_dev):
    def ret_vec_concat(row):
        return np.concatenate((row['argument1_vec'], row['argument2_vec']))

    X_train_concat = X_train.progress_apply(ret_vec_concat, axis=1)
    X_dev_concat = X_dev.progress_apply(ret_vec_concat, axis=1)

    return X_train_concat, X_dev_concat


def make_vector_comparison(X_train, X_dev, mode="diff"):
    if mode == "concat":
        X_train, X_dev = make_vector_comparison_concat(X_train, X_dev)
    else:
        X_train, X_dev = make_vector_comparison_diff(X_train, X_dev)

    # array of array to 2d array
    X_train = np.array(list(X_train.values))
    X_dev = np.array(list(X_dev.values))

    return X_train, X_dev

### Train model and evaluate

In [59]:
def train_test_svm(X_train, y_train, X_test):
    with Timer("StandardScaler fit"):
        scaler = StandardScaler(copy=True, with_mean=False)
        scaler.fit(X_train)

    with Timer("StandardScaler transform"):
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # ------------------

    with Timer("SVC (linear) fit"):
        # svclassifier = SVC(kernel='linear')
        svclassifier = LinearSVC()        
        svclassifier.fit(X_train, y_train)

    with Timer("SVC predict"):
        y_pred = svclassifier.predict(X_test)

    return y_pred


def train_test_logreg(X_train, y_train, X_test):
    with Timer("LogisticRegression fit"):
        logreg = LogisticRegression(n_jobs=1, C=1e5)
        logreg.fit(X_train, y_train)
    
    with Timer("LogisticRegression predict"):
        y_pred = logreg.predict(X_test)
    
    return y_pred


def train_test_sgd(X_train, y_train, X_test):
    with Timer("SGDClassifier fit"):
        sgdcla = SGDClassifier()
        sgdcla.fit(X_train, y_train)
    
    with Timer("SGDClassifier predict"):
        y_pred = sgdcla.predict(X_test)
    
    return y_pred


def heatconmat(y_test, y_pred):
    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                yticklabels=sorted(y_test.unique()))
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    if heatmap:
        heatconmat(y_test['is_same_side'], y_pred)
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2), '\n')  #

    print('Report{}:'.format("" if not name else " for [{}]".format(name)))
    print(classification_report(y_test, y_pred))

    f1_dic = {}
    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

### Cross topic - Training and evaluating model 

In [19]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

Time for [1 - test/train]: 0:00:00.013205


In [20]:
# 2. tokenize (make doc2vec docs + lemma string)
# tqdm.pandas()
with Timer("2 - tokenize"):
    X_train = X_train.progress_apply(make_d2v_docs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_docs, axis=1)

100%|██████████| 42733/42733 [14:09<00:00, 50.30it/s]
100%|██████████| 18315/18315 [06:15<00:00, 48.83it/s]

Time for [2 - tokenize]: 0:20:24.600272





In [65]:
with Timer("2a - pickle"):
    X_train.to_pickle("X_train.cross_td.p")
    X_dev.to_pickle("X_dev.cross_td.p")

Time for [2a - pickle]: 0:00:04.754327


In [66]:
with Timer("2b - unpickle"):
    X_train = pd.read_pickle("X_train.cross_td.p")
    X_dev = pd.read_pickle("X_dev.cross_td.p")

Time for [2b - unpickle]: 0:00:03.149843


In [58]:
# 3. train doc2vec model
with Timer("3 - doc2vec model"):
    model_dbow, model_dmm = train_model(X_train, X_dev, workers=3, epochs=30)

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    model_concat = ConcatenatedDoc2Vec([model_dbow, model_dmm])

2019-06-26 14:35:39,467 : INFO : collecting all words and their counts
2019-06-26 14:35:39,555 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-06-26 14:35:40,328 : INFO : PROGRESS: at example #10000, processed 756855 words (980205/s), 21416 word types, 2136 tags
2019-06-26 14:35:41,071 : INFO : PROGRESS: at example #20000, processed 1517256 words (1027139/s), 28668 word types, 3475 tags
2019-06-26 14:35:41,810 : INFO : PROGRESS: at example #30000, processed 2300874 words (1063249/s), 32981 word types, 4535 tags
2019-06-26 14:35:42,538 : INFO : PROGRESS: at example #40000, processed 3055071 words (1039104/s), 35228 word types, 5294 tags
2019-06-26 14:35:43,312 : INFO : PROGRESS: at example #50000, processed 3831527 words (1005748/s), 36541 word types, 5933 tags
2019-06-26 14:35:44,236 : INFO : PROGRESS: at example #60000, processed 4602568 words (848394/s), 37180 word types, 6380 tags
2019-06-26 14:35:45,127 : INFO : PROGRESS: at example #70000, proc

2019-06-26 14:36:36,055 : INFO : EPOCH 3 - PROGRESS: at 91.87% examples, 399432 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:36:37,055 : INFO : EPOCH 3 - PROGRESS: at 97.90% examples, 399396 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:36:37,287 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:36:37,315 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:36:37,323 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:36:37,324 : INFO : EPOCH - 3 : training on 6557240 raw words (6641536 effective words) took 16.5s, 401304 effective words/s
2019-06-26 14:36:38,358 : INFO : EPOCH 4 - PROGRESS: at 5.54% examples, 351959 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:36:39,360 : INFO : EPOCH 4 - PROGRESS: at 11.50% examples, 380331 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:36:40,366 : INFO : EPOCH 4 - PROGRESS: at 18.02% examples, 395881 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:36:41,367 :

2019-06-26 14:37:36,348 : INFO : EPOCH 7 - PROGRESS: at 61.99% examples, 406966 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:37:37,356 : INFO : EPOCH 7 - PROGRESS: at 68.03% examples, 406923 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:37:38,358 : INFO : EPOCH 7 - PROGRESS: at 74.13% examples, 407846 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:37:39,380 : INFO : EPOCH 7 - PROGRESS: at 80.38% examples, 407260 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:37:40,410 : INFO : EPOCH 7 - PROGRESS: at 86.67% examples, 405930 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:37:41,411 : INFO : EPOCH 7 - PROGRESS: at 92.90% examples, 406130 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:37:42,417 : INFO : EPOCH 7 - PROGRESS: at 99.27% examples, 406217 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:37:42,476 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:37:42,478 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:37:42,487 : I

2019-06-26 14:38:37,325 : INFO : EPOCH 11 - PROGRESS: at 36.32% examples, 398256 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:38:38,339 : INFO : EPOCH 11 - PROGRESS: at 42.96% examples, 401966 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:38:39,343 : INFO : EPOCH 11 - PROGRESS: at 49.31% examples, 403883 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:38:40,374 : INFO : EPOCH 11 - PROGRESS: at 55.60% examples, 404273 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:38:41,406 : INFO : EPOCH 11 - PROGRESS: at 62.03% examples, 405293 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:38:42,411 : INFO : EPOCH 11 - PROGRESS: at 68.31% examples, 406414 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:38:43,423 : INFO : EPOCH 11 - PROGRESS: at 74.68% examples, 406974 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:38:44,435 : INFO : EPOCH 11 - PROGRESS: at 81.37% examples, 409801 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:38:45,449 : INFO : EPOCH 11 - PROGRESS: at 87.31% examples, 408763 words/s

2019-06-26 14:39:38,899 : INFO : EPOCH 15 - PROGRESS: at 35.04% examples, 445963 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:39:39,902 : INFO : EPOCH 15 - PROGRESS: at 42.07% examples, 449360 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:39:40,916 : INFO : EPOCH 15 - PROGRESS: at 49.56% examples, 455342 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:41,929 : INFO : EPOCH 15 - PROGRESS: at 57.01% examples, 458740 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:42,934 : INFO : EPOCH 15 - PROGRESS: at 64.37% examples, 461818 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:43,945 : INFO : EPOCH 15 - PROGRESS: at 71.68% examples, 464972 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:44,949 : INFO : EPOCH 15 - PROGRESS: at 78.84% examples, 465340 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:45,957 : INFO : EPOCH 15 - PROGRESS: at 86.23% examples, 467631 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:39:46,966 : INFO : EPOCH 15 - PROGRESS: at 93.18% examples, 468870 words/s

2019-06-26 14:40:40,321 : INFO : EPOCH 19 - PROGRESS: at 71.44% examples, 470219 words/s, in_qsize 4, out_qsize 0
2019-06-26 14:40:41,326 : INFO : EPOCH 19 - PROGRESS: at 77.86% examples, 465310 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:40:42,338 : INFO : EPOCH 19 - PROGRESS: at 85.00% examples, 465239 words/s, in_qsize 4, out_qsize 0
2019-06-26 14:40:43,350 : INFO : EPOCH 19 - PROGRESS: at 92.21% examples, 465653 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:40:44,385 : INFO : EPOCH 19 - PROGRESS: at 98.84% examples, 463371 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:40:44,501 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:40:44,520 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:40:44,528 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:40:44,530 : INFO : EPOCH - 19 : training on 6557240 raw words (6641536 effective words) took 14.3s, 463822 effective words/s
2019-06-26 14:40:4

2019-06-26 14:41:40,359 : INFO : EPOCH 23 - PROGRESS: at 87.88% examples, 479819 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:41:41,367 : INFO : EPOCH 23 - PROGRESS: at 97.06% examples, 490124 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:41:41,591 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:41:41,597 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:41:41,599 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:41:41,599 : INFO : EPOCH - 23 : training on 6557240 raw words (6641536 effective words) took 13.4s, 495878 effective words/s
2019-06-26 14:41:42,616 : INFO : EPOCH 24 - PROGRESS: at 6.14% examples, 392365 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:41:43,628 : INFO : EPOCH 24 - PROGRESS: at 13.11% examples, 427050 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:41:44,628 : INFO : EPOCH 24 - PROGRESS: at 21.97% examples, 477742 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:41:45

2019-06-26 14:42:40,177 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:42:40,178 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:42:40,193 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:42:40,194 : INFO : EPOCH - 27 : training on 6557240 raw words (6641536 effective words) took 15.3s, 433553 effective words/s
2019-06-26 14:42:41,204 : INFO : EPOCH 28 - PROGRESS: at 6.68% examples, 447221 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:42:42,216 : INFO : EPOCH 28 - PROGRESS: at 14.18% examples, 465550 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:42:43,228 : INFO : EPOCH 28 - PROGRESS: at 20.71% examples, 461668 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:42:44,231 : INFO : EPOCH 28 - PROGRESS: at 29.47% examples, 485994 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:42:45,249 : INFO : EPOCH 28 - PROGRESS: at 38.32% examples, 504144 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:42:46

Time for [doc2vec dbow]: 0:07:45.506598


2019-06-26 14:43:25,483 : INFO : PROGRESS: at example #10000, processed 759945 words (1547362/s), 21470 word types, 2066 tags
2019-06-26 14:43:25,972 : INFO : PROGRESS: at example #20000, processed 1500326 words (1515917/s), 29003 word types, 3435 tags
2019-06-26 14:43:26,474 : INFO : PROGRESS: at example #30000, processed 2257564 words (1511528/s), 32859 word types, 4469 tags
2019-06-26 14:43:26,966 : INFO : PROGRESS: at example #40000, processed 3004745 words (1521948/s), 35021 word types, 5248 tags
2019-06-26 14:43:27,463 : INFO : PROGRESS: at example #50000, processed 3783673 words (1570509/s), 36349 word types, 5880 tags
2019-06-26 14:43:27,959 : INFO : PROGRESS: at example #60000, processed 4562615 words (1571187/s), 37227 word types, 6352 tags
2019-06-26 14:43:28,456 : INFO : PROGRESS: at example #70000, processed 5347161 words (1580365/s), 37658 word types, 6727 tags
2019-06-26 14:43:28,955 : INFO : PROGRESS: at example #80000, processed 6128916 words (1570304/s), 37983 word ty

2019-06-26 14:44:19,879 : INFO : EPOCH 3 - PROGRESS: at 71.12% examples, 299348 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:44:20,893 : INFO : EPOCH 3 - PROGRESS: at 75.72% examples, 296669 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:44:21,915 : INFO : EPOCH 3 - PROGRESS: at 81.50% examples, 298529 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:44:22,926 : INFO : EPOCH 3 - PROGRESS: at 89.05% examples, 306138 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:44:23,940 : INFO : EPOCH 3 - PROGRESS: at 95.55% examples, 311320 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:44:24,742 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:44:24,749 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:44:24,764 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:44:24,765 : INFO : EPOCH - 3 : training on 6557240 raw words (5678667 effective words) took 18.2s, 311824 effective words/s
2019-06-26 14:44:25,771 

2019-06-26 14:45:21,593 : INFO : EPOCH 7 - PROGRESS: at 42.30% examples, 293867 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:45:22,603 : INFO : EPOCH 7 - PROGRESS: at 46.87% examples, 290505 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:45:23,621 : INFO : EPOCH 7 - PROGRESS: at 53.15% examples, 298214 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:45:24,656 : INFO : EPOCH 7 - PROGRESS: at 58.94% examples, 301094 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:45:25,675 : INFO : EPOCH 7 - PROGRESS: at 64.72% examples, 301876 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:45:26,699 : INFO : EPOCH 7 - PROGRESS: at 69.88% examples, 300486 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:45:27,709 : INFO : EPOCH 7 - PROGRESS: at 75.57% examples, 301961 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:45:28,720 : INFO : EPOCH 7 - PROGRESS: at 80.29% examples, 299371 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:45:29,726 : INFO : EPOCH 7 - PROGRESS: at 86.00% examples, 300204 words/s, in_qsiz

2019-06-26 14:46:23,030 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:46:23,031 : INFO : EPOCH - 10 : training on 6557240 raw words (5679599 effective words) took 17.7s, 321715 effective words/s
2019-06-26 14:46:24,075 : INFO : EPOCH 11 - PROGRESS: at 5.15% examples, 278765 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:25,076 : INFO : EPOCH 11 - PROGRESS: at 9.75% examples, 271326 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:26,089 : INFO : EPOCH 11 - PROGRESS: at 14.20% examples, 265632 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:27,103 : INFO : EPOCH 11 - PROGRESS: at 19.47% examples, 275373 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:28,114 : INFO : EPOCH 11 - PROGRESS: at 24.56% examples, 277849 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:29,119 : INFO : EPOCH 11 - PROGRESS: at 29.04% examples, 275375 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:46:30,132 : INFO : EPOCH 11 - PROGRESS: at 36.25% examples, 292760 words/s

2019-06-26 14:47:26,334 : INFO : EPOCH 14 - PROGRESS: at 60.72% examples, 285916 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:47:27,339 : INFO : EPOCH 14 - PROGRESS: at 65.65% examples, 284170 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:47:28,358 : INFO : EPOCH 14 - PROGRESS: at 72.48% examples, 291348 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:47:29,387 : INFO : EPOCH 14 - PROGRESS: at 77.47% examples, 290178 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:47:30,393 : INFO : EPOCH 14 - PROGRESS: at 82.29% examples, 288827 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:47:31,403 : INFO : EPOCH 14 - PROGRESS: at 90.41% examples, 298656 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:47:32,414 : INFO : EPOCH 14 - PROGRESS: at 99.03% examples, 309308 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:47:32,469 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:47:32,472 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:47:32,

2019-06-26 14:48:27,801 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 14:48:27,801 : INFO : EPOCH - 17 : training on 6557240 raw words (5679747 effective words) took 16.3s, 348870 effective words/s
2019-06-26 14:48:28,807 : INFO : EPOCH 18 - PROGRESS: at 4.43% examples, 255663 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:48:29,810 : INFO : EPOCH 18 - PROGRESS: at 9.54% examples, 272766 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:48:30,812 : INFO : EPOCH 18 - PROGRESS: at 15.26% examples, 292342 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:48:31,825 : INFO : EPOCH 18 - PROGRESS: at 20.13% examples, 289140 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:48:32,880 : INFO : EPOCH 18 - PROGRESS: at 25.14% examples, 286441 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:48:33,900 : INFO : EPOCH 18 - PROGRESS: at 31.75% examples, 297452 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:48:34,914 : INFO : EPOCH 18 - PROGRESS: at 37.57% examples, 301815 words/s

2019-06-26 14:49:30,563 : INFO : EPOCH 21 - PROGRESS: at 48.28% examples, 302571 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:49:31,578 : INFO : EPOCH 21 - PROGRESS: at 53.80% examples, 302408 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:49:32,582 : INFO : EPOCH 21 - PROGRESS: at 59.38% examples, 304310 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:49:33,626 : INFO : EPOCH 21 - PROGRESS: at 65.47% examples, 306273 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:49:34,655 : INFO : EPOCH 21 - PROGRESS: at 71.26% examples, 306906 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:49:35,663 : INFO : EPOCH 21 - PROGRESS: at 76.34% examples, 306151 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:49:36,671 : INFO : EPOCH 21 - PROGRESS: at 81.90% examples, 304426 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:49:37,672 : INFO : EPOCH 21 - PROGRESS: at 86.51% examples, 301426 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:49:38,724 : INFO : EPOCH 21 - PROGRESS: at 91.73% examples, 299899 words/s

2019-06-26 14:50:30,996 : INFO : EPOCH - 24 : training on 6557240 raw words (5678522 effective words) took 15.5s, 367224 effective words/s
2019-06-26 14:50:32,003 : INFO : EPOCH 25 - PROGRESS: at 7.18% examples, 398470 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:50:33,027 : INFO : EPOCH 25 - PROGRESS: at 15.36% examples, 419900 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:50:34,029 : INFO : EPOCH 25 - PROGRESS: at 23.58% examples, 438766 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:50:35,034 : INFO : EPOCH 25 - PROGRESS: at 31.32% examples, 439552 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:50:36,052 : INFO : EPOCH 25 - PROGRESS: at 39.44% examples, 440486 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:50:37,056 : INFO : EPOCH 25 - PROGRESS: at 47.65% examples, 443571 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:50:38,076 : INFO : EPOCH 25 - PROGRESS: at 55.99% examples, 448317 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:50:39,089 : INFO : EPOCH 25 - PROGRESS: at 64.06% 

2019-06-26 14:51:34,534 : INFO : EPOCH 28 - PROGRESS: at 68.11% examples, 274234 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:35,574 : INFO : EPOCH 28 - PROGRESS: at 73.35% examples, 273996 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:36,584 : INFO : EPOCH 28 - PROGRESS: at 78.08% examples, 274277 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:37,592 : INFO : EPOCH 28 - PROGRESS: at 83.20% examples, 275007 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:38,595 : INFO : EPOCH 28 - PROGRESS: at 89.41% examples, 278559 words/s, in_qsize 6, out_qsize 0
2019-06-26 14:51:39,614 : INFO : EPOCH 28 - PROGRESS: at 93.83% examples, 277149 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:40,632 : INFO : EPOCH 28 - PROGRESS: at 98.58% examples, 276275 words/s, in_qsize 5, out_qsize 0
2019-06-26 14:51:40,731 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 14:51:40,733 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 14:51:40,

Time for [doc2vec dmm]: 0:08:49.799967
Time for [3 - doc2vec model]: 0:16:35.306854


In [60]:
# 4. vectorize arguments
with Timer("4 - vectorize arguments"):
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dbow)
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dmm)
    X_train, X_dev = make_vectors(X_train, X_dev, model_concat)

100%|██████████| 42733/42733 [08:06<00:00, 87.85it/s] 
100%|██████████| 18315/18315 [03:40<00:00, 83.10it/s] 

Time for [4 - vectorize arguments]: 0:11:46.846109





In [61]:
# 5. combine two argument vectors into a single one
# - diff / concat / ...
with Timer("5 - vector comparison of arguments"):
    X_train_diff, X_dev_diff = make_vector_comparison(X_train, X_dev, mode="concat")

X_train_ = X_train_diff
X_dev_ = X_dev_diff

100%|██████████| 42733/42733 [00:03<00:00, 12894.11it/s]
100%|██████████| 18315/18315 [00:01<00:00, 13026.28it/s]


Time for [5 - vector comparison of arguments]: 0:00:04.903677


In [62]:
# 6. train
with Timer("6 - SVM (train -> predict)"):
    y_pred_svm = train_test_svm(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_svm, name="SVM", heatmap=False))

Time for [StandardScaler fit]: 0:00:00.741463
Time for [StandardScaler transform]: 0:00:00.284643


  y = column_or_1d(y, warn=True)


Time for [SVC (linear) fit]: 0:02:08.898048
Time for [SVC predict]: 0:00:00.118778
Time for [6 - SVM (train -> predict)]: 0:02:10.060351
Confusion Matrix:
[[2249 6687]
 [1927 7452]]

Accuracy:  0.53 

Report for [SVM]:
              precision    recall  f1-score   support

       False       0.54      0.25      0.34      8936
        True       0.53      0.79      0.63      9379

    accuracy                           0.53     18315
   macro avg       0.53      0.52      0.49     18315
weighted avg       0.53      0.53      0.49     18315

{'macro': 0.49, 'micro': 0.53}
Time for [7 - report]: 0:00:00.045468




In [63]:
# 6. train
with Timer("6 - LogReg (train -> predict)"):
    y_pred_logreg = train_test_logreg(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_logreg, name="LogisticRegression", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [LogisticRegression fit]: 0:00:26.509581
Time for [LogisticRegression predict]: 0:00:00.093120
Time for [6 - LogReg (train -> predict)]: 0:00:26.602942
Confusion Matrix:
[[4899 4037]
 [3982 5397]]

Accuracy:  0.56 

Report for [LogisticRegression]:
              precision    recall  f1-score   support

       False       0.55      0.55      0.55      8936
        True       0.57      0.58      0.57      9379

    accuracy                           0.56     18315
   macro avg       0.56      0.56      0.56     18315
weighted avg       0.56      0.56      0.56     18315

{'macro': 0.56, 'micro': 0.56}
Time for [7 - report]: 0:00:00.049659


In [64]:
# 6. train
with Timer("6 - SGDClassifier (train -> predict)"):
    y_pred_sgdcla = train_test_sgd(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_sgdcla, name="SGDClassifier", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [SGDClassifier fit]: 0:00:18.710785
Time for [SGDClassifier predict]: 0:00:00.093530
Time for [6 - SGDClassifier (train -> predict)]: 0:00:18.804574
Confusion Matrix:
[[6591 2345]
 [5135 4244]]

Accuracy:  0.59 

Report for [SGDClassifier]:
              precision    recall  f1-score   support

       False       0.56      0.74      0.64      8936
        True       0.64      0.45      0.53      9379

    accuracy                           0.59     18315
   macro avg       0.60      0.60      0.58     18315
weighted avg       0.60      0.59      0.58     18315

{'macro': 0.58, 'micro': 0.59}
Time for [7 - report]: 0:00:00.051484


In [None]:
# old
return

asdf

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)