# RATIO 2019 - Benchmarking Workshop

In [56]:
import datetime
import logging
import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
# from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn import utils

from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
# from tqdm.autonotebook import tqdm

In [38]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [39]:
# import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/ekoerner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ekoerner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ekoerner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [3]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [4]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [5]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)

Time for [read cross]: 0:00:00.930528
Time for [read within]: 0:00:01.014369


In [6]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if "abortion" in title:
        row['tag'] = 'abortion'
    elif "gay marriage"  in title:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross traindev]: 0:00:35.189107
Time for [tag cross test]: 0:00:19.834640
Time for [tag within traindev]: 0:00:37.309147
Time for [tag within test]: 0:00:18.014437


### Get an overview about each dataset

In [7]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [8]:
# with Timer("overview cross"):
#     get_overview(cross_traindev_df)

In [9]:
# with Timer("overview within"):
#     get_overview(within_traindev_df)

### Filter to only tagged input?

In [None]:
within_traindev_df = within_traindev_df[(within_traindev_df['tag'] == 'gay marriage')]

In [None]:
cross_traindev_df = cross_traindev_df[(cross_traindev_df['tag'] == 'gay marriage') | (cross_traindev_df['tag'] == 'abortion')]

## Train model - Baseline

### train dev set - 70% 30%

In [10]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

### lemmatizing

In [11]:
def get_wordnet_pos(treebank_tag):
    """return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    '''lemmatize words (with POS information) and then stem'''
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def do_segmentation(text):
    '''do sentence segmentation, tokenization (with lemmatization&stemming)'''
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return lemma


def preprocess(text):
    '''concat lemmatized words together again'''
    lemma = do_segmentation(text)
    return ' '.join(lemma)

### Extracting n grams lemma for argument1 and argument2

In [12]:
def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train Doc2Vec model and vectorize argument1 and argument2

In [13]:
def make_d2v_docs(row):
    words1 = do_segmentation(row['argument1'])
    words2 = do_segmentation(row['argument2'])

    row['argument1_doc'] = TaggedDocument(words=words1,
                                          tags=[row['argument1_id']])
    row['argument2_doc'] = TaggedDocument(words=words2,
                                          tags=[row['argument2_id']])

    row['argument1_lemmas'] = ' '.join(words1)
    row['argument2_lemmas'] = ' '.join(words2)

    return row


class DatasetIter:
    def __init__(self, ds, shuffle=True):
        self.ds = ds
        self.shuffle = shuffle

    def _make_taggeddocs(self, row):
        yield row['argument1_doc']
        yield row['argument2_doc']

    def __iter__(self):
        if self.shuffle:
            self.ds = self.ds.sample(frac=1)

        for _, row in self.ds.iterrows():
            for doc in self._make_taggeddocs(row):
                yield doc


# https://github.com/RaRe-Technologies/gensim/blob/2024be9053094fbb2e765b9a06b9dc580f55c505/gensim/test/test_doc2vec.py#L501
class ConcatenatedDoc2Vec(object):
    """
    Concatenation of multiple models for reproducing the Paragraph Vectors paper.
    Models must have exactly-matching vocabulary and document IDs. (Models should
    be trained separately; this wrapper just returns concatenated results.)
    """

    def __init__(self, models):
        self.models = models
        if hasattr(models[0], 'docvecs'):
            self.docvecs = ConcatenatedDocvecs([model.docvecs for model in models])

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

    def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5):
        return np.concatenate([model.infer_vector(document, alpha, min_alpha, steps) for model in self.models])

    def train(self, *ignore_args, **ignore_kwargs):
        pass  # train subcomponents individually


class ConcatenatedDocvecs(object):
    def __init__(self, models):
        self.models = models

    def __getitem__(self, token):
        return np.concatenate([model[token] for model in self.models])

In [27]:
def train_model(X_train, X_dev, workers=2, epochs=30):
    with Timer("doc2vec dbow"):
        # columns=['argument1_lemmas', 'argument2_lemmas']
        # pd.concat([X_train[columns], X_dev[columns]])
        alpha = 0.025  # https://radimrehurek.com/gensim/models/base_any2vec.html#gensim.models.base_any2vec.BaseWordEmbeddingsModel
        # %%time
        model_dbow = Doc2Vec(DatasetIter(X_train, shuffle=True),
                             dm=0,
                             vector_size=300,
                             negative=5,
                             hs=0,
                             min_count=2,
                             sample=0,
                             workers=workers,
                             epochs=epochs,
                             alpha=alpha,
                             min_alpha=alpha - (epochs * 0.002))
        
    with Timer("doc2vec dmm"):
        model_dmm = Doc2Vec(DatasetIter(X_train, shuffle=True),
                            dm=1,
                            dm_mean=1,
                            vector_size=300,
                            window=10,
                            negative=5,
                            min_count=1,
                            workers=workers,
                            epochs=epochs,
                            alpha=0.065,
                            min_alpha=0.065 - (epochs * 0.002))
        
    return model_dbow, model_dmm

In [15]:
# unused
def vec_for_learning(model, df):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [16]:
def make_vectors(X_train, X_dev, model):
    def make_d2v_vecs(row):
        vec1 = model.infer_vector(row['argument1_doc'].words, steps=20)
        vec2 = model.infer_vector(row['argument2_doc'].words, steps=20)

        row['argument1_vec'] = vec1
        row['argument2_vec'] = vec2
        
        return row

    X_train = X_train.progress_apply(make_d2v_vecs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_vecs, axis=1)
    
    return X_train, X_dev

In [32]:
def make_vector_comparison_diff(X_train, X_dev):
    def ret_vec_diff(row):
        return row['argument1_vec'] - row['argument2_vec']

    X_train_diff = X_train.progress_apply(ret_vec_diff, axis=1)
    X_dev_diff = X_dev.progress_apply(ret_vec_diff, axis=1)

    return X_train_diff, X_dev_diff


def make_vector_comparison_concat(X_train, X_dev):
    def ret_vec_concat(row):
        return np.concatenate((row['argument1_vec'], row['argument2_vec']))

    X_train_concat = X_train.progress_apply(ret_vec_concat, axis=1)
    X_dev_concat = X_dev.progress_apply(ret_vec_concat, axis=1)

    return X_train_concat, X_dev_concat


def make_vector_comparison(X_train, X_dev, mode="diff"):
    if mode == "concat":
        X_train, X_dev = make_vector_comparison_concat(X_train, X_dev)
    else:
        X_train, X_dev = make_vector_comparison_diff(X_train, X_dev)

    # array of array to 2d array
    X_train = np.array(list(X_train.values))
    X_dev = np.array(list(X_dev.values))

    return X_train, X_dev

### Train model and evaluate

In [59]:
def train_test_svm(X_train, y_train, X_test):
    with Timer("StandardScaler fit"):
        scaler = StandardScaler(copy=True, with_mean=False)
        scaler.fit(X_train)

    with Timer("StandardScaler transform"):
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # ------------------

    with Timer("SVC (linear) fit"):
        # svclassifier = SVC(kernel='linear')
        svclassifier = LinearSVC()        
        svclassifier.fit(X_train, y_train)

    with Timer("SVC predict"):
        y_pred = svclassifier.predict(X_test)

    return y_pred


def train_test_logreg(X_train, y_train, X_test):
    with Timer("LogisticRegression fit"):
        logreg = LogisticRegression(n_jobs=1, C=1e5)
        logreg.fit(X_train, y_train)
    
    with Timer("LogisticRegression predict"):
        y_pred = logreg.predict(X_test)
    
    return y_pred


def train_test_sgd(X_train, y_train, X_test):
    with Timer("SGDClassifier fit"):
        sgdcla = SGDClassifier()
        sgdcla.fit(X_train, y_train)
    
    with Timer("SGDClassifier predict"):
        y_pred = sgdcla.predict(X_test)
    
    return y_pred


def heatconmat(y_test, y_pred):
    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                yticklabels=sorted(y_test.unique()))
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    if heatmap:
        heatconmat(y_test['is_same_side'], y_pred)
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2), '\n')  #

    print('Report{}:'.format("" if not name else " for [{}]".format(name)))
    print(classification_report(y_test, y_pred))

    f1_dic = {}
    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

### Cross topic - Training and evaluating model 

In [68]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)

Time for [1 - test/train]: 0:00:00.051779


In [69]:
# 2. tokenize (make doc2vec docs + lemma string)
# tqdm.pandas()
with Timer("2 - tokenize"):
    X_train = X_train.progress_apply(make_d2v_docs, axis=1)
    X_dev = X_dev.progress_apply(make_d2v_docs, axis=1)

100%|██████████| 44732/44732 [14:23<00:00, 51.79it/s]
100%|██████████| 19171/19171 [06:04<00:00, 52.60it/s]

Time for [2 - tokenize]: 0:20:28.278134





In [77]:
with Timer("2a - pickle"):
    X_train.to_pickle("data/X_train.cross_td.p")
    X_dev.to_pickle("data/X_dev.cross_td.p")

Time for [2a - pickle]: 0:00:06.676402


In [66]:
with Timer("2b - unpickle"):
    X_train = pd.read_pickle("data/X_train.cross_td.p")
    X_dev = pd.read_pickle("data/X_dev.cross_td.p")

Time for [2b - unpickle]: 0:00:03.149843


In [71]:
# 3. train doc2vec model
with Timer("3 - doc2vec model"):
    model_dbow, model_dmm = train_model(X_train, X_dev, workers=3, epochs=30)

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    model_concat = ConcatenatedDoc2Vec([model_dbow, model_dmm])

2019-06-26 15:40:06,213 : INFO : collecting all words and their counts
2019-06-26 15:40:06,228 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-06-26 15:40:06,728 : INFO : PROGRESS: at example #10000, processed 747850 words (1499229/s), 23869 word types, 2334 tags
2019-06-26 15:40:07,240 : INFO : PROGRESS: at example #20000, processed 1480421 words (1435534/s), 33053 word types, 3864 tags
2019-06-26 15:40:07,757 : INFO : PROGRESS: at example #30000, processed 2241461 words (1472511/s), 38835 word types, 5098 tags
2019-06-26 15:40:08,269 : INFO : PROGRESS: at example #40000, processed 2982076 words (1450841/s), 42821 word types, 6096 tags
2019-06-26 15:40:08,789 : INFO : PROGRESS: at example #50000, processed 3729338 words (1439301/s), 45834 word types, 6958 tags
2019-06-26 15:40:09,312 : INFO : PROGRESS: at example #60000, processed 4501107 words (1478209/s), 47780 word types, 7722 tags
2019-06-26 15:40:09,828 : INFO : PROGRESS: at example #70000, pr

2019-06-26 15:40:57,005 : INFO : EPOCH - 3 : training on 6707530 raw words (6792083 effective words) took 16.5s, 410709 effective words/s
2019-06-26 15:40:58,009 : INFO : EPOCH 4 - PROGRESS: at 4.71% examples, 310519 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:40:59,027 : INFO : EPOCH 4 - PROGRESS: at 9.95% examples, 336670 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:41:00,041 : INFO : EPOCH 4 - PROGRESS: at 14.68% examples, 329044 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:41:01,046 : INFO : EPOCH 4 - PROGRESS: at 20.45% examples, 341099 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:41:02,074 : INFO : EPOCH 4 - PROGRESS: at 27.72% examples, 366705 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:41:03,101 : INFO : EPOCH 4 - PROGRESS: at 34.00% examples, 377049 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:41:04,123 : INFO : EPOCH 4 - PROGRESS: at 40.41% examples, 383270 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:41:05,172 : INFO : EPOCH 4 - PROGRESS: at 44.95% examples, 

2019-06-26 15:42:01,489 : INFO : EPOCH 7 - PROGRESS: at 41.92% examples, 400816 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:02,513 : INFO : EPOCH 7 - PROGRESS: at 46.81% examples, 388372 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:03,540 : INFO : EPOCH 7 - PROGRESS: at 51.78% examples, 381966 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:42:04,558 : INFO : EPOCH 7 - PROGRESS: at 56.30% examples, 375018 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:05,558 : INFO : EPOCH 7 - PROGRESS: at 62.35% examples, 378837 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:42:06,567 : INFO : EPOCH 7 - PROGRESS: at 66.82% examples, 372924 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:07,584 : INFO : EPOCH 7 - PROGRESS: at 74.48% examples, 383573 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:08,611 : INFO : EPOCH 7 - PROGRESS: at 81.21% examples, 388176 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:42:09,641 : INFO : EPOCH 7 - PROGRESS: at 86.27% examples, 384870 words/s, in_qsiz

2019-06-26 15:43:04,779 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:43:04,781 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:43:04,782 : INFO : EPOCH - 10 : training on 6707530 raw words (6792083 effective words) took 19.5s, 348800 effective words/s
2019-06-26 15:43:05,825 : INFO : EPOCH 11 - PROGRESS: at 5.04% examples, 325294 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:43:06,829 : INFO : EPOCH 11 - PROGRESS: at 9.72% examples, 322029 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:43:07,849 : INFO : EPOCH 11 - PROGRESS: at 15.10% examples, 335194 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:43:08,855 : INFO : EPOCH 11 - PROGRESS: at 20.38% examples, 343168 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:43:09,856 : INFO : EPOCH 11 - PROGRESS: at 25.44% examples, 342527 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:43:10,873 : INFO : EPOCH 11 - PROGRESS: at 31.31% examples, 349360 words/s, in_qsize 5, out_qsize

2019-06-26 15:44:07,686 : INFO : EPOCH 14 - PROGRESS: at 24.58% examples, 321662 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:44:08,704 : INFO : EPOCH 14 - PROGRESS: at 30.00% examples, 327063 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:44:09,729 : INFO : EPOCH 14 - PROGRESS: at 35.19% examples, 330346 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:44:10,729 : INFO : EPOCH 14 - PROGRESS: at 40.54% examples, 332585 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:44:11,774 : INFO : EPOCH 14 - PROGRESS: at 45.53% examples, 331875 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:44:12,786 : INFO : EPOCH 14 - PROGRESS: at 50.48% examples, 333107 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:44:13,789 : INFO : EPOCH 14 - PROGRESS: at 56.62% examples, 341579 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:44:14,816 : INFO : EPOCH 14 - PROGRESS: at 61.75% examples, 341374 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:44:15,830 : INFO : EPOCH 14 - PROGRESS: at 66.57% examples, 340892 words/s

2019-06-26 15:45:12,005 : INFO : EPOCH 17 - PROGRESS: at 62.09% examples, 348277 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:45:13,037 : INFO : EPOCH 17 - PROGRESS: at 67.42% examples, 349097 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:45:14,042 : INFO : EPOCH 17 - PROGRESS: at 72.52% examples, 347630 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:45:15,066 : INFO : EPOCH 17 - PROGRESS: at 77.95% examples, 347879 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:45:16,077 : INFO : EPOCH 17 - PROGRESS: at 83.16% examples, 347130 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:45:17,097 : INFO : EPOCH 17 - PROGRESS: at 88.58% examples, 349124 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:45:18,103 : INFO : EPOCH 17 - PROGRESS: at 94.24% examples, 351151 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:45:18,937 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 15:45:18,954 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:45:18,

2019-06-26 15:46:13,682 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:46:13,683 : INFO : EPOCH - 20 : training on 6707530 raw words (6792083 effective words) took 18.4s, 368446 effective words/s
2019-06-26 15:46:14,686 : INFO : EPOCH 21 - PROGRESS: at 5.55% examples, 398039 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:46:15,708 : INFO : EPOCH 21 - PROGRESS: at 11.10% examples, 393902 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:46:16,710 : INFO : EPOCH 21 - PROGRESS: at 17.54% examples, 411638 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:46:17,745 : INFO : EPOCH 21 - PROGRESS: at 23.13% examples, 395184 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:46:18,756 : INFO : EPOCH 21 - PROGRESS: at 28.23% examples, 379186 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:46:19,761 : INFO : EPOCH 21 - PROGRESS: at 34.26% examples, 382222 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:46:20,789 : INFO : EPOCH 21 - PROGRESS: at 40.04% examples, 382927 words/

2019-06-26 15:47:16,736 : INFO : EPOCH 24 - PROGRESS: at 52.97% examples, 393531 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:17,756 : INFO : EPOCH 24 - PROGRESS: at 59.23% examples, 395312 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:18,768 : INFO : EPOCH 24 - PROGRESS: at 63.97% examples, 389082 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:19,775 : INFO : EPOCH 24 - PROGRESS: at 69.83% examples, 388871 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:20,797 : INFO : EPOCH 24 - PROGRESS: at 75.99% examples, 390544 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:21,801 : INFO : EPOCH 24 - PROGRESS: at 81.20% examples, 386857 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:22,845 : INFO : EPOCH 24 - PROGRESS: at 87.24% examples, 387229 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:23,862 : INFO : EPOCH 24 - PROGRESS: at 92.75% examples, 386388 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:47:24,875 : INFO : EPOCH 24 - PROGRESS: at 97.52% examples, 382813 words/s

2019-06-26 15:48:20,070 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:48:20,086 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:48:20,087 : INFO : EPOCH - 27 : training on 6707530 raw words (6792083 effective words) took 18.2s, 372402 effective words/s
2019-06-26 15:48:21,112 : INFO : EPOCH 28 - PROGRESS: at 5.02% examples, 332303 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:48:22,134 : INFO : EPOCH 28 - PROGRESS: at 9.92% examples, 332069 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:48:23,149 : INFO : EPOCH 28 - PROGRESS: at 16.35% examples, 362252 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:48:24,158 : INFO : EPOCH 28 - PROGRESS: at 21.41% examples, 360660 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:48:25,166 : INFO : EPOCH 28 - PROGRESS: at 27.16% examples, 365647 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:48:26,181 : INFO : EPOCH 28 - PROGRESS: at 32.16% examples, 360349 words/s, in_qsize 5, out_qsize

Time for [doc2vec dbow]: 0:09:09.510699


2019-06-26 15:49:16,332 : INFO : PROGRESS: at example #10000, processed 750805 words (1281682/s), 23121 word types, 2327 tags
2019-06-26 15:49:16,915 : INFO : PROGRESS: at example #20000, processed 1505007 words (1296575/s), 32711 word types, 3860 tags
2019-06-26 15:49:17,507 : INFO : PROGRESS: at example #30000, processed 2246842 words (1256074/s), 38845 word types, 5112 tags
2019-06-26 15:49:18,097 : INFO : PROGRESS: at example #40000, processed 2992234 words (1265215/s), 43071 word types, 6158 tags
2019-06-26 15:49:18,689 : INFO : PROGRESS: at example #50000, processed 3746641 words (1276491/s), 45750 word types, 7015 tags
2019-06-26 15:49:19,284 : INFO : PROGRESS: at example #60000, processed 4508593 words (1283865/s), 47529 word types, 7713 tags
2019-06-26 15:49:19,879 : INFO : PROGRESS: at example #70000, processed 5271518 words (1283565/s), 48798 word types, 8325 tags
2019-06-26 15:49:20,466 : INFO : PROGRESS: at example #80000, processed 6016632 words (1270783/s), 49725 word ty

2019-06-26 15:50:13,661 : INFO : EPOCH 3 - PROGRESS: at 49.21% examples, 295209 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:50:14,679 : INFO : EPOCH 3 - PROGRESS: at 53.58% examples, 291889 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:50:15,734 : INFO : EPOCH 3 - PROGRESS: at 57.94% examples, 288195 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:50:16,739 : INFO : EPOCH 3 - PROGRESS: at 62.90% examples, 287408 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:50:17,743 : INFO : EPOCH 3 - PROGRESS: at 69.57% examples, 293716 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:50:18,752 : INFO : EPOCH 3 - PROGRESS: at 75.64% examples, 297818 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:50:19,768 : INFO : EPOCH 3 - PROGRESS: at 80.58% examples, 295919 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:50:20,784 : INFO : EPOCH 3 - PROGRESS: at 86.14% examples, 297212 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:50:21,798 : INFO : EPOCH 3 - PROGRESS: at 90.12% examples, 294649 words/s, in_qsiz

2019-06-26 15:51:17,672 : INFO : EPOCH 6 - PROGRESS: at 52.55% examples, 255149 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:51:18,678 : INFO : EPOCH 6 - PROGRESS: at 57.30% examples, 257535 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:51:19,719 : INFO : EPOCH 6 - PROGRESS: at 61.25% examples, 255424 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:51:20,734 : INFO : EPOCH 6 - PROGRESS: at 65.88% examples, 256249 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:51:21,750 : INFO : EPOCH 6 - PROGRESS: at 71.49% examples, 261225 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:51:22,751 : INFO : EPOCH 6 - PROGRESS: at 77.86% examples, 267868 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:51:23,763 : INFO : EPOCH 6 - PROGRESS: at 82.09% examples, 266945 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:51:24,782 : INFO : EPOCH 6 - PROGRESS: at 86.55% examples, 266431 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:51:25,805 : INFO : EPOCH 6 - PROGRESS: at 92.43% examples, 270697 words/s, in_qsiz

2019-06-26 15:52:22,037 : INFO : EPOCH 9 - PROGRESS: at 64.97% examples, 273685 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:23,048 : INFO : EPOCH 9 - PROGRESS: at 69.71% examples, 273852 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:24,054 : INFO : EPOCH 9 - PROGRESS: at 74.43% examples, 274675 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:52:25,093 : INFO : EPOCH 9 - PROGRESS: at 79.69% examples, 276415 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:26,111 : INFO : EPOCH 9 - PROGRESS: at 86.03% examples, 281647 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:27,116 : INFO : EPOCH 9 - PROGRESS: at 90.28% examples, 280210 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:28,129 : INFO : EPOCH 9 - PROGRESS: at 94.75% examples, 278356 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:52:29,136 : INFO : EPOCH 9 - PROGRESS: at 99.44% examples, 278354 words/s, in_qsize 4, out_qsize 0
2019-06-26 15:52:29,152 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-

2019-06-26 15:53:26,297 : INFO : EPOCH 12 - PROGRESS: at 68.34% examples, 311450 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:53:27,334 : INFO : EPOCH 12 - PROGRESS: at 72.29% examples, 304833 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:53:28,355 : INFO : EPOCH 12 - PROGRESS: at 77.25% examples, 304547 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:53:29,394 : INFO : EPOCH 12 - PROGRESS: at 82.93% examples, 305122 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:53:30,427 : INFO : EPOCH 12 - PROGRESS: at 87.66% examples, 302657 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:53:31,432 : INFO : EPOCH 12 - PROGRESS: at 93.22% examples, 304220 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:53:32,449 : INFO : EPOCH 12 - PROGRESS: at 97.92% examples, 302725 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:53:32,689 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 15:53:32,702 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:53:32,

2019-06-26 15:54:31,363 : INFO : EPOCH 15 - PROGRESS: at 73.18% examples, 284589 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:54:32,388 : INFO : EPOCH 15 - PROGRESS: at 77.60% examples, 282884 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:54:33,418 : INFO : EPOCH 15 - PROGRESS: at 82.25% examples, 281850 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:54:34,420 : INFO : EPOCH 15 - PROGRESS: at 87.44% examples, 283270 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:54:35,421 : INFO : EPOCH 15 - PROGRESS: at 92.21% examples, 283533 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:54:36,450 : INFO : EPOCH 15 - PROGRESS: at 96.92% examples, 283414 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:54:36,897 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 15:54:36,908 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:54:36,922 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:54:36,923 : INFO : EPOCH - 15

2019-06-26 15:55:35,271 : INFO : EPOCH 18 - PROGRESS: at 85.88% examples, 337898 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:55:36,277 : INFO : EPOCH 18 - PROGRESS: at 90.97% examples, 334856 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:55:37,278 : INFO : EPOCH 18 - PROGRESS: at 95.82% examples, 331653 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:55:37,977 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-26 15:55:37,985 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-26 15:55:37,998 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:55:37,999 : INFO : EPOCH - 18 : training on 6707530 raw words (5969451 effective words) took 18.0s, 332050 effective words/s
2019-06-26 15:55:39,014 : INFO : EPOCH 19 - PROGRESS: at 5.07% examples, 303711 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:55:40,021 : INFO : EPOCH 19 - PROGRESS: at 9.91% examples, 286957 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:55:41,

2019-06-26 15:56:36,868 : INFO : EPOCH 22 - PROGRESS: at 19.73% examples, 282227 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:56:37,901 : INFO : EPOCH 22 - PROGRESS: at 26.44% examples, 300710 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:56:38,918 : INFO : EPOCH 22 - PROGRESS: at 30.97% examples, 295011 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:56:39,943 : INFO : EPOCH 22 - PROGRESS: at 35.86% examples, 293238 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:56:40,951 : INFO : EPOCH 22 - PROGRESS: at 40.48% examples, 292326 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:56:41,951 : INFO : EPOCH 22 - PROGRESS: at 45.53% examples, 293645 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:56:42,959 : INFO : EPOCH 22 - PROGRESS: at 53.24% examples, 310279 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:56:43,977 : INFO : EPOCH 22 - PROGRESS: at 61.27% examples, 324525 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:56:44,991 : INFO : EPOCH 22 - PROGRESS: at 68.96% examples, 336301 words/s

2019-06-26 15:57:39,327 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-26 15:57:39,327 : INFO : EPOCH - 25 : training on 6707530 raw words (5970182 effective words) took 17.1s, 348350 effective words/s
2019-06-26 15:57:40,352 : INFO : EPOCH 26 - PROGRESS: at 4.58% examples, 258088 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:57:41,363 : INFO : EPOCH 26 - PROGRESS: at 9.04% examples, 262946 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:57:42,375 : INFO : EPOCH 26 - PROGRESS: at 13.57% examples, 265231 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:57:43,385 : INFO : EPOCH 26 - PROGRESS: at 17.95% examples, 266320 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:57:44,411 : INFO : EPOCH 26 - PROGRESS: at 23.17% examples, 273096 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:57:45,423 : INFO : EPOCH 26 - PROGRESS: at 28.14% examples, 276702 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:57:46,432 : INFO : EPOCH 26 - PROGRESS: at 35.55% examples, 301544 words/s

2019-06-26 15:58:43,235 : INFO : EPOCH 29 - PROGRESS: at 37.50% examples, 366945 words/s, in_qsize 6, out_qsize 0
2019-06-26 15:58:44,235 : INFO : EPOCH 29 - PROGRESS: at 44.48% examples, 374410 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:45,272 : INFO : EPOCH 29 - PROGRESS: at 51.56% examples, 379515 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:46,302 : INFO : EPOCH 29 - PROGRESS: at 56.89% examples, 373071 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:47,336 : INFO : EPOCH 29 - PROGRESS: at 61.99% examples, 364518 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:48,343 : INFO : EPOCH 29 - PROGRESS: at 69.35% examples, 372317 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:49,348 : INFO : EPOCH 29 - PROGRESS: at 77.08% examples, 379151 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:50,349 : INFO : EPOCH 29 - PROGRESS: at 84.65% examples, 384276 words/s, in_qsize 5, out_qsize 0
2019-06-26 15:58:51,382 : INFO : EPOCH 29 - PROGRESS: at 91.21% examples, 382879 words/s

Time for [doc2vec dmm]: 0:09:58.574804
Time for [3 - doc2vec model]: 0:19:08.120081


In [72]:
# 4. vectorize arguments
with Timer("4 - vectorize arguments"):
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dbow)
    # X_train, X_dev = make_vectors(X_train, X_dev, model_dmm)
    X_train, X_dev = make_vectors(X_train, X_dev, model_concat)

100%|██████████| 44732/44732 [09:53<00:00, 75.39it/s] 
100%|██████████| 19171/19171 [04:09<00:00, 76.78it/s] 

Time for [4 - vectorize arguments]: 0:14:03.043704





In [73]:
# 5. combine two argument vectors into a single one
# - diff / concat / ...
with Timer("5 - vector comparison of arguments"):
    X_train_diff, X_dev_diff = make_vector_comparison(X_train, X_dev, mode="concat")

X_train_ = X_train_diff
X_dev_ = X_dev_diff

100%|██████████| 44732/44732 [00:03<00:00, 13138.50it/s]
100%|██████████| 19171/19171 [00:01<00:00, 12915.73it/s]


Time for [5 - vector comparison of arguments]: 0:00:05.090890


In [74]:
# 6. train
with Timer("6 - SVM (train -> predict)"):
    y_pred_svm = train_test_svm(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_svm, name="SVM", heatmap=False))

Time for [StandardScaler fit]: 0:00:00.747875
Time for [StandardScaler transform]: 0:00:00.295211


  y = column_or_1d(y, warn=True)


Time for [SVC (linear) fit]: 0:04:22.565508
Time for [SVC predict]: 0:00:00.049544
Time for [6 - SVM (train -> predict)]: 0:04:23.723510
Confusion Matrix:
[[3011 5822]
 [2589 7749]]

Accuracy:  0.56 

Report for [SVM]:
              precision    recall  f1-score   support

       False       0.54      0.34      0.42      8833
        True       0.57      0.75      0.65     10338

    accuracy                           0.56     19171
   macro avg       0.55      0.55      0.53     19171
weighted avg       0.56      0.56      0.54     19171

{'macro': 0.53, 'micro': 0.56}
Time for [7 - report]: 0:00:00.609881


In [75]:
# 6. train
with Timer("6 - LogReg (train -> predict)"):
    y_pred_logreg = train_test_logreg(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_logreg, name="LogisticRegression", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [LogisticRegression fit]: 0:01:02.070384
Time for [LogisticRegression predict]: 0:00:00.049328
Time for [6 - LogReg (train -> predict)]: 0:01:02.166525
Confusion Matrix:
[[4568 4265]
 [3819 6519]]

Accuracy:  0.58 

Report for [LogisticRegression]:
              precision    recall  f1-score   support

       False       0.54      0.52      0.53      8833
        True       0.60      0.63      0.62     10338

    accuracy                           0.58     19171
   macro avg       0.57      0.57      0.57     19171
weighted avg       0.58      0.58      0.58     19171

{'macro': 0.57, 'micro': 0.58}
Time for [7 - report]: 0:00:00.621420


In [76]:
# 6. train
with Timer("6 - SGDClassifier (train -> predict)"):
    y_pred_sgdcla = train_test_sgd(X_train_, y_train, X_dev_)

# 7. Evaluate
with Timer("7 - report"):
    print(report_training_results(y_dev, y_pred_sgdcla, name="SGDClassifier", heatmap=False))

  y = column_or_1d(y, warn=True)


Time for [SGDClassifier fit]: 0:00:41.005865
Time for [SGDClassifier predict]: 0:00:00.098403
Time for [6 - SGDClassifier (train -> predict)]: 0:00:41.104528
Confusion Matrix:
[[4920 3913    0]
 [4009 6328    1]
 [   0    0    0]]

Accuracy:  0.59 

Report for [SGDClassifier]:


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

       False       0.55      0.56      0.55      8833
        True       0.62      0.61      0.61     10338
gay marriage       0.00      0.00      0.00         0

    accuracy                           0.59     19171
   macro avg       0.39      0.39      0.39     19171
weighted avg       0.59      0.59      0.59     19171

{'macro': 0.39, 'micro': 0.59}
Time for [7 - report]: 0:00:00.360067


In [None]:
# old
return

asdf

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)