# RATIO 2019 - Benchmarking Workshop

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import csv

In [2]:
import datetime
import time


class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Clasiification



In [3]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [4]:
# escapechar to detect quoting escapes, else it fails

# na_filter=False, because pandas automatic "nan" detection fails with the topic column, too
# cross_test_df['topic'].astype(str)[9270]

with Timer("read cross"):
    # cross_traindev_df = pd.read_csv(data_cross_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    # cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id', escapechar='\\', na_filter=False)
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'), quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False,index_col='id')
    # cross_test_df =  pd.read_csv(data_cross_path.format('test'), quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False,index_col='id')
    cross_test_df =  pd.read_csv(data_cross_path.format('test'), index_col='id')

with Timer("read within"):
    # within_traindev_df = pd.read_csv(data_within_path.format('training'), index_col='id', escapechar='\\', na_filter=False)
    # within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id', escapechar='\\', na_filter=False)
    within_traindev_df =  pd.read_csv(data_within_path.format('training'),quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False,index_col='id')
    # within_test_df =  pd.read_csv(data_within_path.format('test'),  quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False,index_col='id')
    within_test_df =  pd.read_csv(data_within_path.format('test'), index_col='id')


Time for [read cross]: 0:00:01.730243
Time for [read within]: 0:00:01.484252


In [5]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    title = row['topic'].lower().strip()
    if title.find('abortion') > -1:
        row['tag'] = 'abortion'
    elif title.find('gay marriage') > -1:
        row['tag'] = 'gay marriage'
    else:
        row['tag'] = 'NA'
    return row


with Timer("tag cross"):
    cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
    cross_test_df = cross_test_df.apply(add_tag, axis=1)

with Timer("tag within"):
    within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
    within_test_df = within_test_df.apply(add_tag, axis=1)

Time for [tag cross]: 0:00:46.870798
Time for [tag within]: 0:00:44.794555


In [6]:
within_traindev_df['tag'].unique()

array(['gay marriage', 'abortion'], dtype=object)

In [7]:
within_traindev_df.info()
cross_traindev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63903 entries, 85249 to 3298
Data columns (total 8 columns):
argument1       63903 non-null object
argument1_id    63903 non-null object
argument2       63903 non-null object
argument2_id    63903 non-null object
debate_id       63903 non-null object
is_same_side    63903 non-null bool
topic           63903 non-null object
tag             63903 non-null object
dtypes: bool(1), object(7)
memory usage: 4.0+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61048 entries, 0 to 61047
Data columns (total 8 columns):
argument1       61048 non-null object
argument1_id    61048 non-null object
argument2       61048 non-null object
argument2_id    61048 non-null object
debate_id       61048 non-null object
is_same_side    61048 non-null bool
topic           61048 non-null object
tag             61048 non-null object
dtypes: bool(1), object(7)
memory usage: 3.8+ MB


In [8]:
np.unique(within_traindev_df[['is_same_side']])

array([False,  True])

In [9]:
np.unique(cross_traindev_df[['is_same_side']])

array([False,  True])

In [None]:
#74517
# within_traindev_df.loc[82134]['argument2']
#within_traindev_df.loc[74517]['argument2']

In [None]:
# within_traindev_df[(within_traindev_df['tag'] == 'gay marriage') & (within_traindev_df['tag'] == 'gay marriage')]
within_traindev_df[(within_traindev_df['tag'] == 'gay marriage')]

In [None]:
#within_traindev_df[(within_traindev_df['tag'] == 'gay marriage') and (within_traindev_df['is_same_side'] == 'True')]

### Get an overview about each dataset

In [None]:
def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [None]:
with Timer("overview cross"):
    get_overview(cross_traindev_df)

In [None]:
with Timer("overview within"):
    get_overview(within_traindev_df)

## Train model - Baseline

### train dev set - 70% 30%

In [10]:
from sklearn.model_selection import train_test_split
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


def get_train_test_sets(df):
    X = df[['argument1', 'argument2', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=1,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

[nltk_data] Downloading package wordnet to /home/ekoerner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ekoerner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### lemmatizing

In [11]:
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize


def get_wordnet_pos(treebank_tag):
    """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN


def lemmatize_stemming(token, pos_tag):
    stemmer = SnowballStemmer(
        "english")  # pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))


def preprocess(text):
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)

        for idx in range(0, len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                    token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return ' '.join(lemma)

In [12]:
def get_lemma(row):
    row['argument1_lemmas'] = preprocess(row['argument1'])
    row['argument2_lemmas'] = preprocess(row['argument2'])
    return row

### Extracting n grams lemma for argument1 and argument2

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def extract_ngrams(X_train, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=600,
                                 max_df=0.7,
                                 ngram_range=(3, 3),
                                 max_features=5000)

    vectorizer.fit(X_train[col])
    features = vectorizer.transform(X_train[col])
    features_dev = vectorizer.transform(X_dev[col])

    train_df = pd.DataFrame(features.todense(),
                            columns=vectorizer.get_feature_names())
    train_df = train_df.add_prefix(col)

    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df,
                              left_index=True,
                              right_index=True,
                              suffixes=(False, False),
                              how='inner')
    train_df.set_index(idx, inplace=True)

    dev_df = pd.DataFrame(features_dev.todense(),
                          columns=vectorizer.get_feature_names())
    dev_df = dev_df.add_prefix(col)

    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df,
                          left_index=True,
                          right_index=True,
                          suffixes=(False, False),
                          how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, dev_df


def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    X_train = X_train.reset_index()
    result_train_df = X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)

    X_dev = X_dev.reset_index()
    result_dev_df = X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)

    for col in columns:
        result_train_df_, result_dev_df_ = extract_ngrams(X_train, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_dev_df

### Train model and evaluate

In [21]:
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def train_test_svm(X_train, y_train, X_test):
    scaler = StandardScaler(copy=True, with_mean=False)
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)

    svclassifier = SVC(kernel='linear')
    svclassifier.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    y_pred = svclassifier.predict(X_test)

    return y_pred


def report_training_results(y_test, y_pred, digits=3):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), digits))  #
    print()

    print('Report:')
    print(classification_report(y_test, y_pred, digits=digits))
    f1_dic = {}

    f1_dic['macro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='macro'), digits)
    f1_dic['micro'] = round(
        f1_score(y_pred=y_pred, y_true=y_test, average='micro'), digits)
    return f1_dic

### Cross topic - Training and evaluating model 

In [22]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)

Time for [1 - test/train]: 0:00:00.016843
Time for [2 - lemmatize]: 0:24:58.456637
Time for [3 - n-grams]: 0:00:20.130679


  y = column_or_1d(y, warn=True)


Time for [4 - SVM (train -> predict)]: 0:01:21.778531
Confusion Matrix:
[[ 363 8573]
 [ 300 9079]]

Accuracy:  0.516

Report:
              precision    recall  f1-score   support

       False      0.548     0.041     0.076      8936
        True      0.514     0.968     0.672      9379

    accuracy                          0.516     18315
   macro avg      0.531     0.504     0.374     18315
weighted avg      0.531     0.516     0.381     18315

Time for [5 - report]: 0:00:00.034135


### Within topic - Training and evaluating model 

In [23]:
# 1. Getting train and dev data
with Timer("1 - test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)

# 2. Lemmatizing argument1 and argument2
with Timer("2 - lemmatize"):
    X_train = X_train.apply(get_lemma, axis=1)
    X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - 1-3 grams lemma
with Timer("3 - n-grams"):
    X_train_, X_dev_ = extract_n_grams_features(
        X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. train
with Timer("4 - SVM (train -> predict)"):
    y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate
with Timer("5 - report"):
    report_training_results(y_dev, y_pred)

Time for [1 - test/train]: 0:00:00.024590
Time for [2 - lemmatize]: 0:19:43.205894
Time for [3 - n-grams]: 0:00:22.950521


  y = column_or_1d(y, warn=True)


Time for [4 - SVM (train -> predict)]: 0:01:00.917064
Confusion Matrix:
[[  181  8652]
 [  149 10189]]

Accuracy:  0.541

Report:
              precision    recall  f1-score   support

       False      0.548     0.020     0.040      8833
        True      0.541     0.986     0.698     10338

    accuracy                          0.541     19171
   macro avg      0.545     0.503     0.369     19171
weighted avg      0.544     0.541     0.395     19171

Time for [5 - report]: 0:00:00.034107
