# Statistical Significance Testing: Paired Bootstrap Test

* **Sample Size:** 20% of DevTest data
* **Number Samples:** 10,000

Compare the performance of two models and determine whether one is significantly better than the other.

***

Load libraries:

In [1]:
# For path variables
import config, utils

# For data analysis
import pandas as pd
import numpy as np
import os, re

# For creating directories
from pathlib import Path

# For fastText embeddings
from gensim.models import FastText
from gensim import utils as gensim_utils

# For classification
import sklearn.metrics as metrics
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# For statistical significance testing
from scipy.stats import ttest_ind

#### Setup

Define the paths to the model input data:

In [2]:
train_data_path = config.tokc_path+"model_input/token_train.csv"
train_data = utils.preprocess(train_data_path)
dev_data_path = config.tokc_path+"model_input/token_validate.csv"
dev_data_full = utils.preprocess(dev_data_path)

Set the dimensionality of word embeddings for the models being compared:

In [3]:
dimensions = ["50", "100", "200", "300"]
d = dimensions[0]

Set the fraction of samples to include in each classifier training instance, and the number of classifier instances to train:

In [4]:
frac_samples = 0.2
# n_samples = 100
n_classifiers = 10000

From the training and devtest data, extract features and binarize targets:

In [5]:
mlb = MultiLabelBinarizer()
target_col = "tag"
feature_cols = ["token_id", "token"]

In [6]:
# ------------------------
# Load data
# ------------------------
def zipTokensFeatures(loaded_data):
    token_data = list(zip(loaded_data[feature_cols[0]], loaded_data[feature_cols[1]]))
    return token_data


# ------------------------
# Extract GloVe features
# ------------------------
glove = utils.getGloveEmbeddings(d)
def extractGloveEmbedding(token, embedding_dict=glove, dimensions=int(d)):
    if token.isalpha():
        token = token.lower()
    try:
        embedding = embedding_dict[token]
    except KeyError:
        embedding = np.zeros((dimensions,))
    return embedding.reshape(-1,1)

def makeGloveFeatureMatrix(token_data, dimensions=int(d)):    
    feature_list = [extractGloveEmbedding(token) for token_id,token in token_data]
    return np.array(feature_list).reshape(-1,dimensions)


# ------------------------
# Extract fastText features
# ------------------------
file_name = config.fasttext_path+"fasttext{}_lowercased.model".format(d)
embedding_model = FastText.load(file_name)
def extractFastTextEmbedding(token, fasttext_model=embedding_model):
    if token.isalpha():
        token = token.lower()
    embedding = fasttext_model.wv[token]
    return embedding

def makeFastTextFeatureMatrix(token_data):
    feature_list = [extractFastTextEmbedding(token) for token_id,token in token_data]
    return np.array(feature_list)


# ------------------------
# Binarize targets
# ------------------------
def binarizeTrainTargets(train_data):
    y_train_labels = train_data[target_col]
    y_train = mlb.fit_transform(y_train_labels)
    return mlb, y_train

def binarizeDevTargets(mlb, dev_data):
    y_dev_labels = dev_data[target_col]
    y_dev = mlb.transform(y_dev_labels)
    return y_dev

#### Train

Train two classification models, one with GloVe word embeddings as features and one with custom fastText word embeddings as features.

In [None]:
# Load and preprocess training data
train_tokens = zipTokensFeatures(train_data)

# Get GloVe features
X_train_glove = makeGloveFeatureMatrix(train_tokens)

# Get custom fastText features
X_train_ft = makeFastTextFeatureMatrix(train_tokens)

# Get targets
mlb, y_train = binarizeTrainTargets(train_data)

# Train a model with GloVe embeddings as features
clf_glove = ClassifierChain(classifier = RandomForestClassifier(random_state=22))
clf_glove.fit(X_train_glove, y_train)

# Train a model with custom fastText embeddings as features
clf_ft = ClassifierChain(classifier = RandomForestClassifier(random_state=22))
clf_ft.fit(X_train_ft, y_train)

#### Predict and Evaluate

Test the two classifiers on random samples of equal size from the devtest data, and calculate the macro (average across all labels) precision, recall, and F1 scores for the model's performance for each sample.

In [None]:
glove_f1_scores, glove_precision_scores, glove_recall_scores = [], [], []
ft_f1_scores, ft_precision_scores, ft_recall_scores = [], [], []
counter = 0
start = time.time()
for n in range(n_classifiers):
    # Load and preprocess a sample of devtest data
    dev_data = dev_data_full.sample(frac=frac_samples, replace=True)
    dev_tokens = zipTokensFeatures(dev_data)
    
    # Extract GloVe and custom fastText features for the devtest data sample
    X_dev_glove = makeGloveFeatureMatrix(dev_tokens)
    X_dev_ft = makeFastTextFeatureMatrix(dev_tokens)
    
    # Get targets
    y_dev = binarizeDevTargets(mlb, dev_data)
    
    # Predict and evaluate the model with GloVe embeddings as features
    predictions_glove = clf_glove.predict(X_dev_glove)
    glove_precision_scores += [metrics.precision_score(y_dev, predictions_glove, average="macro", zero_division=0)]
    glove_recall_scores += [metrics.recall_score(y_dev, predictions_glove, average="macro", zero_division=0)]
    glove_f1_scores += [metrics.f1_score(y_dev, predictions_glove, average="macro", zero_division=0)]
    
    # Predict and evaluate the model with custom fastText embeddings as features
    predictions_ft = clf_ft.predict(X_dev_ft)
    ft_precision_scores += [metrics.precision_score(y_dev, predictions_ft, average="macro", zero_division=0)]
    ft_recall_scores += [metrics.recall_score(y_dev, predictions_ft, average="macro", zero_division=0)]
    ft_f1_scores += [metrics.f1_score(y_dev, predictions_ft, average="macro", zero_division=0)]
    
    counter += 1
    
    if (counter%10 == 0):
        print(counter, "iterations")
    
assert len(glove_f1_scores) == n_classifiers
assert len(ft_f1_scores) == n_classifiers
print(time.time() - start)

#### Calculate Statistical Significance

* Alpha = 0.5 (5%)

* Null Hypothesis: The performance of a model with pre-trained GloVe embeddings is not significantly different (better or worse) than a model trained on fastText embeddings custom-trained on the model's corpus.

* Alternative Hypothesis: The performance of a model with pre-trained GloVe embeddings is significantly different than a model trained on fastText embeddings custom-trained on the model's corpus.

In [None]:
ttest_precision = ttest_ind(ft_precision_scores, glove_precision_scores)
print(ttest_precision)

In [None]:
ttest_recall = ttest_ind(ft_recall_scores, glove_recall_scores)
print(ttest_recall)

In [None]:
ttest_f1 = ttest_ind(ft_f1_scores, glove_f1_scores)
print(ttest_f1)

Does order of input arrays matter?

In [None]:
ttest_precision2 = ttest_ind(glove_precision_scores, ft_precision_scores)
print(ttest_precision2)

In [None]:
ttest_recall2 = ttest_ind(glove_recall_scores, ft_recall_scores)
print(ttest_recall2)

In [None]:
ttest_f12 = ttest_ind(glove_f1_scores, ft_f1_scores)
print(ttest_f12)

***

#### Steps to the Paired Bootstrap Test

**Step 1:** Select random sample sets of equal size, sampling with replacement (the same row can selected repeatedly in a sample).

**Step 2:** On each sample set, run the model and calculate the precision, recall, and F1 score.

**Step 3:** Compute the difference between each model's precision, recall, and F1 score for each random sample set.

**Step 4:** Zero-center the data, subtracting the performance difference observed in the models overall from the sampled model performance scores.

**Step 5:** Calculate the *p*-value, which is the proportion of samples that the model with GloVe embeddings performed better than the model with fastText embeddings???

In [70]:
print(glove_preds.shape, ft_preds.shape)

(165954, 7) (165740, 7)


In [79]:
def getSampleScores(df, sample_size, n_samples, model, agmt_col):
    sample_list, prec_list, rec_list, f1_list = [], [], [], []
    for i in range(n_samples):
        # Get a random selection of sample_size rows from df
        sample = df.sample(n=sample_size)
        # Calculate precision, recall, and f1 scores for that sample
        agmts = list(sample[agmt_col])
        precision, recall, f1 = utils.precisionRecallF1(
            agmts.count("true positive"), agmts.count("false positive"), agmts.count("false negative")
        )
        sample_list += [model+" "+str(i)]
        prec_list += [precision]
        rec_list += [recall]
        f1_list += [f1]
        
    # Create a DataFrame (table) of the scores
    return pd.DataFrame({"sample":sample_list, "precision":prec_list, "recall": rec_list, "f_1":f1_list})

In [94]:
sample_size = 100000
agmt_col = "_merge"
n_samples = 10000

In [95]:
model = "glove"
glove_samples_scores = getSampleScores(glove_preds, sample_size, n_samples, model, agmt_col)
# glove_samples_scores

In [96]:
model = "fasttext"
ft_samples_scores = getSampleScores(ft_preds, sample_size, n_samples, model, agmt_col)
# ft_samples_scores

In [107]:
ttest_result = ttest_ind(ft_samples_scores["f_1"], glove_samples_scores["f_1"])
ttest_result

Ttest_indResult(statistic=655.4455914909287, pvalue=0.0)

In [106]:
wilcoxon_result = wilcoxon(ft_samples_scores["f_1"], glove_samples_scores["f_1"])
wilcoxon_result

WilcoxonResult(statistic=0.0, pvalue=0.0)

Subtract the fastText scores from the GloVe scores:

In [None]:
mean_glove_f1 = np.mean(glove_samples_scores["f_1"])
mean_glove_f1 = np.mean(glove_samples_scores["f_1"])

In [100]:
diff = (glove_samples_scores.drop(columns=["sample"])).subtract((ft_samples_scores.drop(columns=["sample"])))
# diff

Zero-center the data.

In [101]:
prec_diff = perf_diff.iloc[0,0]
rec_diff = perf_diff.iloc[0,1]
f1_diff = perf_diff.iloc[0,2]
overall_diff = pd.DataFrame({
    "precision":([prec_diff]*diff.shape[0]), "recall":([rec_diff]*diff.shape[0]), "f_1":([f1_diff]*diff.shape[0])
})
zero_centered = diff.subtract(overall_diff)
zero_centered.head()

Unnamed: 0,precision,recall,f_1
0,-0.010803,0.004652,0.003004
1,-0.022315,-0.004442,-0.007194
2,-0.020473,-0.009523,-0.009204
3,-0.019699,-0.003604,-0.005568
4,-0.016723,-0.004196,-0.004569


In [108]:
wilcoxon(zero_centered["f_1"])

WilcoxonResult(statistic=12852233.0, pvalue=0.0)

In [88]:
def calculatePValue(df, col_name):
    values = list(df[col_name])
    count = 0
    for value in values:
        if value > 0:
            count += 1
    return count/(len(values))

In [89]:
prec_p = calculatePValue(zero_centered, "precision")
rec_p = calculatePValue(zero_centered, "recall")
f1_p = calculatePValue(zero_centered, "f_1")
print(prec_p, rec_p, f1_p)

0.0 0.461 0.33
