### This notebook implements the ensemble classifier that identifies comparative questions. The pre-calculated results (logistic regression, transformer-based) are in the results.zip archive, which must be unzipped before running the notebook.

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the annotated questions from a tab-separated file; the empty path must be filled in first.
df = pd.read_csv("", sep="\t") # specifies the path to data_full.tsv in Webis-CompQuestions-22/comparative-questions-or-not
# NOTE(review): eval() executes whatever text the cell holds. Presumably "pos" and "lemma"
# store stringified Python lists -- only run this on a trusted copy of the dataset.
df["pos"] = df["pos"].apply(eval)
df["lemma"] = df["lemma"].apply(eval)
# Token list obtained by splitting the cleaned question on single spaces.
df["tokenized"] = df["clean"].apply(lambda x: re.split(" ", x))

#### Classification rules

In [None]:
def re_and(inp, regexs):
    """Return True only if every compiled pattern in ``regexs`` matches ``inp``.

    Each pattern is applied with ``match`` (anchored at the start of the
    string). An empty ``regexs`` yields True.
    """
    return all(pattern.match(inp) is not None for pattern in regexs)


def re_or(inp, regexs):
    """Return True if at least one compiled pattern in ``regexs`` matches ``inp``.

    Each pattern is applied with ``match`` (anchored at the start of the
    string). An empty ``regexs`` yields False.
    """
    return any(pattern.match(inp) is not None for pattern in regexs)

def re_one(inp, regex):
    """Return True if the compiled pattern ``regex`` matches ``inp`` from its start."""
    return regex.match(inp) is not None

def has_interrogative(inp):
    """Return True if the tokens of ``inp["clean"]`` include an interrogative word.

    NOTE(review): ``word_tokenize`` is not imported anywhere in the visible
    notebook (presumably ``nltk.tokenize.word_tokenize``) -- confirm it is in
    scope before calling this function.
    """
    question_words = ("who", "which", "what", "where", "when", "how", "why")
    tokens = word_tokenize(inp["clean"])
    return any(word in tokens for word in question_words)

def has_singular_interrogative(inp):
    """Return True if the tokens of ``inp["clean"]`` include who/which/what/where.

    NOTE(review): ``word_tokenize`` is not imported anywhere in the visible
    notebook (presumably ``nltk.tokenize.word_tokenize``) -- confirm it is in
    scope before calling this function.
    """
    question_words = ("who", "which", "what", "where")
    tokens = word_tokenize(inp["clean"])
    return any(word in tokens for word in question_words)

def pos_match(inp, pos_tags):
    """Return True if any element of ``inp`` is contained in ``pos_tags``."""
    return any(tag in pos_tags for tag in inp)

def find_in_seq(arr, q):
    """Locate the first contiguous occurrence of ``q`` inside ``arr``.

    Args:
        arr: sequence to search (e.g. a token list).
        q: sequence of items that must appear consecutively.

    Returns:
        The index just past the first full match (``start + len(q)``), or -1
        if ``q`` is empty or does not occur. The returned index may equal
        ``len(arr)`` when the match ends the sequence, so callers must check
        bounds before indexing with it.

    Every start position is examined, so a partial hit on an early
    occurrence of ``q[0]`` does not hide a later full match, and a query
    that would run past the end of ``arr`` yields -1 instead of raising
    IndexError.
    """
    width = len(q)
    if width == 0:
        return -1
    wanted = list(q)
    for start in range(len(arr) - width + 1):
        if list(arr[start:start + width]) == wanted:
            return start + width
    return -1

def clean_df():
    """Reset the global ``df`` so the rule cells can be re-run from scratch.

    Existing ``pred`` / ``neg_prediction`` columns are reset to False, and
    every column whose name contains "Unnamed: 0", "pos_rule" or "neg_rule"
    is dropped in place. The columns to drop are collected first and removed
    in one call, so the frame is not modified while its column index is
    being walked and a name matching several markers is only dropped once.
    """
    for flag_column in ("pred", "neg_prediction"):
        if flag_column in df.columns:
            df[flag_column] = False
    markers = ("Unnamed: 0", "pos_rule", "neg_rule")
    stale_columns = [
        col for col in df.columns if any(marker in col for marker in markers)
    ]
    df.drop(columns=stale_columns, inplace=True)



In [None]:
# No question is predicted as comparative until a positive rule fires.
df["pred"] = False
# Boolean ground truth: True exactly where the "comp" label equals 1.0.
df["truth"] = df["comp"] == 1.0

In [None]:
# Regex needed for the negative rules: the question starts with "when" or
# with "how long" / "how many" / "how much".
rex_neg_rule_0 = re.compile(r'((when|how (long|many|much))\b.*)')


def neg_rule_0(inp):
    """Return True if ``inp["clean"]`` starts with "when" or "how long/many/much".

    The pattern is applied with ``match``, so it only fires at the beginning
    of the question. No exemption for superlative POS tags is applied.
    """
    return rex_neg_rule_0.match(inp["clean"]) is not None

evil_words = ["lyrics", "wrote", "mean", "covered", "cast", "played", "season", "episode", "award", "sing", "sang", "song", "album", "movie"]

def neg_rule_1(inp):
    """Return True if any entry of ``evil_words`` occurs in ``inp["clean"]``.

    The test is ``word in text``; for a string question that is a substring
    check, so e.g. "sing" also fires on "single".
    """
    text = inp["clean"]
    return any(word in text for word in evil_words)

def neg_rule_2(inp):
    """Return True if the question has fewer than four entries in ``inp['tokenized']``."""
    return len(inp['tokenized']) < 4

In [None]:
# Regexes needed for the positive rules. All of them are applied with
# ``match``, i.e. anchored at the start of the question; patterns meant to
# fire anywhere therefore begin with ``.*``.
rex_pos_rule_2_1 = re.compile(r'(.*difference between.*)')
rex_pos_rule_2_2 = re.compile(r'(.*compare to.*)')
rex_pos_rule_3 = re.compile(r'((which) (are|is|\'s|s)\b.*\b(a|an|the)\b.*)')
rex_pos_rule_4_1 = re.compile(r'(what are( some)? good\b.*)')
rex_pos_rule_4_2 = re.compile(r'(.*\byour favorite\b.*)')
rex_pos_rule_5 = re.compile(r'(.*(who was (the|a|an).*))')
rex_pos_rule_6 = re.compile(r'((which).*(should i))')
# NOTE(review): "and" is listed twice in this alternation; harmless, left untouched.
rex_pos_rule_9_1 = re.compile(r'(.*\b(or|and|from|between|vs|and|versus)\b.*)')
rex_pos_rule_9_2 = re.compile(r'(.*\b(distinguish|differ|differentiate|differences|strengths|weaknesses)\b.*)')
rex_pos_rule_9_3 = re.compile(r'(.*(the differen(ce|t) between).*)')
rex_pos_rule_10 = re.compile(r'(.*(the (best)).*)')
rex_pos_rule_11 = re.compile(r'.*\b(is)\b.*')
rex_pos_rule_13_1 = re.compile(r'(.*are\b.*and\b.*same.*)')
rex_pos_rule_13_2 = re.compile(r'(.*same.*\bas\b.*)')
# The alternation is grouped and word-bounded so the pattern means "contains
# the word are or is". Ungrouped, `.*are|is.*` parses as "contains 'are'" OR
# "starts with 'is'".
rex_pos_rule_13_3 = re.compile(r'(.*\b(are|is)\b.*)')

def pos_rule_1(row):
    """Fire when a comparison keyword directly follows a question opener.

    Openers are two-token sequences such as "what is", "which are" or
    "is the". ``find_in_seq`` gives the index just past each opener; the
    largest of these indices is used, and the rule fires if the token at
    that index is one of a fixed keyword list ("difference", "better", ...).

    Returns False when ``row["neg_prediction"]`` is set, when no opener is
    found, or when the opener ends the question.
    """
    if row["neg_prediction"]:
        return False
    tokens = row["tokenized"]
    openers = (
        ["what", "is"], ["what", "are"], ["what", "s"], ["what", "'s"],
        ["which", "is"], ["which", "are"], ["which", "s"], ["which", "'s"],
        ["is", "the"], ["are", "the"], ["s", "the"], ["'s", "the"],
    )
    keywords = ("difference", "differences", "pros", "good", "advantages", "better", "similarities")
    try:
        after_opener = max(find_in_seq(tokens, opener) for opener in openers)
        return after_opener >= 0 and tokens[after_opener] in keywords
    except IndexError:
        # The opener sits at the very end of the question, so there is no
        # following token to inspect.
        return False

def pos_rule_2(row):
    """Fire when ``row["clean"]`` matches ``rex_pos_rule_2_1`` or ``rex_pos_rule_2_2``.

    Always returns False when ``row["neg_prediction"]`` is set.
    """
    matched = re_or(row["clean"], [rex_pos_rule_2_1, rex_pos_rule_2_2])
    return matched and not row["neg_prediction"]

def pos_rule_3(row):
    """Fire when the question matches ``rex_pos_rule_3`` and has a comparative/superlative tag.

    The tag test accepts JJS, RBS, JJR or RBR anywhere in ``row["pos"]``.
    Always returns False when ``row["neg_prediction"]`` is set.
    """
    text, tags = row["clean"], row["pos"]
    if not re_one(text, rex_pos_rule_3):
        return False
    if not pos_match(tags, ["JJS", "RBS", "JJR", "RBR"]):
        return False
    return not row["neg_prediction"]

def pos_rule_4(inp):
    """Fire when ``inp["clean"]`` matches ``rex_pos_rule_4_1`` or ``rex_pos_rule_4_2``.

    Always returns False when ``inp["neg_prediction"]`` is set.
    """
    matched = re_or(inp["clean"], [rex_pos_rule_4_1, rex_pos_rule_4_2])
    return matched and not inp["neg_prediction"]

def pos_rule_5(inp):
    """Fire on "who was the/a/an ..." questions whose fourth tag is comparative/superlative.

    The question must match ``rex_pos_rule_5`` and ``inp["pos"][3]`` must be
    JJS, RBS, RBR or JJR. Questions with fewer than four tags return False
    instead of raising IndexError. Always returns False when
    ``inp["neg_prediction"]`` is set.

    NOTE(review): the regex may match anywhere in the question, while the tag
    is always read at index 3 -- confirm this is intended.
    """
    pos = inp["pos"]
    if len(pos) <= 3:
        return False
    return (
        re_one(inp["clean"], rex_pos_rule_5)
        and pos[3] in ["JJS", "RBS", "RBR", "JJR"]
        and not inp["neg_prediction"]
    )

def pos_rule_6(inp):
    """Fire when ``inp["clean"]`` matches ``rex_pos_rule_6`` ("which ... should i").

    Always returns False when ``inp["neg_prediction"]`` is set.
    """
    return re_one(inp["clean"], rex_pos_rule_6) and not inp["neg_prediction"]

def pos_rule_7(inp):
    """Fire when the question opens with a superlative, optionally after an article.

    True if the first tag is JJS/RBS, or if the first token is "the", "a" or
    "an" and the second tag is JJS/RBS. The superlative-adjective tag is
    "JJS", as used by the other rules in this notebook. Questions too short
    to carry the inspected tags return False. Always returns False when
    ``inp["neg_prediction"]`` is set.
    """
    pos = inp["pos"]
    tokens = inp["tokenized"]
    superlative_tags = ("JJS", "RBS")

    starts_with_superlative = len(pos) > 0 and pos[0] in superlative_tags
    superlative_after_article = (
        len(pos) > 1
        and len(tokens) > 0
        and tokens[0] in ("the", "a", "an")
        and pos[1] in superlative_tags
    )
    if starts_with_superlative or superlative_after_article:
        return not inp["neg_prediction"]
    return False

def pos_rule_8(inp):
    """Fire when a superlative directly follows a question opener.

    Openers are two-token sequences such as "what is", "which are" or
    "is the". ``find_in_seq`` gives the index just past each opener; the
    largest of these indices is used. The rule fires if the tag at that
    index is RBS/JJS or the token there is "most", provided ``neg_rule_2``
    does not flag the question.

    Returns False when ``inp["neg_prediction"]`` is set, when no opener is
    found, or when the opener ends the question.
    """
    if inp["neg_prediction"]:
        return False
    pos = inp["pos"]
    tokens = inp["tokenized"]
    openers = (
        ["what", "is"], ["what", "are"], ["what", "s"], ["what", "'s"],
        ["which", "is"], ["which", "are"], ["which", "s"], ["which", "'s"],
        ["is", "the"], ["are", "the"], ["s", "the"], ["'s", "the"],
    )
    try:
        after_opener = max(find_in_seq(tokens, opener) for opener in openers)
        if after_opener < 0:
            return False
        if pos[after_opener] in ("RBS", "JJS") or tokens[after_opener] == "most":
            return not neg_rule_2(inp)
    except IndexError:
        # The opener sits at the very end of the question, so there is no
        # following tag/token to inspect.
        return False
    return False

def pos_rule_9(inp):
    """Fire on "difference"-style questions that do not contain "how".

    The rule matches when the question satisfies both ``rex_pos_rule_9_1``
    and ``rex_pos_rule_9_2``, or matches ``rex_pos_rule_9_3``. Questions
    containing the substring "how" never fire.

    The ``neg_prediction`` veto is applied to the whole disjunction, as in
    the other rules. Without the parentheses, ``and`` binds tighter than
    ``or`` and the first pattern pair would bypass the veto.
    """
    question = inp["clean"]
    if "how" in question:
        return False
    matched = (
        re_and(question, [rex_pos_rule_9_1, rex_pos_rule_9_2])
        or re_one(question, rex_pos_rule_9_3)
    )
    return matched and not inp["neg_prediction"]

def pos_rule_10(inp):
    """Fire when ``inp["clean"]`` matches ``rex_pos_rule_10`` ("the best").

    Questions containing "for the best" or "how is" never fire. Always
    returns False when ``inp["neg_prediction"]`` is set.
    """
    question = inp["clean"]
    if ('for the best' in question) or ('how is' in question):
        return False
    return re_one(question, rex_pos_rule_10) and not inp["neg_prediction"]

def pos_rule_11(inp):
    """Fire when the question has a who/which/what/where word and a JJS/RBS tag.

    Always returns False when ``inp["neg_prediction"]`` is set.
    """
    return (
        has_singular_interrogative(inp)
        and pos_match(inp["pos"], ["JJS", "RBS"])
        and not inp["neg_prediction"]
    )

def pos_rule_12(inp):
    """Fire when a JJS/RBS tag is preceded somewhere by the token "the".

    The first superlative tag that has "the" among the tokens before it
    decides the result: True unless ``inp["neg_prediction"]`` is set.
    Returns False if no such tag exists.
    """
    pos = inp["pos"]
    tokenized = inp["tokenized"]
    for index, tag in enumerate(pos):
        if tag in ("JJS", "RBS") and "the" in tokenized[:index]:
            return not inp["neg_prediction"]
    return False

def pos_rule_13(inp):
    """Fire on "same"-style questions.

    The rule matches when the question matches ``rex_pos_rule_13_1``, or
    matches both ``rex_pos_rule_13_2`` and ``rex_pos_rule_13_3``. Always
    returns False when ``inp["neg_prediction"]`` is set.
    """
    question = inp["clean"]
    matched = (
        re_one(question, rex_pos_rule_13_1)
        or re_and(question, [rex_pos_rule_13_2, rex_pos_rule_13_3])
    )
    return matched and not inp["neg_prediction"]

In [None]:
clean_df()
# NEGATIVE RULES ################################################################
# A True value in a neg_rule_* column means that negative rule fired for the
# question, i.e. the rule considers it not comparative.
neg_rules = ["neg_rule_0", "neg_rule_1"]
df["neg_rule_0"] = df.apply(neg_rule_0, axis=1)
df["neg_rule_1"] = df.apply(neg_rule_1, axis=1)

def make_prediction(old_prediction, prediction_rule):
    """Combine a running prediction with a new rule result (logical OR).

    Returns ``old_prediction`` if it is truthy, otherwise ``prediction_rule``.
    """
    if old_prediction:
        return old_prediction
    return prediction_rule

# A question is vetoed as soon as any negative rule fired for it. The boolean
# rule columns are OR-ed row-wise in one vectorised reduction instead of one
# Python-level apply per rule.
df["neg_prediction"] = df[neg_rules].any(axis=1)
##################################################################################


#POSITIVE RULES###################################################################
#if pred is True, question is comparative
#if pred is False, question is not comparative
pos_rules = [
    pos_rule_1, pos_rule_2, pos_rule_3, pos_rule_4, pos_rule_5, pos_rule_6, pos_rule_7,
    pos_rule_8, pos_rule_9, pos_rule_10, pos_rule_11, pos_rule_12, pos_rule_13,
]
# Only these rules are applied below.
pos_rules_100 = [
    pos_rule_1, pos_rule_2, pos_rule_3, pos_rule_4, pos_rule_5,
    pos_rule_6, pos_rule_7, pos_rule_8, pos_rule_9, pos_rule_10,
]

# Per-rule scores, and scores of the accumulated prediction after each rule.
precision_per_rule = []
recall_per_rule = []
precision_total = []
recall_total = []
for pos_rule in pos_rules_100:
    rule_name = pos_rule.__name__
    # One boolean column per rule, then OR it into the running prediction.
    df[rule_name] = df.apply(pos_rule, axis=1)
    df["pred"] = df.apply(lambda row: make_prediction(row["pred"], row[rule_name]), axis=1)

    precision_per_rule.append(precision_score(df["truth"], df[rule_name]))
    recall_per_rule.append(recall_score(df["truth"], df[rule_name]))

    precision_total.append(precision_score(df["truth"], df["pred"]))
    recall_total.append(recall_score(df["truth"], df["pred"]))

In [None]:
# Keep only the label, the question text and the rule-based prediction.
df = df.loc[:, ["comp", "clean", "pred"]]

#### Load all result data frames

In [None]:
def load_probs(split, name, path="./results/"):
    """Load one model's predictions and keep its first probability per question.

    Reads ``<path>/<split>/<name>.tsv`` (tab-separated), evaluates each
    ``prob`` cell and stores its first element in a column called ``name``.

    Args:
        split: sub-directory of ``path`` (e.g. "hard").
        name: file stem; also the name of the resulting probability column.
        path: results directory, with or without a trailing separator.

    Returns:
        A DataFrame with the columns ``clean`` and ``name``.
    """
    from pathlib import Path

    tsv_file = Path(path) / split / (name + ".tsv")
    temp_df = pd.read_csv(tsv_file, sep="\t")
    # NOTE(review): eval() executes whatever text the cell holds; presumably
    # "prob" stores a stringified list -- only load trusted result files.
    temp_df[name] = temp_df["prob"].apply(eval).apply(lambda x: x[0])
    return temp_df[["clean", name]]

In [None]:
# Logistic-regression results (results.zip must be unzipped first); the
# probability is copied into a column named "lr".
lr = pd.read_csv("./results/hard/lr2_hard.tsv", sep="\t")
lr["lr"] = lr["prob"]

# Models trained on hard questions.
(
    roberta_base_mean_hard,
    roberta_base_cls_hard,
    roberta_large_cls_hard,
    bart_large_mean_hard,
) = (
    load_probs(split="hard", name=model_name)
    for model_name in (
        "roberta_base_mean_hard",
        "roberta_base_cls_hard",
        "roberta_large_cls_hard",
        "bart_large_mean_hard",
    )
)

# Models trained on very hard questions. The sentencebert_* frames are read
# from files named sbert_*.
(
    roberta_base_cls_very_hard,
    bart_large_mean_very_hard,
    roberta_base_mean_very_hard,
    roberta_large_cls_very_hard,
    sentencebert_large_cls_very_hard,
    sentencebert_large_mean_very_hard,
) = (
    load_probs(split="very_hard", name=model_name)
    for model_name in (
        "roberta_base_cls_very_hard",
        "bart_large_mean_very_hard",
        "roberta_base_mean_very_hard",
        "roberta_large_cls_very_hard",
        "sbert_large_cls_very_hard",
        "sbert_large_mean_very_hard",
    )
)

In [None]:
def merge_all(dfs):
    """Left-merge a sequence of data frames on their ``clean`` column.

    The first frame is the base; every following frame is merged onto the
    running result with ``how="left"``, so rows of the first frame are kept
    and unmatched values become NaN.

    Args:
        dfs: iterable of DataFrames that all have a ``clean`` column.

    Returns:
        The merged DataFrame (the first frame itself if only one is given).

    Raises:
        ValueError: if ``dfs`` is empty.
    """
    frames = list(dfs)
    if not frames:
        raise ValueError("merge_all() needs at least one data frame")
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on="clean", how="left")
    return merged

In [None]:
# Rule predictions first, then one probability column per model, all joined
# on the question text.
res = merge_all([
    df,
    lr,
    # trained on hard questions
    roberta_base_mean_hard,
    roberta_base_cls_hard,
    roberta_large_cls_hard,
    bart_large_mean_hard,
    # trained on very hard questions
    roberta_base_cls_very_hard,
    bart_large_mean_very_hard,
    roberta_base_mean_very_hard,
    roberta_large_cls_very_hard,
    sentencebert_large_cls_very_hard,
    sentencebert_large_mean_very_hard,
])

#### Averaging the probabilities at the last step

In [None]:
# Row-wise mean over every column whose name contains "lr" or "hard".
score_columns = [("lr" in col) or ("hard" in col) for col in res.columns]
res["avg"] = res.loc[:, score_columns].mean(axis=1)

In [None]:
from tqdm import tqdm
from matplotlib import pyplot as plt

# Ground-truth labels for the sweep; defined in this cell so it does not
# depend on a later cell having been run first.
y_true = res["comp"]

# Steps of 1e-4 between 1.0 and 0.9, steps of 1e-3 below 0.9.
thresholds = np.append(np.arange(1,0.9, -0.0001), np.arange(0.9,0, -0.001))

precision_scores, recall_scores = list(), list()
predictions = res["avg"]
for threshold in tqdm(thresholds):
    prob_preds = np.where(predictions>=threshold, 1, 0)
    # zero_division=0 reports the same value as the default (0) when nothing
    # is predicted positive, without emitting a warning for every threshold.
    # NOTE(review): the '1' key assumes the labels are rendered as '0'/'1'
    # (integer labels) -- confirm the dtype of "comp".
    temp_classification_report = classification_report(
        y_true=y_true, y_pred=prob_preds, output_dict=True, zero_division=0)['1']
    precision = round(temp_classification_report['precision'], 3)
    precision_scores.append(precision)
    recall_scores.append(round(temp_classification_report['recall'], 3))

# (precision, recall, threshold) triples, ordered by decreasing threshold;
# points with precision 0 are dropped.
l = [item for item in zip(precision_scores, recall_scores, list(thresholds)) if item[0] != 0]

plt.plot([i[1] for i in l], [i[0] for i in l], marker='.', label="avg prediction")
plt.xlabel('Recall')
plt.ylabel('Precision')
#show the legend
plt.legend()
plt.grid()
#show the plot
plt.show()

# For each precision band, report the last point, i.e. the lowest threshold
# that still reaches it. Precision was rounded to three decimals above, so
# "== 1" means >= 0.9995.
l1 = [i for i in zip(precision_scores, recall_scores, list(thresholds)) if i[0]==1]
if l1:
    print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(l1[-1][0], l1[-1][1], l1[-1][2]))
else:
    print("Model doesn't reach precision of 1.00")

l3 = [item for item in l if 0.95 < item[0] < 1]
if l3:
    print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(l3[-1][0], l3[-1][1], l3[-1][2]))
    print("F1: {:.3f}".format(2*l3[-1][0]*l3[-1][1]/(l3[-1][0] + l3[-1][1])))
else:
    print("Model doesn't reach precision of 0.95")

l2 = [item for item in l if 0.90 < item[0] < 1]
if l2:
    print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(l2[-1][0], l2[-1][1], l2[-1][2]))
    print("F1: {:.3f}".format(2*l2[-1][0]*l2[-1][1]/(l2[-1][0] + l2[-1][1])))
else:
    print("Model doesn't reach precision of 0.90")

#### Define the ensemble

In [None]:
# (name accepted in ``models``, column read from the row, minimum probability)
_ENSEMBLE_THRESHOLDS = (
    ("lr", "lr", 0.903700),
    ("roberta_base_cls_hard", "roberta_base_cls_hard", 0.988100),
    ("bart_large_mean_hard", "bart_large_mean_hard", 1),
    ("sentencebert_large_mean_very_hard", "sbert_large_mean_very_hard", 1),
    ("bart_large_mean_very_hard", "bart_large_mean_very_hard", 1),
    ("avg", "avg", 0.890000),
)


def ensemble(row, models=None):
    """Return 1 if any enabled ensemble member classifies ``row`` as comparative.

    Args:
        row: mapping/Series with the rule prediction (``pred``) and the
            probability columns of the enabled models.
        models: names of the members to consult. Recognised names are
            "rules" plus the first entries of ``_ENSEMBLE_THRESHOLDS``;
            other names are ignored. A single name may be passed as a plain
            string. ``None`` or an empty collection enables nothing.

    Returns:
        1 if "rules" is enabled and ``row["pred"]`` is truthy, or if an
        enabled model's probability reaches its threshold; otherwise 0.

    Only the columns of enabled members are read, so rows lacking the
    columns of disabled members are accepted.
    """
    if models is None:
        models = ()
    elif isinstance(models, str):
        models = (models,)

    if "rules" in models and row["pred"]:
        return 1
    for model_name, column, threshold in _ENSEMBLE_THRESHOLDS:
        if model_name in models and row[column] >= threshold:
            return 1
    return 0

# Ensemble members in the order in which they are added.
models = [
    "rules",
    "lr",
    "roberta_base_cls_hard",
    "bart_large_mean_hard",
    "sentencebert_large_mean_very_hard",
    "bart_large_mean_very_hard",
    "avg"]

# Precision/recall of the ensemble after adding the first 1, 2, ... members.
recalls = []
precisions = []
y_true = res["comp"]
for subset_size in range(1, len(models) + 1):
    # Always a list, including the one-member case.
    model_subset = models[:subset_size]
    y_pred = res.apply(lambda x: ensemble(x, model_subset), axis=1)
    # Compute the report once and read both scores of the positive class.
    # NOTE(review): the "1" key assumes integer labels -- confirm the dtype of "comp".
    positive_report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)["1"]
    recall = positive_report["recall"]
    precision = positive_report["precision"]
    recalls.append(recall)
    precisions.append(precision)

In [None]:
def calc_change(recalls):
    """Return the step-to-step differences of ``recalls``.

    The first difference is taken against 0, so the first output equals the
    first input value.
    """
    values = list(recalls)
    previous = [0] + values[:-1]
    return [current - before for before, current in zip(previous, values)]

# One row per ensemble member: scores after adding it, plus the recall it added.
cumulative_df = pd.DataFrame({"names": models, "precision": precisions, "cumulative_recall":recalls})
cumulative_df = cumulative_df.assign(change=calc_change(cumulative_df["cumulative_recall"]))

#### Plot the cumulative recall of the ensemble classifier as models are added

In [None]:
# Same pyplot module as the earlier `from matplotlib import pyplot as plt`.
import matplotlib.pyplot as plt
# x axis: model names in the order they were added to the ensemble.
x = models
# One blue dot ("bo") per model: recall of the ensemble up to and including it.
plt.plot(x, recalls, "bo")
plt.title("cumulative recall")
# Rotate the tick labels by 90 degrees.
plt.xticks(rotation=90)