## Notebook to apply the classification rules for classifying questions as comparative or not

In [None]:
%matplotlib inline
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import matplotlib
import ast
import warnings

warnings.filterwarnings('ignore')

### Read the data

In [None]:
PATH = '' # specify the path to the /Webis-CompQuestions-22/comparative-questions-binary-labels/data_full.tsv
df_full = pd.read_csv(PATH, sep="\t")

# Work on a copy so the frame as read from disk stays untouched in df_full.
df = df_full.copy()
# "pos" and "lemma" hold Python literals serialized as text (presumably lists of
# strings -- verify against the TSV). ast.literal_eval parses literals only, so a
# malformed or malicious cell cannot execute code the way eval() would.
df["pos"] = df["pos"].apply(ast.literal_eval)
df["lemma"] = df["lemma"].apply(ast.literal_eval)
# Split on single spaces exactly; consecutive spaces therefore yield empty tokens.
df["tokenized"] = df["clean"].apply(lambda text: re.split(" ", text))

### Define and apply the classification rules 

In [None]:
def re_and(inp, regexs):
    """Return True only if every compiled pattern in ``regexs`` matches ``inp``.

    Matching uses ``pattern.match``, i.e. it is anchored at the start of ``inp``.
    An empty ``regexs`` yields True.
    """
    return all(pattern.match(inp) is not None for pattern in regexs)


def re_or(inp, regexs):
    """Return True if at least one compiled pattern in ``regexs`` matches ``inp``.

    Matching uses ``pattern.match``, i.e. it is anchored at the start of ``inp``.
    An empty ``regexs`` yields False.
    """
    return any(pattern.match(inp) is not None for pattern in regexs)

def re_one(inp, regex):
    """Return True if the compiled ``regex`` matches ``inp`` from its start."""
    return regex.match(inp) is not None

def has_interrogative(inp):
    """Return True if the question contains an interrogative word as its own token.

    Parameters
    ----------
    inp : mapping
        Row-like object; only ``inp["clean"]`` (the question text) is read.

    Returns
    -------
    bool
    """
    interrogatives = {"who", "which", "what", "where", "when", "how", "why"}
    # The original called word_tokenize, which is never imported in this notebook
    # and therefore raised NameError. A regex word split (re is already imported)
    # is used instead: "what's" still yields the token "what", while "whatever"
    # stays a single non-matching token.
    tokens = re.findall(r"\w+", inp["clean"])
    return any(token in interrogatives for token in tokens)

def has_singular_interrogative(inp):
    """Return True if the question contains "who", "which", "what" or "where" as a token.

    Parameters
    ----------
    inp : mapping
        Row-like object; only ``inp["clean"]`` (the question text) is read.

    Returns
    -------
    bool
    """
    interrogatives = {"who", "which", "what", "where"}
    # The original called word_tokenize, which is never imported in this notebook
    # and therefore raised NameError. A regex word split (re is already imported)
    # keeps the token-level membership test without adding a dependency.
    tokens = re.findall(r"\w+", inp["clean"])
    return any(token in interrogatives for token in tokens)

def pos_match(inp, pos_tags):
    """Return True if any element of ``inp`` is contained in ``pos_tags``."""
    return any(tag in pos_tags for tag in inp)

def find_in_seq(arr, q):
    """Find the first contiguous occurrence of the sequence ``q`` inside ``arr``.

    Parameters
    ----------
    arr : sequence
        Sequence to search (e.g. the token list of a question).
    q : sequence
        Sub-sequence to look for.

    Returns
    -------
    int
        Index of the element right after the match (``start + len(q)``), or -1
        when ``q`` does not occur (or is empty). The returned index can equal
        ``len(arr)`` when the match ends the sequence, so callers must
        bounds-check before indexing with it.

    Notes
    -----
    The previous implementation only examined the first occurrence of ``q[0]``
    (missing later full matches) and raised IndexError when a partial match ran
    past the end of ``arr``. Every start position is now tried, within bounds.
    """
    width = len(q)
    if width == 0:
        return -1
    for start in range(len(arr) - width + 1):
        if all(arr[start + offset] == q[offset] for offset in range(width)):
            return start + width
    return -1

def clean_df():
    """Reset the global ``df`` so the rule cells can be re-run from a clean state.

    Existing "pred" / "neg_prediction" columns are reset to False, and every
    column whose name contains "Unnamed: 0", "pos_rule" or "neg_rule" is deleted.
    The frame is modified in place; nothing is returned.
    """
    for flag_column in ("pred", "neg_prediction"):
        if flag_column in df.columns:
            df[flag_column] = False
    # Snapshot the column names first because columns are deleted while looping.
    for col in list(df.columns):
        for marker in ("Unnamed: 0", "pos_rule", "neg_rule"):
            if marker in col:
                del df[col]

In [None]:
# Start with "not comparative" everywhere; the positive rules flip rows to True.
df["pred"] = False
# Boolean ground truth: True exactly where the "comp" label equals 1.
df["truth"] = df["comp"] == 1.0

In [None]:
#regex needed for the negative rules (a question is not comparative)

# Questions starting with "when" or "how long/many/much".
rex_neg_rule_0 = re.compile(r'((when|how (long|many|much))\b.*)')


def neg_rule_0(inp):
    """Negative rule 0: the question starts with "when" or "how long/many/much".

    Parameters
    ----------
    inp : mapping
        Row-like object; only ``inp["clean"]`` is read (the unused "pos" lookup
        and the one-element regex list of the original were removed).

    Returns
    -------
    bool
        True when the rule fires, i.e. the question is treated as not comparative.
    """
    # .match anchors at the start of the string, so the phrase must open the question.
    return rex_neg_rule_0.match(inp["clean"]) is not None

# Substrings that mark a question as not comparative when found in its text.
evil_words = ["lyrics", "wrote", "mean", "covered", "cast", "played", "season", "episode", "award", "sing", "sang", "song", "album", "movie"]

def neg_rule_1(inp):
    """Negative rule 1: the question text contains one of ``evil_words``.

    The check is a plain substring test on ``inp["clean"]``, so a word also
    fires when embedded in a longer one (e.g. "sing" inside "using").
    NOTE(review): confirm whether token-level matching was intended.
    """
    text = inp["clean"]
    return any(word in text for word in evil_words)

def neg_rule_2(inp):
    """Negative rule 2: the question has fewer than four tokens in ``inp['tokenized']``."""
    return len(inp['tokenized']) < 4

In [None]:
# Regexes needed for the positive rules (a question is comparative).
# All of them are applied with .match(), i.e. anchored at the start of the question.
rex_pos_rule_2_1 = re.compile(r'(.*difference between.*)')
rex_pos_rule_2_2 = re.compile(r'(.*compare to.*)')
rex_pos_rule_3 = re.compile(r'((which) (are|is|\'s|s)\b.*\b(a|an|the)\b.*)')
rex_pos_rule_4_1 = re.compile(r'(what are( some)? good\b.*)')
rex_pos_rule_4_2 = re.compile(r'(.*\byour favorite\b.*)')
rex_pos_rule_5 = re.compile(r'(.*(who was (the|a|an).*))')
rex_pos_rule_6 = re.compile(r'((which).*(should i))')
# The original alternation listed "and" twice; the duplicate is dropped (no behavior change).
rex_pos_rule_9_1 = re.compile(r'(.*\b(or|and|from|between|vs|versus)\b.*)')
rex_pos_rule_9_2 = re.compile(r'(.*\b(distinguish|differ|differentiate|differences|strengths|weaknesses)\b.*)')
rex_pos_rule_9_3 = re.compile(r'(.*(the differen(ce|t) between).*)')
rex_pos_rule_10 = re.compile(r'(.*(the (best)).*)')
# Not referenced by any rule function in this notebook.
rex_pos_rule_11 = re.compile(r'.*\b(is)\b.*')
rex_pos_rule_13_1 = re.compile(r'(.*are\b.*and\b.*same.*)')
rex_pos_rule_13_2 = re.compile(r'(.*same.*\bas\b.*)')
# Fixed alternation precedence: the original r'(.*are|is.*)' parsed as
# ".*are" OR "is.*", so it fired on any text containing the letters "are"
# (e.g. "compare") and on "is" only at the very start. The words "are"/"is"
# are now matched as whole words anywhere in the question.
rex_pos_rule_13_3 = re.compile(r'(.*\b(are|is)\b.*)')

def pos_rule_1(row):
    """Positive rule 1: a comparison keyword directly follows a "what is"-style anchor.

    The token right after the right-most matched anchor (e.g. "what is",
    "which are", "is the") must be one of "difference", "differences", "pros",
    "good", "advantages", "better", "similarities".

    Parameters
    ----------
    row : mapping
        Reads ``row["neg_prediction"]`` and ``row["tokenized"]``.

    Returns
    -------
    bool
        False whenever a negative rule already fired for the row.
    """
    if row["neg_prediction"]:
        return False
    tokens = row["tokenized"]
    anchors = (
        ("what", "is"), ("what", "are"), ("what", "s"), ("what", "'s"),
        ("which", "is"), ("which", "are"), ("which", "s"), ("which", "'s"),
        ("is", "the"), ("are", "the"), ("s", "the"), ("'s", "the"),
    )
    keywords = ("difference", "differences", "pros", "good", "advantages", "better", "similarities")
    try:
        # find_in_seq returns the index just past a match (or -1); max() keeps
        # the right-most anchor end among all anchors.
        pos_JJ = max(find_in_seq(tokens, list(anchor)) for anchor in anchors)
        if pos_JJ < 0:
            return False
        return tokens[pos_JJ] in keywords
    except IndexError:
        # The anchor ended the question, so there is no following token.
        # Only IndexError is caught now; the former bare "except:" also hid
        # unrelated errors (and KeyboardInterrupt).
        return False

def pos_rule_2(row):
    """Positive rule 2: the question contains "difference between" or "compare to".

    Reads ``row["neg_prediction"]`` and ``row["clean"]``; the unused
    "tokenized"/"pos" lookups of the original were removed. Returns False
    whenever a negative rule already fired for the row.
    """
    if row["neg_prediction"]:
        return False
    return re_or(row["clean"], [rex_pos_rule_2_1, rex_pos_rule_2_2])

def pos_rule_3(row):
    """Positive rule 3: a "which is/are ... a/an/the ..." question with a graded tag.

    Fires when ``rex_pos_rule_3`` matches the clean text, at least one POS tag
    is JJS, RBS, JJR or RBR, and no negative rule fired for the row.
    """
    text, tags = row["clean"], row["pos"]
    if not re_one(text, rex_pos_rule_3):
        return False
    if not pos_match(tags, ["JJS", "RBS", "JJR", "RBR"]):
        return False
    return not row["neg_prediction"]

def pos_rule_4(inp):
    """Positive rule 4: "what are (some) good ..." or "... your favorite ..." questions.

    Reads ``inp["neg_prediction"]`` and ``inp["clean"]``; the unused "pos"
    lookup of the original was removed. Returns False whenever a negative rule
    already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    return re_or(inp["clean"], [rex_pos_rule_4_1, rex_pos_rule_4_2])

def pos_rule_5(inp):
    """Positive rule 5: "who was the/a/an <comparative or superlative> ..." questions.

    Fires when ``rex_pos_rule_5`` matches the clean text, the fourth POS tag is
    JJS, RBS, RBR or JJR, and no negative rule fired for the row.

    Parameters
    ----------
    inp : mapping
        Reads ``inp["neg_prediction"]``, ``inp["pos"]`` and ``inp["clean"]``.

    Returns
    -------
    bool
    """
    if inp["neg_prediction"]:
        return False
    pos = inp["pos"]
    # A question such as "who was the" matches the regex but has only three
    # tags; the original then raised IndexError on pos[3] and aborted df.apply.
    if len(pos) < 4:
        return False
    # NOTE(review): index 3 presumably targets the word right after a leading
    # "who was the", but the regex also matches the phrase mid-question -- confirm.
    return re_one(inp["clean"], rex_pos_rule_5) and pos[3] in ["JJS", "RBS", "RBR", "JJR"]

def pos_rule_6(inp):
    """Positive rule 6: questions starting with "which" that later contain "should i".

    Reads ``inp["neg_prediction"]`` and ``inp["clean"]``; the unused
    "pos"/"tokenized" lookups of the original were removed. Returns False
    whenever a negative rule already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    return re_one(inp["clean"], rex_pos_rule_6)

def pos_rule_7(inp):
    """Positive rule 7: the question opens with a superlative.

    Fires when the first POS tag is a superlative (JJS/RBS), or when the first
    token is an article ("the", "a", "an") and the second tag is a superlative.

    Parameters
    ----------
    inp : mapping
        Reads ``inp["neg_prediction"]``, ``inp["pos"]`` and ``inp["tokenized"]``.

    Returns
    -------
    bool
        False whenever a negative rule already fired for the row.

    Notes
    -----
    The original compared against "JBS", which is not a Penn Treebank tag
    (the superlative adjective tag is "JJS"), so the adjective branch could
    never fire. The fix lets more questions match this rule; re-check the
    per-rule precision after re-running.
    """
    superlative_tags = ("JJS", "RBS")
    if inp["neg_prediction"]:
        return False
    pos = inp["pos"]
    tokens = inp["tokenized"]
    # Guard against empty rows; the original raised IndexError on pos[0].
    if not pos or not tokens:
        return False
    if pos[0] in superlative_tags:
        return True
    # len(pos) > 1 guards the one-token case (e.g. just "the").
    return tokens[0] in ("the", "a", "an") and len(pos) > 1 and pos[1] in superlative_tags

def pos_rule_8(inp):
    """Positive rule 8: a superlative directly follows a "what is"-style anchor.

    The element right after the right-most matched anchor (e.g. "what is",
    "which are", "is the") must carry the POS tag RBS/JJS or be the token
    "most". Questions rejected by ``neg_rule_2`` (fewer than four tokens) or
    already vetoed by a negative rule never fire.

    Parameters
    ----------
    inp : mapping
        Reads ``inp["neg_prediction"]``, ``inp["pos"]`` and ``inp["tokenized"]``
        (plus whatever ``neg_rule_2`` reads).

    Returns
    -------
    bool
    """
    if inp["neg_prediction"]:
        return False
    pos = inp["pos"]
    tokens = inp["tokenized"]
    anchors = (
        ("what", "is"), ("what", "are"), ("what", "s"), ("what", "'s"),
        ("which", "is"), ("which", "are"), ("which", "s"), ("which", "'s"),
        ("is", "the"), ("are", "the"), ("s", "the"), ("'s", "the"),
    )
    try:
        # find_in_seq returns the index just past a match (or -1); max() keeps
        # the right-most anchor end among all anchors.
        pos_JJ = max(find_in_seq(tokens, list(anchor)) for anchor in anchors)
        if pos_JJ < 0:
            return False
        if pos[pos_JJ] in ["RBS", "JJS"] or tokens[pos_JJ] in ["most"]:
            return not neg_rule_2(inp)
    except IndexError:
        # The anchor ended the question, so there is nothing after it.
        # Only IndexError is caught now; the former bare "except:" also hid
        # unrelated errors (and KeyboardInterrupt).
        return False
    return False

def pos_rule_9(inp):
    """Positive rule 9: difference/distinction questions that do not contain "how".

    Fires when the text matches both ``rex_pos_rule_9_1`` and
    ``rex_pos_rule_9_2``, or matches ``rex_pos_rule_9_3``.

    Parameters
    ----------
    inp : mapping
        Reads ``inp["clean"]`` and ``inp["neg_prediction"]``.

    Returns
    -------
    bool
        False whenever a negative rule already fired for the row.

    Notes
    -----
    The original expression ``A or B and not neg`` parsed as
    ``A or (B and not neg)``, so the first alternative ignored the negative
    veto. The veto now applies to both alternatives, matching the other rules.
    """
    question = inp["clean"]
    # Substring test: this also excludes words that merely contain "how" (e.g. "show").
    if inp["neg_prediction"] or "how" in question:
        return False
    return re_and(question, [rex_pos_rule_9_1, rex_pos_rule_9_2]) or re_one(question, rex_pos_rule_9_3)

def pos_rule_10(inp):
    """Positive rule 10: the question contains "the best".

    Questions containing "for the best" or "how is" are excluded. Reads
    ``inp["neg_prediction"]`` and ``inp["clean"]``; the unused
    "pos"/"tokenized" lookups of the original were removed. Returns False
    whenever a negative rule already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    question = inp["clean"]
    if 'for the best' in question or 'how is' in question:
        return False
    return re_one(question, rex_pos_rule_10)

def pos_rule_11(inp):
    """Positive rule 11: a who/which/what/where question containing a superlative tag.

    Fires when ``has_singular_interrogative`` accepts the row and at least one
    POS tag is JJS or RBS. The unused "clean" lookup of the original was
    removed. Returns False whenever a negative rule already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    return has_singular_interrogative(inp) and pos_match(inp["pos"], ["JJS", "RBS"])

def pos_rule_12(inp):
    """Positive rule 12: a superlative (JJS/RBS) preceded somewhere by the token "the".

    Parameters
    ----------
    inp : mapping
        Reads ``inp["neg_prediction"]``, ``inp["pos"]`` and ``inp["tokenized"]``;
        the unused "clean" lookup of the original was removed. Tags and tokens
        are compared by position, so both lists are assumed to be aligned.

    Returns
    -------
    bool
        False whenever a negative rule already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    tokens = inp["tokenized"]
    return any(
        tag in ("JJS", "RBS") and "the" in tokens[:idx]
        for idx, tag in enumerate(inp["pos"])
    )

def pos_rule_13(inp):
    """Positive rule 13: "are ... and ... same" / "same ... as" sameness questions.

    Fires when the text matches ``rex_pos_rule_13_1``, or matches both
    ``rex_pos_rule_13_2`` and ``rex_pos_rule_13_3``. Reads
    ``inp["neg_prediction"]`` and ``inp["clean"]``; the unused
    "tokenized"/"pos" lookups of the original were removed. Returns False
    whenever a negative rule already fired for the row.
    """
    if inp["neg_prediction"]:
        return False
    question = inp["clean"]
    return re_one(question, rex_pos_rule_13_1) or re_and(question, [rex_pos_rule_13_2, rex_pos_rule_13_3])

In [None]:
# Drop rule columns left over from a previous run so this cell can be re-executed.
clean_df()

#NEGATIVE RULES##################################################################
# neg_prediction == True  -> a negative rule fired: the question is not comparative
# neg_prediction == False -> no negative rule fired; the positive rules decide
neg_rules = ["neg_rule_0", "neg_rule_1"]
df["neg_rule_0"] = df.apply(neg_rule_0, axis=1)
df["neg_rule_1"] = df.apply(neg_rule_1, axis=1)

def make_prediction(old_prediction, prediction_rule):
    """Combine an accumulated prediction with one rule's verdict (logical OR).

    Returns ``old_prediction`` when it is truthy, otherwise ``prediction_rule``.
    """
    return old_prediction if old_prediction else prediction_rule

# A question is vetoed as soon as any negative rule fired for it.
df["neg_prediction"] = False
for rule_column in neg_rules:
    df["neg_prediction"] = [
        make_prediction(previous, fired)
        for previous, fired in zip(df["neg_prediction"], df[rule_column])
    ]
##################################################################################


#POSITIVE RULES###################################################################
#if pred is True, question is comparative
#if pred is False, question is not comparative
pos_rules = [pos_rule_1, pos_rule_2, pos_rule_3, pos_rule_4, pos_rule_5, pos_rule_6, pos_rule_7, pos_rule_8, pos_rule_9, pos_rule_10, pos_rule_11, pos_rule_12, pos_rule_13]
# Only this subset is applied below; pos_rule_11 to pos_rule_13 are left out.
pos_rules_100 = [pos_rule_1, pos_rule_2, pos_rule_3, pos_rule_4, pos_rule_5, pos_rule_6, pos_rule_7, pos_rule_8, pos_rule_9, pos_rule_10]

# Per-rule scores and the cumulative scores after each additional rule.
precision_per_rule = []
recall_per_rule = []
precision_total = []
recall_total = []
for pos_rule in pos_rules_100:
    rule_column = pos_rule.__name__
    # One boolean column per rule, then OR it into the running prediction.
    df[rule_column] = df.apply(pos_rule, axis=1)
    df["pred"] = [
        make_prediction(previous, fired)
        for previous, fired in zip(df["pred"], df[rule_column])
    ]

    precision_per_rule.append(precision_score(df["truth"], df[rule_column]))
    recall_per_rule.append(recall_score(df["truth"], df[rule_column]))

    precision_total.append(precision_score(df["truth"], df["pred"]))
    recall_total.append(recall_score(df["truth"], df["pred"]))

In [None]:
# Reports the effectiveness of the rules (precision / recall / F1 per class)
rules_report = classification_report(y_true=df["truth"], y_pred=df["pred"], digits=3)
print(rules_report)

In [None]:
# Identifies and saves questions that are not recognized by the rules as comparative (hard questions)
# Hard questions will be used in the logistic regression classifier (next step in the ensemble)

OUTPUT_PATH = '' # specifies where to store hard questions, e.g., as a tsv file hard_questions.tsv
# For convenience, /Webis-CompQuestions-22/comparative-questions-binary-labels/ already contains
# hard_questions.tsv, which can be used directly

# A question is "recognized" when it is labeled comparative AND the rules predicted it;
# every other row counts as hard and is handed to the next classifier.
recognized = (df["comp"] == 1) & (df["pred"] == True)
hard_columns = ["id", "comp", "clean", "lemma", "pos", "tokenized"]
hard_questions = df.loc[~recognized][hard_columns]
hard_questions.to_csv(OUTPUT_PATH, sep="\t", index=False)

### Logistic regression classifier for hard questions

In [None]:
import os
import re
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, make_scorer, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

### Read the data

In [None]:
INPUT_PATH = '' # specifies the path to the hard questions, e.g., as a tsv file hard_questions.tsv
# For convenience, /Webis-CompQuestions-22/comparative-questions-binary-labels/ already contains
# hard_questions.tsv, which can be used directly

hard_questions = pd.read_csv(INPUT_PATH, sep="\t")
# These columns hold Python literals serialized as text. literal_eval parses
# literals only, so file contents can never execute code the way eval() would.
for literal_column in ("pos", "lemma", "tokenized"):
    hard_questions[literal_column] = hard_questions[literal_column].apply(literal_eval)

In [None]:
def combine(line):
    """Interleave lemmas and POS tags into one string: "lemma0 pos0 lemma1 pos1 ... ".

    Reads ``line["lemma"]`` and ``line["pos"]``; iterates over the lemma
    positions and looks up the tag at the same index. Every pair is followed by
    a space, so a non-empty result ends with a trailing space.
    """
    lemmas = line["lemma"]
    tags = line["pos"]
    return "".join(lemmas[idx] + " " + tags[idx] + " " for idx in range(len(lemmas)))

def combine_token_and_pos(line):
    """Interleave tokens and POS tags into one string: "tok0 pos0 tok1 pos1 ... ".

    Reads ``line["tokenized"]`` and ``line["pos"]``; iterates over the token
    positions and looks up the tag at the same index. Every pair is followed by
    a space, so a non-empty result ends with a trailing space.
    """
    words = line["tokenized"]
    tags = line["pos"]
    return "".join(words[idx] + " " + tags[idx] + " " for idx in range(len(words)))


def strip_punct(s):
    """Lowercase ``s`` and keep only Latin/Cyrillic letters and digits.

    Every other character becomes a space; runs of whitespace are then
    collapsed to single spaces and the ends are trimmed.
    """
    lowered = re.sub('[^А-Яа-яA-Za-z0-9]', ' ', s).lower()
    return " ".join(lowered.split())

# "combined" must be built while "lemma" is still a list; it is joined to text right after.
hard_questions["combined"] = hard_questions.apply(combine, axis=1)
hard_questions["lemma"] = hard_questions["lemma"].map(" ".join)
# Keep one row per distinct question text and renumber the rows from 0.
hard_questions = hard_questions.drop_duplicates(subset="clean").reset_index(drop=True)

In [None]:
# Plain NumPy arrays so the cross-validation index arrays can slice them directly.
train_questions = np.asarray(hard_questions.clean.to_list())
y_train = np.asarray(hard_questions.comp.to_list())
ids = np.asarray(hard_questions["id"].to_list())

### Logistic regression with a 10-fold crossvalidation

In [None]:
path_out = '' # specifies where to store 10 splits with hard questions, e.g., a directory hard_question_splits
# Create the output directory up front; an empty path_out means the current directory.
if path_out:
    os.makedirs(path_out, exist_ok=True)

kf = StratifiedKFold(n_splits=10)
# Per-fold positive-class probabilities and the matching true labels.
preds_pos, y_trainCv = list(), list()

very_hard_ids = list()

for split, (train_index, test_index) in enumerate(kf.split(train_questions, y_train), start=1):
    quest_train, quest_test = train_questions[train_index], train_questions[test_index]
    y_tr, y_ts = y_train[train_index], y_train[test_index]
    hard_ids = ids[test_index]
    y_trainCv.append(y_ts)
    very_hard_ids.extend(hard_ids)

    # Features are counts of word 4-grams; the token pattern also keeps one-character words.
    vectorizer = CountVectorizer(ngram_range=(4,4), analyzer='word', min_df = 1, token_pattern='(?u)\\b\\w+\\b')
    X_tr = vectorizer.fit_transform(quest_train)
    X_ts = vectorizer.transform(quest_test)

    clf = LogisticRegression(C = 48., penalty = 'l2', solver = 'liblinear')
    clf.fit(X_tr, y_tr)
    predictions = clf.predict_proba(X_ts)

    # Column 0 / column 1 are the probabilities of clf.classes_[0] / clf.classes_[1].
    nPred = predictions[:,0]
    pPred = predictions[:,1]
    preds_pos.append(pPred)

    # Saves the 10 train-test splits. quest_test is already aligned row by row
    # with y_ts and the probabilities, so it is written directly instead of
    # being re-selected from hard_questions via isin(), which relied on row order.
    df_out = pd.DataFrame({'comp': y_ts, 'clean': quest_test, 'neg_prob': nPred, 'pos_prob': pPred})
    # os.path.join keeps the file inside path_out even without a trailing
    # separator; plain string concatenation produced e.g. "dirsplit_1.tsv".
    df_out.to_csv(os.path.join(path_out, 'split_{}.tsv'.format(split)), index=False, sep='\t')
    
    
# The code below plots a precision-recall curve used for identifying the classifier's operating point

preds_pos_flat = np.concatenate(preds_pos)
y_trainCv_flat = np.concatenate(y_trainCv)

precision_scores, recall_scores = list(), list()

# Selecting the probability threshold for which LR achieves a precision of 1.0 for comparative questions.
# Thresholds are swept downwards: steps of 0.0001 above 0.5, steps of 0.001 below.
# (A coarser grid assigned first in the original was immediately overwritten and is dropped.)
thresholds = np.array(list(np.arange(1, 0.5, -0.0001)) + list(np.arange(0.5, 0, -0.001)))
thresholds = [round(p, 6) for p in thresholds]

for threshold in tqdm(thresholds):
    prob_preds = np.where(preds_pos_flat >= threshold, 1, 0)
    # precision_score / recall_score with pos_label=1 give the same numbers as the
    # positive-class row of classification_report, but do not depend on the label
    # being rendered as the dictionary key '1' (float labels produce '1.0').
    precision = round(precision_score(y_trainCv_flat, prob_preds, pos_label=1), 3)
    recall = round(recall_score(y_trainCv_flat, prob_preds, pos_label=1), 3)
    precision_scores.append(precision)
    recall_scores.append(recall)
    # Stop sweeping once precision has dropped into the (0.9, 0.98) band.
    if 0.9 < precision < 0.98:
        break

# zip() truncates to the thresholds actually evaluated; zero-precision points
# (no positive predictions yet) are left out of the plot.
curve_points = [item for item in zip(precision_scores, recall_scores, list(thresholds)) if item[0] != 0]

plt.plot([point[1] for point in curve_points], [point[0] for point in curve_points], marker='.', label='Logistic')
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.grid()
# show the plot
plt.show()

# The last entry with precision 1.0 is the lowest evaluated threshold that still has perfect precision.
perfect_precision_points = [item for item in zip(precision_scores, recall_scores, list(thresholds)) if item[0] == 1]
if perfect_precision_points:
    best_precision, best_recall, best_threshold = perfect_precision_points[-1]
    print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(best_precision, best_recall, best_threshold))
else:
    # The original indexed l1[-1] unconditionally and raised IndexError here.
    print("No evaluated threshold reached a precision of 1.0 for comparative questions.")

### Save questions that are not classified by the LR as comparative

In [None]:
from os import listdir
from os.path import join

path = '' #specifies where the 10 splits with hard questions were stored, e.g., a directory hard_question_splits (the cell above)

# Questions whose positive-class probability is below this value are kept as "very hard".
# NOTE(review): presumably the operating point printed by the threshold sweep above -- confirm.
LR_POS_PROB_THRESHOLD = 0.903700

# Read the split files in a stable order and skip anything that is not a .tsv file.
dfs = [
    pd.read_csv(join(path, file_name), sep='\t')
    for file_name in sorted(listdir(path))
    if file_name.endswith('.tsv')
]
res_comp = pd.concat(dfs, ignore_index=True)

# For each question text, use the probability of its first occurrence (as the
# original per-question lookup did), computed in one pass instead of one full
# scan of the frame per question.
first_pos_prob = res_comp.drop_duplicates(subset='clean').set_index('clean')['pos_prob']
below_threshold = (res_comp['clean'].map(first_pos_prob) < LR_POS_PROB_THRESHOLD).to_numpy()

questions = res_comp.loc[below_threshold, 'clean'].tolist()

train_very_hard = res_comp[below_threshold]
train_very_hard = train_very_hard.drop(['neg_prob', 'pos_prob'], axis=1)

PATH_OUT = '' #specifies where to save very hard questions (that are used in the next step of the ensemble), e.g., the file very_hard.tsv

# Write to the configured path; the original passed the string literal 'PATH_OUT'
# and therefore always created a file named "PATH_OUT" in the working directory.
train_very_hard.to_csv(PATH_OUT, sep='\t', index=False)

# For convenience, /Webis-CompQuestions-22/comparative-questions-binary-labels/ already contains very_hard.tsv,
# which can be used directly