In [None]:
import pandas as pd
import re
import numpy as np

from sklearn.base import clone
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from transformers import pipeline, AutoTokenizer
from tqdm import tqdm

from matplotlib import pyplot as plt

import ast
import warnings

warnings.filterwarnings('ignore')

In [None]:
# May be needed for experiments on GPU. The experiments can be run on CPU, too
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # TensorFlow requires the memory-growth setting to be the same on
        # every visible GPU, so it is enabled on all of them, not only the first.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("successful allowed memory to grow")
    except (ValueError, RuntimeError) as err:
        # ValueError: invalid device; RuntimeError: the runtime was already
        # initialized and virtual devices can no longer be modified.
        # This stays best-effort: report the problem and continue.
        print("Could not enable GPU memory growth: {}".format(err))

#### Helper function

In [None]:
# Examples of usage are below

def _print_operating_point(points, not_reached_message, with_f1=True):
    """Print the last (precision, recall, threshold) tuple of `points`.

    When `points` is empty, `not_reached_message` is printed instead. When
    `with_f1` is True, the F1 score of that precision/recall pair is printed too.
    """
    if not points:
        print(not_reached_message)
        return
    precision, recall, threshold = points[-1]
    print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(precision, recall, threshold))
    if with_f1:
        print("F1: {:.3f}".format(2 * precision * recall / (precision + recall)))


def run_crossvalidation(model, questions, X, y, path_splits, save_splits=False, make_plot=True, thresholds=None,
                        nr_splits=10, early_stop=True, plot_label="", is_keras=False, batch_size=5, return_prob_df=False):
    """Run stratified k-fold cross-validation and optionally plot a precision-recall curve.

    For every fold a fresh copy of `model` is trained on the training part and
    the probability of the positive class is predicted for the test part. The
    out-of-fold predictions of all folds are concatenated and evaluated
    together.

    Parameters
    ----------
    model : estimator
        With `is_keras=False`: an object that can be copied with
        `sklearn.base.clone` and offers `fit` and `predict_proba`.
        With `is_keras=True`: a Keras model; it is copied with
        `tf.keras.models.clone_model` and re-compiled for every fold.
    questions : array-like
        Question texts, one per row of `X`; only carried through to the outputs.
    X : indexable with an integer index array
        Feature matrix (e.g. a numpy array or a scipy sparse matrix).
    y : array-like
        True labels. The precision-recall evaluation reads the class named
        '1' from `classification_report`, so the positive label must render as "1".
    path_splits : str
        Prefix of the per-fold output files (used only if `save_splits`).
    save_splits : bool
        If True, write `<path_splits>split_<k>.tsv` with the columns
        "comp", "clean" and "prediction" for each fold.
    make_plot : bool
        If True, sweep `thresholds`, plot precision against recall and print
        summary operating points.
    thresholds : iterable of float, optional
        Probability thresholds in the order in which they are evaluated.
        Defaults to `np.arange(1, 0, -0.001)`.
    nr_splits : int
        Number of cross-validation folds.
    early_stop : bool
        If True, stop the threshold sweep as soon as precision drops below 0.90.
    plot_label : str
        Legend label of the plotted curve.
    is_keras : bool
        Selects the Keras training path instead of the scikit-learn one.
    batch_size : int
        Batch size for Keras `fit` and `predict`.
    return_prob_df : bool
        If True, return the out-of-fold probabilities as a DataFrame.

    Returns
    -------
    pandas.DataFrame or None
        With `return_prob_df=True`, a frame with the columns "clean" (question)
        and "prob" (scalar predicted probability); otherwise None.
    """
    if thresholds is None:
        thresholds = np.arange(1, 0, -0.001)

    if is_keras:
        # Imported here so the function does not depend on a later notebook
        # cell having been executed first.
        from tensorflow.keras.callbacks import EarlyStopping
        from tensorflow.keras.metrics import TruePositives

    # The fold indices are positional. Converting to arrays avoids label-based
    # lookups when a pandas Series with a non-default index is passed.
    # X is left untouched because it may be a sparse matrix.
    questions = np.asarray(questions)
    y = np.asarray(y)

    y_cv_true = []
    y_cv_predict = []
    question_cv = []
    kf = StratifiedKFold(n_splits=nr_splits)
    for split, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        questions_test = questions[test_index]
        y_cv_true.extend(y_test)
        question_cv.extend(questions_test)

        if is_keras:
            # clone_model copies the architecture only, so each fold starts
            # from freshly initialized weights.
            temp_model = tf.keras.models.clone_model(model)
            temp_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[TruePositives()])
            es = EarlyStopping(monitor='loss', mode='min', verbose=0)
            temp_model.fit(X_train, y_train, epochs=100, batch_size=batch_size, verbose=0, callbacks=[es])
            predictions = temp_model.predict(X_test, batch_size=batch_size)
        else:
            temp_model = clone(model)
            temp_model.fit(X_train, y_train)
            predictions = temp_model.predict_proba(X_test)[:, 1]

        # Keras returns shape (n, 1); flatten so both paths yield one scalar
        # probability per sample.
        predictions = np.asarray(predictions).ravel()
        y_cv_predict.extend(predictions)

        if save_splits:
            df_out = pd.DataFrame({"comp": y_test, "clean": questions_test, "prediction": predictions})
            df_out.to_csv(path_splits + "split_{}".format(str(split)) + ".tsv", index=False, sep="\t")

    predictions = np.array(y_cv_predict)
    y_true = np.array(y_cv_true)

    if make_plot:
        precision_scores, recall_scores = [], []
        for threshold in tqdm(thresholds):
            prob_preds = np.where(predictions >= threshold, 1, 0)
            report = classification_report(y_true=y_true, y_pred=prob_preds, output_dict=True)['1']
            precision = round(report['precision'], 3)
            precision_scores.append(precision)
            recall_scores.append(round(report['recall'], 3))
            # Stop once precision falls below 0.90, but only if something was
            # predicted positive: with no positive predictions precision is
            # reported as 0, which would end the sweep at the first threshold.
            if early_stop and prob_preds.any() and precision < 0.90:
                break

        # zip() truncates to the thresholds actually evaluated.
        evaluated = list(zip(precision_scores, recall_scores, list(thresholds)))
        curve = [point for point in evaluated if point[0] != 0]

        plt.plot([point[1] for point in curve], [point[0] for point in curve], marker='.', label=plot_label)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend()
        plt.grid()
        plt.show()

        # In each precision band the last entry is the one reported.
        _print_operating_point([point for point in evaluated if point[0] == 1],
                               "Model doesn't reach precision of 1.00", with_f1=False)
        _print_operating_point([point for point in curve if 0.95 < point[0] < 1],
                               "Model doesn't reach precision of 0.95")
        _print_operating_point([point for point in curve if 0.90 < point[0] < 1],
                               "Model doesn't reach precision of 0.90")

    if return_prob_df:
        return pd.DataFrame({"clean": question_cv, "prob": y_cv_predict})
    return None

In [None]:
# This neural network is used for the embeddings by the "base" transformer models

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import TruePositives, Precision

# Fix TensorFlow's global seed before the layers are created.
tf.random.set_seed(2)

# Feed-forward binary classifier over 768-dimensional inputs:
# ReLU layers of 512 -> 256 -> 64 -> 16 units, then a single sigmoid output.
# No Dropout layers are active in this architecture.
model = Sequential([
    Dense(512, activation='relu', input_shape=(768,)),
    Dense(256, activation='relu'),
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer; true positives are the tracked metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[TruePositives()])

In [None]:
# Classifier for the embeddings produced by the "large" transformer models.

# Fix TensorFlow's global seed before the layers are created.
tf.random.set_seed(2)

# Feed-forward binary classifier over 1024-dimensional inputs:
# ReLU layers of 512 -> 256 -> 64 -> 16 units, then a single sigmoid output.
# No Dropout layers are active in this architecture.
model2 = Sequential([
    Dense(512, activation='relu', input_shape=(1024,)),
    Dense(256, activation='relu'),
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer; true positives are the tracked metric.
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=[TruePositives()])

In [None]:
# Load the data set (e.g. the "very hard" questions left after logistic regression),
# keeping only the label column "comp" and the cleaned question text "clean".
very_hard = pd.read_csv("very_hard.tsv", sep="\t").loc[:, ["comp", "clean"]]

#### Compute question representations using pre-trained transformer models

In [None]:
from transformers import pipeline, AutoTokenizer
from tqdm import tqdm

# The punctuation-cleansed question texts are the model input.
questions = very_hard["clean"]

# Any model available in the transformers library can be specified here.
# Options used in the paper: roberta-base, roberta-large,
# sentence-transformers/bert-large-nli-mean-tokens, facebook/bart-large-cnn.
# Use the same name for both `model` and `tokenizer`.
# device=-1 runs on CPU, device=0 on the first GPU.
feature_extraction = pipeline(
    'feature-extraction',
    model="roberta-base",
    tokenizer="roberta-base",
    device=0,
)

# Two representations are collected per question: the embedding of the first
# (CLS) token only, and the mean over the embeddings of all tokens.
X_cls, X_mean = [], []

for question in tqdm(questions):
    token_embeddings = feature_extraction(question)[0]
    X_cls.append(token_embeddings[0])
    X_mean.append(np.mean(token_embeddings, axis=0))

In [None]:
# Classifier input built from the CLS-token embeddings.
# Note: this cell and the mean-embedding cell both assign X, so whichever
# is executed last determines the representation used downstream.

X = np.stack(np.array(X_cls))
X.shape

In [None]:
# Classifier input built from the mean of all token embeddings.
# Note: this cell and the CLS-embedding cell both assign X, so whichever
# is executed last determines the representation used downstream.

X = np.stack(np.array(X_mean))
X.shape

#### Example to run classification experiments

In [None]:
# True labels of the questions.
y = very_hard["comp"]

roberta_base_cls_very_hard = run_crossvalidation(
    model=model,                          # "model" for a transformer-base encoder, "model2" for a large one
    questions=very_hard["clean"],         # cleaned question texts
    X=X,                                  # representations: CLS-token embeddings or the mean of all token embeddings
    y=y,                                  # true labels
    path_splits='',                       # where per-split results are written (only used if save_splits=True)
    save_splits=False,                    # True stores the results of every cross-validation split
    make_plot=True,                       # True draws the precision-recall curve
    thresholds=np.arange(1, 0, -0.001),   # probability thresholds swept for the precision-recall curve
    nr_splits=10,                         # number of cross-validation splits
    early_stop=True,
    plot_label="feed forward NN",
    is_keras=True,
    batch_size=5,
    return_prob_df=True,                  # return a dataframe holding the classifier's probabilities
)

# The returned dataframe can be saved; fill in the target path.
roberta_base_cls_very_hard.to_csv('', sep="\t")