### This notebook provides the code to experiment with transformer embeddings

In [None]:
import warnings

import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC
from tqdm import tqdm
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel,  LongformerTokenizer, LongformerModel

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
_CHECKPOINT = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(_CHECKPOINT)
model = RobertaModel.from_pretrained(_CHECKPOINT)

# to experiment with Longformer:
#tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-large-4096')
#model = LongformerModel.from_pretrained('allenai/longformer-large-4096')

def extract_features(answers):
    '''Represent each answer using the module-level transformer ``model``.

    Every answer is tokenized with the module-level ``tokenizer`` (truncated
    to at most 512 tokens) and passed through ``model`` on its own, i.e. as a
    batch of size one.

    Args:
        answers: Iterable of answer strings.

    Returns:
        A list with one NumPy array per answer, holding the first element of
        the model output (the last hidden states) including the leading
        batch dimension of size one.
    '''
    X = list()
    # Inference only: disabling autograd avoids building a computation graph
    # for every forward pass, which saves memory and time.
    with torch.no_grad():
        for answer in tqdm(answers):
            token_ids = tokenizer.encode(answer, max_length=512, truncation=True)
            input_ids = torch.tensor(token_ids).unsqueeze(0)  # add the batch dimension
            outputs = model(input_ids)
            last_hidden_states = outputs[0]
            X.append(last_hidden_states.detach().numpy())
    return X

def extract_features_extended(answers, questions):
    '''Represent each question + answer pair using the module-level transformer ``model``.

    The question and the answer are joined with a single space (question
    first), tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through ``model`` as a batch of size one.

    Args:
        answers: Iterable of answer strings.
        questions: Iterable of question strings, aligned with ``answers``;
            pairs are formed with ``zip``, so iteration stops at the shorter
            of the two.

    Returns:
        A list with one NumPy array per pair, holding the first element of
        the model output (the last hidden states) including the leading
        batch dimension of size one.
    '''
    X = list()
    # Inference only: disabling autograd avoids building a computation graph
    # for every forward pass, which saves memory and time.
    with torch.no_grad():
        for a, q in tqdm(zip(answers, questions)):
            answer_concat = q + ' ' + a
            token_ids = tokenizer.encode(answer_concat, max_length=512, truncation=True)
            input_ids = torch.tensor(token_ids).unsqueeze(0)  # add the batch dimension
            outputs = model(input_ids)
            last_hidden_states = outputs[0]
            X.append(last_hidden_states.detach().numpy())
    return X

#### Read and prepare the data

In [None]:
# Load the tab-separated stance dataset (replace 'DATASET.tsv' with the actual file).
data_df = pd.read_csv('DATASET.tsv', sep='\t', encoding='utf-8')
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every score computed from it, reproducible across runs.
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

In [None]:
# Texts and gold labels of the train split.
# To experiment with the masked answers use train_df.masked_answer.tolist();
# alternatively, masked questions (train_df.masked_question) can be used.
train_answers = np.array(train_df.answer.tolist())
train_questions = np.array(train_df.question.tolist())
y_train = np.array(train_df.answer_stance.tolist())

# Texts and gold labels of the test split. The columns must mirror the ones
# chosen for the train split, otherwise train and test features are built
# from different inputs.
# To experiment with the masked answers use test_df.masked_answer.tolist();
# alternatively, masked questions (test_df.masked_question) can be used.
test_answers = np.array(test_df.answer.tolist())
test_questions = np.array(test_df.question.tolist())
y_test = np.array(test_df.answer_stance.tolist())

X_train = extract_features(train_answers) # Used for only answer embeddings
#X_train = extract_features_extended(train_answers, train_questions) # used for the embeddings question + answer
X_test = extract_features(test_answers)
#X_test = extract_features_extended(test_answers, test_questions)

# Each extracted item carries a leading batch dimension of size one, so
# item[0] is the per-token embedding matrix of one text.
X_train_mean = np.array([np.mean(X[0], axis=0) for X in X_train]) # mean of all token embeddings
X_train_cls = np.array([X[0][0] for X in X_train]) # only the CLS-token (first token) embedding
X_test_mean = np.array([np.mean(X[0], axis=0) for X in X_test])
X_test_cls = np.array([X[0][0] for X in X_test])

# The per-token hidden states are no longer needed; drop them to free memory.
del X_train
del X_test

#### Logistic regression

In [None]:
%%time
# Find the optimal hyper-parameters for logistic regression

# The 'lbfgs' solver supports only the l2 penalty, so each solver gets its own
# sub-grid instead of crossing every solver with every penalty (the invalid
# l1 + lbfgs combinations would fail to fit and only waste time).
c_grid = np.logspace(-4, 4, 20)
tuned_parameters = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_grid},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_grid},
]

lr = LogisticRegression()
# 5-fold cross-validated search, selecting the candidate with the best accuracy.
clf = GridSearchCV(lr, tuned_parameters, cv=5, scoring='accuracy')
clf.fit(X_train_mean, y_train) # for only CLS-token embeddings use X_train_cls
print("Best parameters set found on the train set:")
print(clf.best_params_)

In [None]:
# Train the final classifier and evaluate it on the held-out test split.
# NOTE(review): C=0.07 is hard-coded; presumably it was taken from an earlier
# grid-search run -- confirm it still matches clf.best_params_.
clf = LogisticRegression(C=0.07, penalty='l2', solver='lbfgs')
#clf = LinearSVC() # to use the SVM classifier

clf.fit(X_train_mean, y_train) # for only CLS-token embeddings use X_train_cls
y_pred = clf.predict(X_test_mean)

# zero_division=0 reports 0 (without a warning) for metrics whose denominator is zero.
print(classification_report(y_test, y_pred, digits=3, zero_division=0))
print(confusion_matrix(y_test, y_pred))

#### Feedforward neural network

In [None]:
%%time
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.callbacks import EarlyStopping
from keras.metrics import TruePositives, Precision
# to_categorical is imported directly: the np_utils submodule is no longer
# available in recent Keras releases, while this import works in old and new ones.
from keras.utils import to_categorical

tf.random.set_seed(42)

# Training features and one-hot encoded labels. to_categorical expects
# integer class labels.
# NOTE(review): assumes answer_stance holds integer ids starting at 0 -- confirm.
X_fit = np.array(X_train_mean) # for only CLS-token embeddings use X_train_cls
Y = to_categorical(y_train)

# Layer sizes at both ends are derived from the data so the network keeps
# matching the embeddings and the label set if either of them changes.
n_features = X_fit.shape[1]
n_classes = Y.shape[1]

# NOTE(review): this rebinds the global `model`, which until now referred to
# the transformer encoder. Re-running the feature-extraction functions after
# this cell requires reloading the transformer first.
model = Sequential()
model.add(Dense(768, input_shape=(n_features,), activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once the training loss stops decreasing.
es = EarlyStopping(monitor='loss', mode='min', verbose=0)

model.fit(X_fit, Y, epochs=100, batch_size=5,verbose=0, callbacks=[es])

# The predicted class is the index of the largest softmax output.
y_pred = np.argmax(model.predict(X_test_mean), axis=-1)

print(classification_report(y_test, y_pred, digits=3, zero_division=0))