In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch every NLTK resource this notebook relies on, so that a fresh kernel can
# run all cells without a LookupError:
#   - 'stopwords' is read by text_preprocess via stopwords.words('english')
#   - 'wordnet'   is the corpus behind WordNetLemmatizer
#   - 'omw-1.4'   is additionally required by WordNetLemmatizer on some NLTK releases
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# === Set your hyperparameters here ===

# Language of the dataset to load; only "en" and "de" are accepted.
lang = "en"
assert lang in ("en", "de")

# Cut-offs k for which the top-k accuracy is reported.
k_values = [1, 3, 5, 10]

# ================ End ================

In [None]:
# Load data

# Both tab-separated files are selected through the `lang` setting.
data_path = f'../../data/trial/train/{lang}.tsv'
variables_path = f'../../data/trial/vocabulary/{lang}.tsv'

data_df = pd.read_csv(data_path, sep='\t')
variable_df = pd.read_csv(variables_path, sep='\t')

# Keep only the rows whose is_variable flag equals 1 and renumber them from 0.
data_df = data_df.loc[data_df['is_variable'].eq(1)].reset_index(drop=True)

In [None]:
# Preview the first rows instead of rendering the whole frame as cell output.
data_df.head()

Text Preprocessor

In [None]:
def text_preprocess(ds: pd.Series) -> pd.Series:
    """Normalise every entry of a text column for bag-of-words vectorisation.

    Each entry is converted to a string, every character outside ``a-z`` /
    ``A-Z`` is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words and one-letter tokens are dropped, the
    remaining tokens are lemmatised with ``WordNetLemmatizer`` and finally
    joined with single spaces.

    Parameters
    ----------
    ds : pd.Series
        Raw texts. Missing scalar values (``None`` / ``NaN``) are treated as
        empty text.

    Returns
    -------
    pd.Series
        A new Series of cleaned strings with the same index and name as
        ``ds``. ``ds`` itself is left unmodified.

    Notes
    -----
    NOTE(review): the stop-word list and the ``[^a-zA-Z]`` filter are
    English-specific. With ``lang = "de"`` umlauts and ``ß`` are replaced by
    spaces and German stop words are kept -- confirm that this is intended.
    """
    # Built once up front: rebuilding the stop-word set for every token (and
    # the lemmatiser for every row) would dominate the run time.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(raw) -> str:
        if pd.api.types.is_scalar(raw) and pd.isna(raw):
            # str(NaN) would otherwise leak the literal token "nan" into the text.
            return ''
        letters_only = re.sub('[^a-zA-Z]', ' ', str(raw))  # Retain only alphabets
        tokens = letters_only.lower().split()
        kept = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]  # Remove stopwords
        return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)  # Group different forms of the same word

    # Iterating over the values (instead of looking up labels 0..n-1) works for
    # any index and avoids writing into the caller's Series / DataFrame column.
    cleaned = [_clean(raw) for raw in ds]
    return pd.Series(cleaned, index=ds.index, name=ds.name, dtype=object)

Create Bag of Words Matrices

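The bag-of-words (BoW) vocabulary is built from the preprocessed texts of the variable detection dataset. The same vocabulary is then used to create the matrices for both that dataset and the variable questions, so words that occur only in the variable questions are ignored.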

In [None]:
# Replace the raw text columns of both tables with their preprocessed versions.
data_df['text'] = text_preprocess(data_df['text'])
variable_df['v_question'] = text_preprocess(variable_df['v_question'])

# Fit the vocabulary on the dataset texts only, then encode the variable
# questions with that same vocabulary so both matrices share their columns.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_df['text'].values).toarray()
vX = vectorizer.transform(variable_df['v_question'].values).toarray()

Top k Accuracy Computation Function

In [None]:
def top_k_acc(k, scores, indices):
    """Return the share of gold indices found among the ``k`` best-scoring positions.

    Parameters
    ----------
    k : int
        Number of highest-scoring positions to consider.
    scores : sequence of float
        One score per candidate; a higher score ranks the candidate higher.
    indices : iterable of int
        Positions in ``scores`` of the correct (gold) candidates.

    Returns
    -------
    float
        Number of distinct gold indices inside the top ``k`` divided by the
        number of distinct gold indices; ``0.0`` when ``k <= 0`` or when there
        are no gold indices.
    """
    # Work on distinct gold indices so a repeated index cannot inflate the denominator.
    gold = set(indices)
    if k <= 0 or not gold:
        # Two degenerate cases: ``[-0:]`` would select *every* position, and an
        # empty gold set would divide by zero.
        return 0.0

    # argsort is ascending, so the last k entries are the k highest scores.
    top_k_indices = np.argsort(scores)[-k:]
    hits = gold & set(top_k_indices.tolist())

    return len(hits) / len(gold)

Evaluation Function

In [None]:

def evaluate():
    """Print the average top-k accuracy of the cosine-similarity ranking for each k.

    Reads the notebook globals ``data_df``, ``variable_df``, ``X``, ``vX`` and
    ``k_values``. Every row of ``X`` is scored against every row of ``vX`` with
    cosine similarity; ``top_k_acc`` then compares the resulting ranking with
    the variables marked "yes" in that row's ``variable`` annotation. One line
    ``Average accuracy at {k}: {value}`` is printed per entry of ``k_values``.

    Raises
    ------
    ValueError
        If ``X`` has no rows, or if an annotated variable id does not occur in
        ``variable_df['v_id']``.
    """
    if len(X) == 0:
        raise ValueError("evaluate() needs at least one sample in X")

    # For every sample, collect the row positions (in variable_df, hence in vX)
    # of the variables that are annotated with "yes".
    variable_id_list = variable_df['v_id'].to_list()
    variable_idx_list = []
    for annotation in data_df['variable']:
        # Strip the surrounding brackets, then split into "<number>-<label>" entries.
        entries = annotation[1:-1].split(',')
        pos_variables = [
            'v' + entry.split('-')[0].strip()  # tolerate blanks after the commas
            for entry in entries
            if 'yes' in entry.lower()
        ]
        variable_idx_list.append([variable_id_list.index(var) for var in pos_variables])

    # The similarities do not depend on k, so compute the full
    # (samples x variables) matrix once instead of pair by pair for every k.
    similarity = cosine_similarity(X, vX)
    n_samples = len(similarity)

    for k in k_values:
        total_acc = sum(
            top_k_acc(k, scores, gold)
            for scores, gold in zip(similarity, variable_idx_list)
        )
        average_acc = total_acc / n_samples

        print(f"Average accuracy at {k}: {average_acc}")
        

In [None]:
# Run the evaluation; it prints one average-accuracy line per entry of k_values.
evaluate()