In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# === Set your hyperparameters here ===

# Choose the dataset to use from ["en", "de"]
lang = "en"
assert lang in ["en", "de"]

k_values = [1, 3, 5, 10]

# ================ End ================

In [6]:
# Load data

data_path = f'../../data/trial/train/{lang}.tsv'
variables_path = f'../../data/trial/vocabulary/{lang}.tsv'

data_df = pd.read_csv(data_path, sep ='\t')
variable_df = pd.read_csv(variables_path, sep ='\t')

data_df = data_df[data_df['is_variable']==1].reset_index(drop=True)

In [7]:
data_df

Unnamed: 0,doc_id,context_id,text,is_variable,variable,uuid
0,QUVGTX,81,3. Concerning ethnic German repatriates as nei...,1,"293-Yes,288-No,295-No",da01825f-21eb-4e69-8a4f-5f6966b55d1c
1,HVIJGN,63,"In the ALLBUS survey, individuals are asked w...",1,"25-Yes,27-No,30-No",f9df9c8f-435c-4d75-9724-f178df7eb6fe
2,EMNGSA,45,“Foreigners living in Germany should choose to...,1,"273-Yes,637-No,272-No,329-No",55765f55-750e-4ef7-890b-58d6402dfe3f
3,EMNGSA,45,"People from non-EU countries, e.g. Turkey comi...",1,"251-Yes,248-No,249-No,250-No",fd3ebf6a-d8eb-4b7b-8420-8fdb8a01c280
4,NBG22A,72,The two main dependent variables are being bo...,1,"1320-Yes,1330-Yes,1328-No,1329-No",2866c724-a37f-456b-80e5-f63ffbb1b01c
...,...,...,...,...,...,...
97,EMNGSA,44,“Should abortion be permitted . . . if the wom...,1,"380-Yes,382-Yes,384-Yes",0010cb68-a98f-45d1-bd27-ba68a34d8cad
98,E64WCO=BGJUWD,38,"With respect to the contact items, it is only ...",1,"277-Yes,275-No,276-No",00f0868f-17aa-402a-b4b2-4ce930683fca
99,TEEEJW,83,The standard trust question (”Generally speaki...,1,"688-Yes,689-No,229-No",bced0e64-661b-4a24-b17b-32c43244bd3e
100,EMNGSA,45,"“When jobs get scarce, the foreigners living i...",1,"271-Yes,257-No",2a242751-1d08-4d7d-834b-fe6732dca09a


Text Preprocessor

In [8]:
def text_preprocess(ds: pd.Series) -> pd.Series:

    for m in range(len(ds)):
        
        main_words = re.sub('[^a-zA-Z]', ' ', str(ds[m]))                                      # Retain only alphabets
        main_words = (main_words.lower()).split()
        main_words = [w for w in main_words if not w in set(stopwords.words('english'))]  # Remove stopwords
        
        lem = WordNetLemmatizer()
        main_words = [lem.lemmatize(w) for w in main_words if len(w) > 1]                 # Group different forms of the same word
        
        main_words = ' '.join(main_words)
        ds[m] = main_words

    return ds

Create Bag of Words Matrices

BoW vocabulary is created using the variable detection dataset. This vocabulary is used to create matrices for both datasets.

In [10]:
data_df['text'] = text_preprocess(data_df['text'])
variable_df['question'] = text_preprocess(variable_df['question'])

X = data_df['text'].values
vX = variable_df['question'].values


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X).toarray()
vX = vectorizer.transform(vX).toarray()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words


Top k Accuracy Computation Function

In [11]:
def top_k_acc(k, scores, indices):
    
    top_k_indices = np.argsort(scores)[-k:]
    p = set(indices)&set(top_k_indices)
    acc = len(p)/len(indices)

    return acc

Evaluation Function

In [16]:

def evaluate():

    # Create a list for indices of variable that are mentioned in the dataset.
    variable_idx_list = []
    variable_id_list = variable_df['id'].to_list()
    for i,row in data_df.iterrows():
        variables = row['variable'].split(',')
        pos_variables = ['v'+var.split('-')[0] for var in variables if 'yes' in var.lower()]
        pos_variable_idx = [variable_id_list.index(var) for var in pos_variables]
        variable_idx_list.append(pos_variable_idx)

    for k in k_values:

        i = 0
        total_acc = 0
        for x in X:
            scores = []
            for v in vX:
                cs = cosine_similarity([x], [v])[0][0]
                scores.append(cs)
            total_acc += top_k_acc(k, scores, variable_idx_list[i])
            i+=1

        average_acc = total_acc/i
        
        print(f"Average accuracy at {k}: {average_acc}")
        

In [17]:
evaluate()

Average accuracy at 1: 0.1854575163398693
Average accuracy at 3: 0.29003267973856206
Average accuracy at 5: 0.35049019607843135
Average accuracy at 10: 0.4570261437908496
