In [1]:
import json
import os
import re
import subprocess
import sys
from collections import defaultdict

import datasets
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

# The evaluation helpers live in the scripts directory, which must be on the path first.
sys.path.append("../../scripts")
from evaluate_task2 import DEFAULT_METRICS, evaluate

In [2]:
# Corpora needed by the preprocessing cell: the stop-word lists and the
# WordNet data that WordNetLemmatizer looks up at lemmatization time.
for nltk_resource in ("stopwords", "wordnet"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /home/tony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# === Set your hyperparameters here ===

# Choose the dataset to use from ["en", "de"]
lang = "de"
assert lang in ("en", "de"), f"Unsupported lang {lang!r}; expected 'en' or 'de'"

# Directory where data is stored (must be an existing directory, not a file)
data_dir = "../../data/"
assert os.path.isdir(data_dir), f"Data directory not found: {data_dir!r}"

# Cross-validation n splits
n_splits = 10

# Cut-off values; NOTE(review): not referenced by the cells visible here — confirm it is still needed
k_values = [1, 3, 5, 10]

# ================ End ================

In [4]:
# Load data

# Variable vocabulary table; later cells read its `id` and `question` columns.
# Built from data_dir so that changing the configured directory affects every file.
variables_path = os.path.join(data_dir, "trial", "vocabulary", f"{lang}.tsv")
variable_df = pd.read_csv(variables_path, sep="\t")

# Stack the train and test files into one frame; cross-validation re-splits it later.
# ignore_index=True gives the combined frame unique row labels instead of two
# overlapping 0..n ranges.
split_frames = [
    pd.read_csv(os.path.join(data_dir, "trial", split_name, f"{lang}.tsv"), sep="\t")
    for split_name in ("train", "test")
]
data_df = (
    pd.concat(split_frames, ignore_index=True)
    .rename(columns={"is_variable": "label"})
)

Text Preprocessor

In [5]:
def text_preprocess(ds: pd.Series, language: str = "english") -> pd.Series:
    """Normalise a collection of texts for bag-of-words vectorisation.

    Each text is converted with ``str``, every character outside ``a-z``/``A-Z``
    is replaced by a space, the result is lower-cased and split on whitespace,
    stop words and single-character tokens are dropped, and the remaining
    tokens are lemmatised with ``WordNetLemmatizer`` and re-joined with spaces.

    The input is NOT modified; a new container is returned.

    Args:
        ds: A pandas Series or any other iterable of texts (e.g. a list).
        language: Name of the NLTK stop-word list to use. Defaults to
            ``"english"``, which is what this function always used.

    Returns:
        A Series with the same index and name as ``ds`` when ``ds`` is a
        Series, otherwise a list, holding one cleaned string per input text.

    Note:
        The character filter keeps ASCII letters only, so letters such as
        ``ä``/``ß`` are treated as separators.
        NOTE(review): with lang = "de" this, together with the English stop
        words, may not be what is intended — confirm.
    """
    # Build these once per call rather than once per token/document.
    stop_words = frozenset(stopwords.words(language))
    lemmatizer = WordNetLemmatizer()

    def _clean(text) -> str:
        # Retain only alphabetic characters, then tokenise on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()
        # Drop stop words and one-letter tokens, then group word forms.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words and len(token) > 1
        )

    # Iterate over values (not labels) so a Series with any index works.
    cleaned = [_clean(text) for text in ds]

    if isinstance(ds, pd.Series):
        return pd.Series(cleaned, index=ds.index, name=ds.name, dtype=object)
    return cleaned

Create Bag of Words Matrices

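In each cross-validation fold, the BoW vocabulary is fitted on the training split of the variable detection dataset. The same vocabulary is then used to build the count matrices for both the test sentences and the variable questions.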

In [6]:
def _parse_variable_annotation(annotation: str) -> dict:
    """Convert one comma-separated ``variable`` annotation into relevance labels.

    Every comma-separated entry is keyed by ``"v"`` plus the text before its
    first ``-``. Entries containing "yes" (case-insensitive) map to 1, entries
    containing "no" map to 0, and entries containing neither are dropped.
    """
    labels = {}
    for entry in annotation.split(","):
        variable_id = "v" + entry.split("-")[0]
        lowered = entry.lower()
        if "yes" in lowered:
            labels[variable_id] = 1
        elif "no" in lowered:
            labels[variable_id] = 0
    return labels


X_idx = data_df.index.to_numpy()

kf = KFold(n_splits=n_splits)

# Metric name -> list with one value per fold.
all_res = defaultdict(list)

# The variable questions are the same in every fold, so clean them once.
# A copy is passed so variable_df keeps its raw text.
variable_ids = variable_df["id"].to_list()
variable_texts = text_preprocess(variable_df["question"].copy())

# Train model with cross-validation
print("Training models with cross-validation...")
for train_index, test_index in tqdm(kf.split(X_idx), total=n_splits):
    train_dataset = datasets.Dataset.from_pandas(data_df.iloc[train_index])
    test_dataset = datasets.Dataset.from_pandas(data_df.iloc[test_index])

    # Fit the vocabulary on the training sentences only, then project the
    # variable questions and the held-out sentences into that vocabulary.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text_preprocess(list(train_dataset["text"]))).toarray()
    vX = vectorizer.transform(variable_texts).toarray()
    tX = vectorizer.transform(text_preprocess(list(test_dataset["text"]))).toarray()
    assert len(tX) == len(test_dataset)
    assert len(vX) == len(variable_df)

    # === Run: for each test sentence, a score for every variable ===
    # One call yields the full (n_test, n_variables) similarity matrix.
    similarity = cosine_similarity(tX, vX)
    results = {
        sentence_id: dict(zip(variable_ids, sentence_scores.tolist()))
        for sentence_id, sentence_scores in zip(test_dataset["uuid"], similarity)
    }

    # === Qrels: gold relevance labels for the test sentences ===
    qrels = {}
    for row in test_dataset:
        # Sentences annotated as "No"/"NoSkip" carry no variable labels.
        if row["variable"] in ["No", "NoSkip"]:
            continue

        labels = _parse_variable_annotation(row["variable"])
        if labels:
            qrels[row["uuid"]] = labels

    # Save outputs (evaluate() reads them back from disk; overwritten every fold)
    with open("./run_bow.json", "w") as fp:
        json.dump(results, fp)

    with open("./qrels_bow.json", "w") as fp:
        json.dump(qrels, fp)

    # Evaluate
    res = evaluate(qrels_path="./qrels_bow.json", run_path="./run_bow.json", metrics_str=DEFAULT_METRICS)
    for k, v in res.items():
        all_res[k].append(v)

Training models with cross-validation...


  0%|          | 0/10 [00:00<?, ?it/s]

{'Rprec': 0.07692, 'P@1': 0.07692, 'P@3': 0.02564, 'P@5': 0.01538, 'P@10': 0.02308, 'P@20': 0.01538, 'recall@1': 0.07692, 'recall@3': 0.07692, 'recall@5': 0.07692, 'recall@10': 0.15385, 'recall@20': 0.19231, 'map@cut@20': 0.09615, 'map@cut@50': 0.10367}
{'Rprec': 0.10526, 'P@1': 0.10526, 'P@3': 0.05263, 'P@5': 0.03158, 'P@10': 0.02105, 'P@20': 0.01316, 'recall@1': 0.10526, 'recall@3': 0.15789, 'recall@5': 0.15789, 'recall@10': 0.21053, 'recall@20': 0.26316, 'map@cut@20': 0.13487, 'map@cut@50': 0.14174}
{'Rprec': 0.06667, 'P@1': 0.06667, 'P@3': 0.04444, 'P@5': 0.04, 'P@10': 0.02667, 'P@20': 0.02, 'recall@1': 0.06667, 'recall@3': 0.13333, 'recall@5': 0.2, 'recall@10': 0.23333, 'recall@20': 0.31667, 'map@cut@20': 0.11475, 'map@cut@50': 0.11765}
{'Rprec': 0.11538, 'P@1': 0.15385, 'P@3': 0.05128, 'P@5': 0.04615, 'P@10': 0.03846, 'P@20': 0.03462, 'recall@1': 0.11538, 'recall@3': 0.11538, 'recall@5': 0.19231, 'recall@10': 0.30769, 'recall@20': 0.5, 'map@cut@20': 0.16615, 'map@cut@50': 0.16945

In [7]:
import statistics

# Summarise every metric over the folds: mean, sample std. and population std.
print("***** Cross-Validation Results *****")
for metric_name, fold_values in all_res.items():
    fold_mean = statistics.mean(fold_values)
    sample_std = statistics.stdev(fold_values)
    population_std = statistics.pstdev(fold_values)
    print(
        metric_name + ":\n",
        "Mean:", round(fold_mean, 4),
        "\tStd.:", round(sample_std, 4),
        "\tPStd:", round(population_std, 4),
    )

***** Cross-Validation Results *****
Rprec:
 Mean: 0.0995 	Std.: 0.0523 	PStd: 0.0496
P_1:
 Mean: 0.1111 	Std.: 0.0681 	PStd: 0.0646
P_3:
 Mean: 0.0541 	Std.: 0.0256 	PStd: 0.0243
P_5:
 Mean: 0.0394 	Std.: 0.0203 	PStd: 0.0192
P_10:
 Mean: 0.0297 	Std.: 0.0116 	PStd: 0.011
P_20:
 Mean: 0.0203 	Std.: 0.0094 	PStd: 0.009
recall_1:
 Mean: 0.0978 	Std.: 0.0531 	PStd: 0.0504
recall_3:
 Mean: 0.1491 	Std.: 0.0652 	PStd: 0.0619
recall_5:
 Mean: 0.1735 	Std.: 0.0754 	PStd: 0.0715
recall_10:
 Mean: 0.2372 	Std.: 0.0801 	PStd: 0.076
recall_20:
 Mean: 0.3141 	Std.: 0.1385 	PStd: 0.1314
map_cut_20:
 Mean: 0.141 	Std.: 0.0556 	PStd: 0.0528
map_cut_50:
 Mean: 0.1472 	Std.: 0.0541 	PStd: 0.0513
