In [None]:
import os
import re
import statistics
from collections import defaultdict

import datasets
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [None]:
# Fetch the NLTK data packages this notebook relies on (stopword lists and
# WordNet data are used by the text preprocessing step).
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# === Set your hyperparameters here ===

# Choose the dataset to use from ["en", "de"]
lang = "en"
assert lang in ["en", "de"], f"Unsupported lang {lang!r}; expected 'en' or 'de'"

# Directory where data is stored (a relative path is resolved against the
# notebook's working directory)
data_dir = "../../data/"
# Fail here, before any loading cell runs, if the data directory is missing.
assert os.path.isdir(data_dir), f"Data directory not found: {data_dir!r}"

# Metric averaging strategy, passed as `average=` to precision / recall / F1
average = "macro"

# Cross-validation n splits (KFold needs at least 2 folds)
n_splits = 10
assert isinstance(n_splits, int) and n_splits >= 2, "n_splits must be an integer >= 2"

# ================ End ================

Load raw data

In [None]:

# Build the TSV paths from the configured `data_dir` and `lang` so that editing
# the configuration cell actually changes which files are loaded.
train_data_path = os.path.join(data_dir, "trial", "train", f"{lang}.tsv")
test_data_path = os.path.join(data_dir, "trial", "test", f"{lang}.tsv")

# Both files are tab-separated.
train_df = pd.read_csv(train_data_path, sep="\t")
test_df = pd.read_csv(test_data_path, sep="\t")


In [None]:
def text_preprocess(ds: pd.Series) -> pd.Series:
    """Return a cleaned copy of a text column, ready for bag-of-words vectorizing.

    Each entry is processed as follows:

    1. every character outside ``a-z`` / ``A-Z`` is replaced by a space,
    2. the text is lower-cased and split on whitespace,
    3. English stopwords and single-character tokens are dropped,
    4. the remaining tokens are lemmatized with ``WordNetLemmatizer``,
    5. the tokens are joined back together with single spaces.

    Missing values (``NaN`` / ``None``) become the empty string rather than
    the literal token ``"nan"``.

    The input Series is not modified; the result is a new Series with the same
    index, so it works for any index (not only a default 0..n-1 one).

    Args:
        ds: Series of raw texts. Non-string entries are converted with ``str``.

    Returns:
        A Series of cleaned, space-joined token strings aligned with ``ds``.

    Note:
        The regex and the stopword list are English-specific: with German input
        the ``[^a-zA-Z]`` filter also strips umlauts and ``ß``.
        NOTE(review): the notebook's ``lang`` setting is not consulted here;
        confirm that is intended before running with ``lang = "de"``.
    """
    # Build these once per call: reading the stopword corpus and constructing
    # the lemmatizer for every token / row is needlessly expensive.
    stop_words = frozenset(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text) -> str:
        """Clean a single document and return its space-joined lemmas."""
        if pd.isna(text):
            return ''
        tokens = re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()  # Retain only alphabets
        return ' '.join(
            lemmatizer.lemmatize(token)  # Group different forms of the same word
            for token in tokens
            if token not in stop_words and len(token) > 1
        )

    # `map` is positional-agnostic and returns a new Series, so the caller's
    # data is left untouched and the original index is preserved.
    return ds.map(_clean)

In [None]:
# Clean the `text` column of both splits with the shared preprocessing routine.
for split_df in (train_df, test_df):
    split_df['text'] = text_preprocess(split_df['text'])

In [None]:
# Pool the already-preprocessed train and test frames into one cross-validation
# set. Re-reading the TSVs here would silently discard the cleaning applied to
# `text` in the preprocessing cell. A separate name (`cv_df`) keeps this cell
# idempotent on re-run, and `ignore_index=True` avoids duplicate index labels.
cv_df = pd.concat([train_df, test_df], ignore_index=True).rename(
    columns={"is_variable": "label"}
)

texts = cv_df["text"].to_numpy()
y = cv_df["label"].to_numpy()

# NOTE(review): folds are contiguous and unshuffled. If the pooled rows are
# ordered (e.g. by label or by source file), consider
# KFold(..., shuffle=True, random_state=<seed>) — confirm against the data.
kf = KFold(n_splits=n_splits)

# Maps metric name -> list of per-fold scores; read by the results cell.
scores = defaultdict(list)

# Train model with cross-validation
print("Training models with cross-validation...")
for train_index, test_index in tqdm(kf.split(texts), total=n_splits):
    y_train, y_test = y[train_index], y[test_index]

    # The vocabulary is fitted on the training fold only; the held-out fold is
    # merely transformed with it. The matrices stay sparse: LogisticRegression
    # accepts sparse input, so densifying with `.toarray()` only costs memory.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(texts[train_index])
    X_test = vectorizer.transform(texts[test_index])

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Metrics
    scores["accuracy"].append(accuracy_score(y_test, y_pred))
    scores["precision"].append(precision_score(y_test, y_pred, average=average))
    scores["recall"].append(recall_score(y_test, y_pred, average=average))
    scores["f1"].append(f1_score(y_test, y_pred, average=average))

In [None]:
# import jsonlines

# Report the mean and spread of each tracked metric across the folds
print("***** Cross-Validation Results *****")
for metric_name, fold_values in scores.items():
    # Only keys whose name contains one of these metric tags are reported
    is_reported = any(
        tag in metric_name for tag in ["accuracy", "precision", "recall", "f1"]
    )
    if not is_reported:
        continue

    fold_mean = statistics.mean(fold_values)
    sample_std = statistics.stdev(fold_values)       # sample standard deviation
    population_std = statistics.pstdev(fold_values)  # population standard deviation

    print(
        metric_name + ":\n",
        "Mean:",
        round(fold_mean, 4),
        "\tStd.:",
        round(sample_std, 4),
        "\tPStd:",
        round(population_std, 4),
    )

    # results_file = "./results.jsonl"
    # with jsonlines.open(results_file, "a") as writer:
    #     writer.write({metric_name: {"Mean": fold_mean, "Std": sample_std, "PStd": population_std}})