In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

In [None]:
# Fetch the NLTK resources used by the preprocessing cell below.
# `stopwords` backs the stopword filter; WordNetLemmatizer reads the `wordnet`
# corpus and raises LookupError on first use when that corpus is not installed,
# so it has to be downloaded too for a clean Restart & Run All.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): some NLTK releases also need the Open Multilingual WordNet data
# to load `wordnet`; downloading it is harmless when it is not required.
nltk.download('omw-1.4')

In [None]:
# === Set your hyperparameters here ===

# Dataset selector: must be one of "en" or "de".
# It is interpolated into the train/test .tsv file names when the data is loaded.
lang = "en"
assert lang in ("en", "de")

# ================ End ================

Load raw data

In [None]:

# One tab-separated file per split, named after the selected `lang`.
train_data_path = f'../../data/trial/train/{lang}.tsv'
test_data_path = f'../../data/trial/test/{lang}.tsv'

# Read both splits with identical parser settings.
train_df, test_df = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_data_path, test_data_path)
)


In [None]:
def text_preprocess(ds: pd.Series) -> pd.Series:
    """Normalize every entry of a text Series into a cleaned, space-joined string.

    Each value is converted with ``str()``, every character outside ``a-zA-Z``
    is replaced by a space, the text is lower-cased and split on whitespace,
    NLTK English stopwords and single-character tokens are dropped, and the
    remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        ds: Series of raw texts. It is NOT modified; any index is supported.

    Returns:
        A new Series with the same index as ``ds`` holding the cleaned strings.

    Note:
        Values go through ``str()`` first, so a missing value (NaN) becomes the
        literal token ``"nan"`` rather than an empty string.
    """
    # Loop-invariant resources: build the stopword set and the lemmatizer once
    # instead of once per token / once per row.
    stop_words = frozenset(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text) -> str:
        # Retain only alphabetic characters, then tokenize on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()
        # Drop stopwords and one-letter leftovers; group different forms of
        # the same word via lemmatization.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words and len(token) > 1
        )

    # Series.map builds a new Series aligned on the original index, so the
    # caller's data is left untouched and positional/label mix-ups are avoided.
    return ds.map(_clean)

In [None]:
# Replace the raw `text` column of each split with its preprocessed version.
train_df = train_df.assign(text=text_preprocess(train_df['text']))
test_df = test_df.assign(text=text_preprocess(test_df['text']))

In [None]:
# Targets for both splits.
y_train = train_df['is_variable'].values
y_test = test_df['is_variable'].values

# Bag-of-words features: the vocabulary is fitted on the training texts only
# and then reused to encode the test texts.
# NOTE(review): `.toarray()` densifies the sparse count matrices; scikit-learn's
# LogisticRegression also accepts sparse input, so this could be dropped if no
# later cell needs dense arrays — confirm before changing.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['text'].values).toarray()
X_test = vectorizer.transform(test_df['text'].values).toarray()

# Logistic regression with default settings, fitted on the training split.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predictions on the held-out split.
y_pred = classifier.predict(X_test)

# Metrics
print('F1: ', f1_score(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))