# Baseline

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing each cell so that edits to the project sources are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Make the project's source directory importable. The path is relative, so it
# resolves against the notebook's current working directory.
import sys
sys.path.append("../src")

In [None]:
from pathlib import Path

import pandas as pd
from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split

from cache import CACHE
from utils import (
    evaluate_score,
    load_data,
    save_predictions,
    setup_logging,
)

# Configure logging up front, before any other project code is invoked.
setup_logging()

# Input and output folders, relative to the notebook's working directory.
PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output")

# Initialise the project cache inside a "cache" subfolder of the output folder.
CACHE.init(cache_dir=PATH_OUTPUT.joinpath("cache"))

## Load data

In [None]:
# Read the training and test CSV files from the data folder.
train_dataset = load_data(PATH_DATA.joinpath("training.csv"))
test_dataset = load_data(PATH_DATA.joinpath("test.csv"))

In [None]:
# Hold out 10% of the training data for validation. The split is stratified
# on the label column and uses a fixed seed so it is reproducible.
all_sentences = train_dataset["sentence"]
all_labels = train_dataset["label"]

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    stratify=all_labels,
    random_state=0,
)

## Train pipeline

In [None]:
from pipelines.baseline_bow_logreg import BaselineBowLogreg

# Pipeline settings, written as YAML and parsed by OmegaConf below. The YAML
# text is kept under its own name so that `config` always refers to the parsed
# configuration object rather than to a raw string.
baseline_yaml = """
label_mapping: regression

bow:
    ngram_range: [1, 2]
    max_features: 10000

logreg:
    C: 1.0
    max_iter: 100
"""
config = OmegaConf.create(baseline_yaml)
pipeline = BaselineBowLogreg(config)

# `train` returns predictions for both the training and the validation split.
train_predictions, val_predictions = pipeline.train(
    train_sentences,
    train_labels,
    val_sentences,
    val_labels,
)

# Score each split against its true labels.
score_train = evaluate_score(train_labels, train_predictions)
score_val = evaluate_score(val_labels, val_predictions)

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

In [None]:
from pipelines.pretrained_classifier import PretrainedClassifier

# Model to evaluate (presumably a Hugging Face hub identifier - confirm against
# PretrainedClassifier). Uncomment one of the alternatives below to compare.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# model_name = "siebert/sentiment-roberta-large-english"
# model_name = "tabularisai/multilingual-sentiment-analysis"
config = OmegaConf.create({"model": model_name})
pipeline = PretrainedClassifier(config)

# `train` returns predictions for both the training and the validation split.
train_predictions, val_predictions = pipeline.train(
    train_sentences,
    train_labels,
    val_sentences,
    val_labels,
)

# Score each split against its true labels.
score_train = evaluate_score(train_labels, train_predictions)
score_val = evaluate_score(val_labels, val_predictions)

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

## Make predictions on test data

In [None]:
# NOTE: `pipeline` is whichever model cell was executed most recently, so the
# predictions below depend on the notebook's execution order.
test_sentences = test_dataset["sentence"]
test_ids = test_dataset.index
test_predictions = pipeline.predict(test_sentences)

# Bare expression as the last line so the notebook displays the predictions.
test_predictions

In [None]:
# Write the test predictions to the submission file.
submission_path = PATH_OUTPUT / "submissions" / "submission.csv"

# Ensure the "submissions" folder exists before writing. Whether
# `save_predictions` creates it itself is not visible from this notebook;
# creating it here is harmless if it already exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)

save_predictions(submission_path, test_ids, test_predictions)