# Baseline

In [19]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

import sys

# Make the project's source directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate "../src" entries on sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from pathlib import Path

from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split

from cache import CACHE
from utils import (
    evaluate_score,
    load_data,
    save_predictions,
    setup_logging,
)

# Project directories, relative to the notebook's working directory.
PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output")

# Configure logging before touching the cache so that the cache's
# initialisation message is emitted after logging has been set up.
setup_logging()

# Cached artefacts live under the output directory.
CACHE.init(cache_dir=PATH_OUTPUT / "cache")

[2025-05-20 21:45:32] INFO     cache:21    Cache directory: ../output/cache


## Load data

In [54]:
# Load the training and test sets from the data directory.
train_dataset = load_data(PATH_DATA / "training.csv")
test_dataset = load_data(PATH_DATA / "test.csv")

# Stopword list: one entry per line, surrounding whitespace stripped and blank
# lines dropped. The encoding is pinned so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes stopwords.txt is UTF-8 (or plain ASCII) — confirm.
with open(PATH_DATA / "stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = {word for word in map(str.strip, f) if word}

In [53]:
# Hold out 10% of the training data for validation. The split is stratified on
# the label column and uses a fixed seed so it is reproducible.
all_sentences = train_dataset["sentence"]
all_labels = train_dataset["label"]

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    stratify=all_labels,
    random_state=0,
)

## Train pipeline

In [None]:
config_bow_logreg = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 100

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_long_sentences
    - remove_repeated_chars
    - remove_special_chars
"""

config_tfidf_logreg = """
label_mapping: regression

vectorizer:
    type: TfidfVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 100

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_long_sentences
    - remove_repeated_chars
    - remove_special_chars
"""



  config_tfidf_logreg = """



In [18]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from pipelines.classical_ml import ClassicalMLPipeline

# Build the pipeline from the TF-IDF + logistic regression YAML config.
config = OmegaConf.create(config_tfidf_logreg)
pipeline = ClassicalMLPipeline(config)

# Fit on the training split; predictions for both splits are returned.
train_predictions, val_predictions = pipeline.train(
    train_sentences,
    train_labels,
    val_sentences,
    val_labels,
)

# Score both splits so the train/validation gap is visible.
score_train = evaluate_score(train_labels, train_predictions)
score_val = evaluate_score(val_labels, val_predictions)

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

{'enabled': True, 'special_chars': True, 'lowercase': True}
spaces
urls
emails
usernames
punctuation/!!
punctuation/??
punctuation/?!
punctuation/!?
contractions/'m
contractions/'re
contractions/'s
contractions/'ve
contractions/'ll
contractions/'d
contractions/'t
repeated_chars
special_chars
spaces
urls
emails
usernames
punctuation/!!
punctuation/??
punctuation/?!
punctuation/!?
contractions/'m
contractions/'re
contractions/'s
contractions/'ve
contractions/'ll
contractions/'d
contractions/'t
repeated_chars
special_chars
spaces
urls
emails
usernames
punctuation/!!
punctuation/??
punctuation/?!
punctuation/!?
contractions/'m
contractions/'re
contractions/'s
contractions/'ve
contractions/'ll
contractions/'d
contractions/'t
repeated_chars
special_chars
spaces
urls
emails
usernames
punctuation/!!
punctuation/??
punctuation/?!
punctuation/!?
contractions/'m
contractions/'re
contractions/'s
contractions/'ve
contractions/'ll
contractions/'d
contractions/'t
repeated_chars
special_chars
spaces
u

KeyboardInterrupt: 

## Make predictions on test data

In [37]:
# Run the trained pipeline on the test sentences, and keep the test set's
# index alongside so each prediction can be matched to its row id when saved.
test_predictions = pipeline.predict(test_dataset["sentence"])
test_ids = test_dataset.index

# Display the predictions as the cell output.
test_predictions

id
0         neutral
1         neutral
2         neutral
3        negative
4        positive
           ...   
11946    positive
11947     neutral
11948    positive
11949    negative
11950    negative
Length: 11951, dtype: object

In [27]:
# Write the test-set predictions, keyed by id, to the submission file.
submission_path = PATH_OUTPUT / "submissions" / "submission.csv"
save_predictions(submission_path, test_ids, test_predictions)

[2025-05-20 21:45:41] INFO     utils:103   Submission saved to '../output/submissions/submission.csv'.
