# Baseline

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../src")

In [2]:
from pathlib import Path

from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split

from cache import CACHE
from utils import (
    evaluate_score,
    load_data,
    save_predictions,
    setup_logging,
)

setup_logging()

PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output")

CACHE.init(cache_dir=PATH_OUTPUT / "cache")

[2025-05-21 18:28:31] INFO     cache:21    Cache directory: ../output/cache


## Load data

In [3]:
train_dataset = load_data(PATH_DATA / "training.csv")
test_dataset = load_data(PATH_DATA / "test.csv")

# remove sentences longer than a threshold
max_sentence_length = 512
mask = train_dataset["sentence"].apply(lambda x: len(x) <= max_sentence_length)
print(f"Removing {len(train_dataset) - mask.sum()} sentences longer than {max_sentence_length} characters.")
train_dataset = train_dataset[mask]

Removing 13 sentences longer than 512 characters.


In [None]:
import nltk

# download nltk data
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /Users/samuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/samuel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/samuel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_dataset["sentence"],
    train_dataset["label"],
    test_size=0.1,
    stratify=train_dataset["label"],
    random_state=0,
)

## Train pipeline

In [21]:
### baseline

# baseline config using bag of words and logistic regression
config_bow_logreg = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 100
"""

# baseline config using bag of words and logistic regression with preprocessing
config_bow_logreg_pp = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 100

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - lowercase
"""


### tree-based models

# config using bag of words and random forest
config_bow_rf= """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: RandomForestClassifier
    n_estimators: 100
    max_depth: 75
    n_jobs: -1

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - lowercase
"""

# config using bag of words and xgboost with preprocessing
config_bow_xgboost = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: XGBClassifier
    n_estimators: 200
    max_depth: 25

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - remove_stopwords
    - lowercase
    - lemmatize
"""

config = OmegaConf.create(config_bow_xgboost)

# vectorizer:
#     type: GloVe
#     path: {PATH_DATA}/glove.twitter.27B/glove.twitter.27B.25d.txt

In [22]:
from pipelines.classical_ml import ClassicalMLPipeline

pipeline = ClassicalMLPipeline(config)

train_predictions, val_predictions = pipeline.train(train_sentences, train_labels, val_sentences, val_labels)
score_train = evaluate_score(train_labels, train_predictions)
score_val = evaluate_score(val_labels, val_predictions)

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

ValueError: Invalid preprocessing rules: {'remove_stopwords'}

<!-- ## Make predictions on test data -->

In [8]:
test_ids = test_dataset.index
test_predictions = pipeline.predict(test_dataset["sentence"])

test_predictions

id
0         neutral
1         neutral
2         neutral
3        negative
4        positive
           ...   
11946     neutral
11947     neutral
11948    positive
11949    positive
11950    positive
Length: 11951, dtype: object

In [9]:
save_predictions(PATH_OUTPUT / "submissions" / "submission.csv", test_ids, test_predictions)

[2025-05-21 18:28:45] INFO     utils:104   Submission saved to '../output/submissions/submission.csv'.
