# Baseline

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked
# up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Put ../src on the import path; the project-local imports in the next cell
# (cache, utils, ...) are presumably resolved from there.
sys.path.append("../src")

In [2]:
from pathlib import Path

from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split

from cache import CACHE
from utils import (
    evaluate_score,
    load_data,
    save_predictions,
    setup_logging,
)

# Project directories, given relative to this notebook.
PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output")

# Configure logging first, then initialise the shared cache in a sub-directory
# of the output folder.
setup_logging()
CACHE.init(cache_dir=PATH_OUTPUT.joinpath("cache"))

[2025-05-21 15:13:22] INFO     cache:21    Cache directory: ../output/cache


## Load data

In [None]:
train_dataset = load_data(PATH_DATA / "training.csv")
test_dataset = load_data(PATH_DATA / "test.csv")

# Drop training sentences longer than a threshold (measured in characters).
# `.str.len()` is vectorised and always produces a boolean mask, even for an
# empty frame. A missing sentence has length NaN, compares as False, and is
# therefore dropped (and counted in the message below) as well.
max_sentence_length = 512
mask = train_dataset["sentence"].str.len() <= max_sentence_length
print(f"Removing {len(train_dataset) - mask.sum()} sentences longer than {max_sentence_length} characters.")
train_dataset = train_dataset[mask]

Removing 13 sentences longer than 512 characters


In [22]:
import nltk

# Load the stopword list: one entry per line, surrounding whitespace stripped,
# blank lines skipped. The encoding is given explicitly so the result does not
# depend on the platform's default text encoding.
with open(PATH_DATA / "stopwords.txt", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    stopwords = {word for word in stripped_lines if word}

# Download the NLTK resources (a no-op when they are already up to date).
# NOTE(review): presumably required by the tokenisation / lemmatisation code in
# ../src - confirm which of these are actually needed.
for resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/samuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/samuel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/samuel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# Hold out 10% of the (length-filtered) training data for validation.
# The split is stratified on the label and seeded, so it is the same on every run.
labels = train_dataset["label"]
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_dataset["sentence"],
    labels,
    test_size=0.1,
    random_state=0,
    stratify=labels,
)

## Train pipeline

In [None]:
# Each config below is a YAML document describing one pipeline variant:
#   label_mapping  - how labels are mapped for the model
#   vectorizer     - text vectoriser and its parameters
#   model          - estimator type and its parameters
#   preprocessing  - optional list of text-cleaning steps
# The variant that is actually trained is selected at the bottom of this cell.

### baseline

# Baseline: bag of words (1- to 3-grams) + logistic regression, no preprocessing.
# max_iter is raised from 100 to 1000: with 100 the solver stopped at the
# iteration limit and scikit-learn emitted a ConvergenceWarning.
config_bow_logreg = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 1000
"""

# Same baseline as above, with the text preprocessing steps enabled.
config_bow_logreg_pp = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 1.0
    max_iter: 1000

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - lowercase
"""


### tree-based models

# Bag of words + random forest, with preprocessing.
# NOTE: despite "glove" in the variable name, this config uses CountVectorizer;
# the name is kept so existing references keep working.
# NOTE(review): no random_state is set, so results may differ between runs -
# consider adding one if the pipeline forwards it to the estimator.
config_glove_rf_pp = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: RandomForestClassifier
    n_estimators: 100
    max_depth: 75
    n_jobs: -1

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - lowercase
"""

# Bag of words + gradient boosting, with preprocessing.
# The GloVe vectoriser the variable name refers to is commented out inside the
# YAML; the active vectoriser is CountVectorizer. The string is an f-string only
# so that the commented-out GloVe path is filled in from PATH_DATA.
# NOTE(review): no random_state is set here either (see the note above).
config_glove_gb = f"""
label_mapping: classification

# vectorizer:
#     type: GloVe
#     path: {PATH_DATA}/glove.twitter.27B/glove.twitter.27B.25d.txt

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: GradientBoostingClassifier
    n_estimators: 100
    max_depth: 10

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - lowercase
"""

# Select the variant to train; swap in one of the other config_* strings to
# run a different experiment.
config = OmegaConf.create(config_glove_gb)

In [59]:
from pipelines.classical_ml import ClassicalMLPipeline

# Build the pipeline described by `config` and fit it. `train` hands back the
# predictions for the training and the validation sentences.
pipeline = ClassicalMLPipeline(config)
train_predictions, val_predictions = pipeline.train(
    train_sentences,
    train_labels,
    val_sentences,
    val_labels,
)

# Score both splits against their true labels.
score_train = evaluate_score(train_labels, train_predictions)
score_val = evaluate_score(val_labels, val_predictions)

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous

  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous

  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous

  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights

  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights

  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

  ret = a @ b

  ret = a @ b

  ret = a @ b

  ret = a @ b

  ret = a @ b

  ret = a @ b

Evaluation Score (training set): 0.76012
Evaluation Score (validation set):

## Make predictions on test data

In [None]:
# Predict a label for every test sentence with the trained pipeline, and keep
# the ids (the test frame's index) next to the predictions.
test_predictions = pipeline.predict(test_dataset["sentence"])
test_ids = test_dataset.index

# Show the predictions as the cell output.
test_predictions

id
0         neutral
1         neutral
2         neutral
3        negative
4        positive
           ...   
11946    positive
11947     neutral
11948    positive
11949    negative
11950    negative
Length: 11951, dtype: object

In [None]:
# Write the test ids and their predicted labels to the submission file.
submission_path = PATH_OUTPUT / "submissions" / "submission.csv"
save_predictions(submission_path, test_ids, test_predictions)

[2025-05-21 13:55:12] INFO     utils:103   Submission saved to '../output/submissions/submission.csv'.
