# Baseline

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to the project sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add "../src" to the import search path. The path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): presumably `cache`, `utils` and `pipelines` (imported below)
# live in that directory -- confirm against the repository layout.
sys.path.append("../src")

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split

from cache import CACHE
from utils import (
    evaluate_score,
    load_data,
    save_predictions,
    setup_logging,
)

# Input and output directories (relative paths, resolved against the working directory).
PATH_DATA, PATH_OUTPUT = Path("../data"), Path("../output")

# Initialise logging through the project helper, then point the shared cache
# at a "cache" sub-directory of the output directory.
setup_logging()
CACHE.init(cache_dir=PATH_OUTPUT.joinpath("cache"))

## Load data

In [None]:
# Load the training and test CSV files from the data directory.
train_dataset, test_dataset = (
    load_data(PATH_DATA / file_name) for file_name in ("training.csv", "test.csv")
)

# Keep only training sentences whose character count is within the limit.
max_sentence_length = 512
mask = train_dataset["sentence"].map(len) <= max_sentence_length
num_removed = len(train_dataset) - mask.sum()
print(f"Removing {num_removed} sentences longer than {max_sentence_length} characters.")
train_dataset = train_dataset[mask]

In [None]:
import nltk

# Fetch the NLTK resources used by this notebook, in a fixed order.
for nltk_resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

In [None]:
# Hold out 10% of the training data for validation. The split is stratified on
# the label column and seeded so it is reproducible across runs.
all_labels = train_dataset["label"]
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_dataset["sentence"], all_labels, test_size=0.1, stratify=all_labels, random_state=0
)

## Train pipeline

In [None]:
# Experiment configurations, written as YAML strings. Each base config selects a
# label mapping, a vectorizer and a model; the matching `*_pp` variant is the
# base string with the shared preprocessing fragment `pp_all` appended.

# Shared YAML fragment listing the text preprocessing steps. It starts with
# blank lines so it can be concatenated directly onto a base config string.
pp_all = """\n
preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - remove_stopwords
    - lowercase
    - lemmatize
"""


### logistic regression

# baseline config using bag of words and logistic regression (no preprocessing)
config_bow_logreg = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: LogisticRegression
    C: 0.2
    max_iter: 200
"""
# same config with the preprocessing steps appended
config_bow_logreg_pp = config_bow_logreg + pp_all


### tree-based models

# config using bag of words and random forest (no preprocessing)
config_bow_rf= """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 10000

model:
    type: RandomForestClassifier
    n_estimators: 150
    max_depth: 220
    n_jobs: -1
"""
# same config with the preprocessing steps appended
config_bow_rf_pp = config_bow_rf + pp_all

# config using bag of words and an XGBoost classifier (no preprocessing;
# the `_pp` variant below is the one with preprocessing)
config_bow_xgboost_c = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: XGBClassifier
    n_estimators: 100
    max_depth: 25
    n_jobs: -1
"""
config_bow_xgboost_c_pp = config_bow_xgboost_c + pp_all

# config using bag of words and an XGBoost regressor; this is the only config
# here with `label_mapping: regression`, which triggers the threshold search
# in the training cell of this notebook
config_bow_xgboost_r = """
label_mapping: regression

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: XGBRegressor
    n_estimators: 100
    max_depth: 25
    n_jobs: -1
"""
config_bow_xgboost_r_pp = config_bow_xgboost_r + pp_all


### SVMs

# config using bag of words and SVC (no preprocessing; the `_pp` variant below
# is the one with preprocessing)
# NOTE(review): `percent_train_samples: 0.05` presumably makes the pipeline
# train on a 5% subsample -- confirm against the pipeline implementation.
config_bow_svc = """
label_mapping: classification

percent_train_samples: 0.05

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: SVC
    C: 1.0
    kernel: rbf
    verbose: True
"""
config_bow_svc_pp = config_bow_svc + pp_all

# config using bag of words and SVC wrapped in a OneVsRestClassifier
# (no `percent_train_samples` key here, unlike the plain SVC config above)
config_bow_svc_ovr = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: OneVsRestClassifier
    n_jobs: -1
    verbose: True
    estimator:
        type: SVC
        C: 1.0
        kernel: rbf
"""
config_bow_svc_ovr_pp = config_bow_svc_ovr + pp_all


### Stacking models

# config stacking logistic regression, XGBoost and SVC on bag-of-words features.
# NOTE(review): the preprocessing list is written out inline here instead of
# appending `pp_all`; the two lists are currently identical and must be kept
# in sync by hand.
config_bow_classification_stack = """
label_mapping: classification

vectorizer:
    type: CountVectorizer
    ngram_range: [1, 3]
    max_features: 100000

model:
    type: StackingClassifier
    n_jobs: -1

    estimators:
        - type: LogisticRegression
          C: 1.0
          max_iter: 100
          n_jobs: -1

        - type: XGBClassifier
          n_estimators: 100
          max_depth: 25
          n_jobs: -1

        - type: SVC
          C: 0.7
          kernel: rbf

preprocessing:
    - clean_whitespaces
    - internet
    - punctuation
    - contractions
    - remove_repeated_chars
    - remove_special_chars
    - remove_stopwords
    - lowercase
    - lemmatize
"""


# Select the experiment to run: parse the chosen YAML string into an OmegaConf
# config object. Swap the variable passed here to run a different configuration.
config = OmegaConf.create(config_bow_svc_ovr_pp)

In [None]:
from pipelines.classical_ml import ClassicalMLPipeline


def _apply_thresholds(scores, threshold_pos, threshold_neg):
    """Turn continuous predictions into sentiment labels.

    Args:
        scores: Predictions exposing an element-wise ``.apply`` (as returned by
            the pipeline for the regression label mapping).
        threshold_pos: Values strictly greater than this become "positive".
        threshold_neg: Values strictly smaller than this become "negative".

    Returns:
        The result of ``scores.apply`` with every value replaced by
        "positive", "negative" or "neutral".
    """
    return scores.apply(
        lambda x: "positive" if x > threshold_pos else ("negative" if x < threshold_neg else "neutral")
    )


# Build the pipeline described by the selected config and report its components.
pipeline = ClassicalMLPipeline(config=config)
print(f"Using vectorizer: {pipeline.vectorizer.__class__.__name__}")
print(f"Using model: {pipeline.model.__class__.__name__}")

# Fit on the training split; predictions for both splits are returned.
train_predictions, val_predictions = pipeline.train(train_sentences, train_labels, val_sentences, val_labels)

if config.label_mapping == "regression":
    # Grid-search the two cut-offs that convert regression outputs into labels,
    # keeping the pair with the best validation score.
    threshold_step = 0.1
    threshold_pos_values = np.arange(0, 1, threshold_step)
    threshold_neg_values = np.arange(-1, 0, threshold_step)
    heatmap = np.zeros((len(threshold_neg_values), len(threshold_pos_values)))

    # Start below any attainable score so the first evaluated pair always
    # becomes the incumbent. Starting at 0 left the best thresholds undefined
    # (NameError below and in the test cell) whenever no score exceeded 0.
    score_train, score_val = -np.inf, -np.inf
    best_threshold_pos, best_threshold_neg = None, None
    for i, threshold_pos in enumerate(threshold_pos_values):
        for j, threshold_neg in enumerate(threshold_neg_values):
            train_predictions_ = _apply_thresholds(train_predictions, threshold_pos, threshold_neg)
            val_predictions_ = _apply_thresholds(val_predictions, threshold_pos, threshold_neg)

            score_train_ = evaluate_score(train_labels, train_predictions_)
            score_val_ = evaluate_score(val_labels, val_predictions_)
            # Rows index the negative threshold, columns the positive one.
            heatmap[j, i] = score_val_

            # Strict comparison: on ties the earliest pair in iteration order wins.
            if score_val_ > score_val:
                score_train = score_train_
                score_val = score_val_
                best_threshold_pos = threshold_pos
                best_threshold_neg = threshold_neg

    # Only reachable if no score compared greater than -inf (e.g. all NaN).
    if best_threshold_pos is None or best_threshold_neg is None:
        raise ValueError("Threshold search produced no comparable validation score.")

    print(f"Best thresholds: positive={best_threshold_pos:.2f}, negative={best_threshold_neg:.2f}")

    # `extent` gives the outer edges of the image, so pad by half a grid step
    # on every side; each cell is then centred on the threshold it represents.
    half_step = threshold_step / 2
    plt.figure(figsize=(8, 6))
    im = plt.imshow(
        heatmap,
        aspect="auto",
        origin="lower",
        extent=[
            threshold_pos_values[0] - half_step,
            threshold_pos_values[-1] + half_step,
            threshold_neg_values[0] - half_step,
            threshold_neg_values[-1] + half_step,
        ],
        cmap="viridis",
    )
    plt.colorbar(im, label='Validation Score')
    plt.xlabel('threshold_pos')
    plt.ylabel('threshold_neg')
    plt.title('Validation Score Heatmap')
    plt.show()
elif config.label_mapping == "classification":
    # Predictions are already labels; score them directly.
    score_train = evaluate_score(train_labels, train_predictions)
    score_val = evaluate_score(val_labels, val_predictions)
else:
    raise ValueError(f"Unknown label mapping: {config.label_mapping}")

print(f"Evaluation Score (training set): {score_train:.05f}")
print(f"Evaluation Score (validation set): {score_val:.05f}")

In [None]:
# Keep the test index for the submission and predict on the test sentences.
test_ids = test_dataset.index
test_predictions = pipeline.predict(test_dataset["sentence"])

if config.label_mapping == "regression":

    def _to_label(value):
        """Map one regression output to a label using the best thresholds found during training."""
        if value > best_threshold_pos:
            return "positive"
        if value < best_threshold_neg:
            return "negative"
        return "neutral"

    test_predictions = test_predictions.apply(_to_label)

In [None]:
# Write the test ids and their predictions to the submission CSV.
submission_path = PATH_OUTPUT.joinpath("submissions", "submission.csv")
save_predictions(submission_path, test_ids, test_predictions)