# Baseline

In [5]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../src")

In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output")

LABEL_MAPPING_REG = {"negative": -1, "neutral": 0, "positive": 1}
LABEL_MAPPING_CLA = {"negative": 0, "neutral": 1, "positive": 2}

## Load data

In [7]:
def load_data(path, label_mapping=None):
    dataset = pd.read_csv(path, index_col=0)

    if label_mapping is not None:
        dataset["label"] = dataset["label"].map(label_mapping)

    return dataset

train_dataset = load_data(PATH_DATA / "training.csv", label_mapping=LABEL_MAPPING_REG)

In [8]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_dataset["sentence"],
    train_dataset["label"],
    test_size=0.1,
    stratify=train_dataset["label"],
    random_state=0,
)

# Bag-of-words + Logistic Regression baseline

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


class BaselinePipeline():
    """Bag-of-words and logistic regression baseline taken from the provided notebook."""

    def __init__(self):
        # We only keep the 10'000 most frequent words and bigrams (i.e. word pairs)
        # This is both to reduce the computational cost and reduce potential overfitting
        self.vectorizer = CountVectorizer(ngram_range=(1,2), max_features=10000)

        # An Example of another BoW Vectorizer
        # self.other_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english", max_features=10000, min_df=10, max_df=0.9)

        self.model = LogisticRegression(C=1.0, max_iter=100)

    def train(self, train_sentences, train_labels, val_sentences, val_labels):
        # train
        train_embeddings = self.vectorizer.fit_transform(train_sentences)
        self.model.fit(train_embeddings, train_labels)

        # make predictions for training data
        train_predictions = self.model.predict(train_embeddings)
        return train_predictions

    def predict(self, sentences):
        embeddings = self.vectorizer.transform(sentences)
        predictions = self.model.predict(embeddings)
        return predictions

In [6]:
from sklearn.metrics import mean_absolute_error
def evaluate_score(labels, predictions):
    mae = mean_absolute_error(labels, predictions)
    score = 0.5 * (2 - mae)
    return score

pipeline = BaselinePipeline()

train_predictions = pipeline.train(train_sentences, train_labels, val_sentences, val_labels)
score_train = evaluate_score(train_labels, train_predictions)

val_predictions = pipeline.predict(val_sentences)
score_val = evaluate_score(val_labels, val_predictions)

print(f'Evaluation Score (training set): {score_train:.05f}')
print(f'Evaluation Score (validation set): {score_val:.05f}')

Evaluation Score (training set): 0.85254
Evaluation Score (validation set): 0.80299


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Test Data

In [7]:
test_dataset = load_data(PATH_DATA / "test.csv")
test_dataset

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Found Thai Spoon on the Vegan Pittsburgh website.
1,Our bill came out to around $27 and we ate lik...
2,State Farm broke down the costs for me of the ...
3,The only con for this resto is the wait to get...
4,We could hear the people above us stomping aro...
...,...
11946,I went back in to ask for cilantro dressing th...
11947,"Here , Adrian Lyne comes as close to profundit..."
11948,The actors are so terrific at conveying their ...
11949,It should be mentioned that the set design and...


In [8]:
test_ids = test_dataset.index
test_sentences = test_dataset["sentence"]
test_predictions = pipeline.predict(test_sentences)

submission = pd.DataFrame({"id": test_ids, "label": test_predictions})
submission

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,-1
4,4,1
...,...,...
11946,11946,1
11947,11947,0
11948,11948,1
11949,11949,-1


In [9]:
import os
from sklearn.metrics import mean_absolute_error

def save_submission(path, submission, label_mapping):
    label_mapping_rev = {value: key for key, value in label_mapping.items()}
    submission["label"] = submission["label"].map(label_mapping_rev)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    submission.to_csv(path, index=False)
    print(f"Submission saved to '{path}'.")

def evaluate_score(labels, predictions):
    mae = mean_absolute_error(labels, predictions)
    score = 0.5 * (2 - mae)
    return score

pipeline = BaselinePipeline()

train_predictions = pipeline.train(train_sentences, train_labels, val_sentences, val_labels)
score_train = evaluate_score(train_labels, train_predictions)

val_predictions = pipeline.predict(val_sentences)
score_val = evaluate_score(val_labels, val_predictions)

print(f'Evaluation Score (training set): {score_train:.05f}')
print(f'Evaluation Score (validation set): {score_val:.05f}')


test_dataset = load_data(PATH_DATA / "test.csv")

test_ids = test_dataset.index
test_sentences = test_dataset["sentence"]
test_predictions = pipeline.predict(test_sentences)

submission = pd.DataFrame({"id": test_ids, "label": test_predictions})
save_submission(PATH_OUTPUT / "submissions" / "submission.csv", submission, LABEL_MAPPING_REG)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation Score (training set): 0.85254
Evaluation Score (validation set): 0.80299
Submission saved to '../output/submissions/submission.csv'.
