In [2]:
import pandas as pd

In [3]:
# Load the pre-split review data (paths are relative to the working directory).
# Later cells read the 'review' (text) and 'label' columns of both frames.
train = pd.read_csv('reviews_train.csv')
test = pd.read_csv('reviews_test.csv')

# Peek at a handful of random test rows. The fixed random_state keeps the
# displayed sample identical across "Restart Kernel -> Run All" executions.
test.sample(5, random_state=0)

Unnamed: 0,review,label
651,A little to basic and simple,bad
640,Disappointed to learn that I can view my print...,bad
550,Rubbish\nFull of ads,bad
279,"Always a great read, and on time.",good
975,"Nah, hundreds of ads, no real content. Not the...",bad


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

In [5]:
# Baseline text classifier: bag-of-words counts -> TF-IDF re-weighting -> linear
# model trained with stochastic gradient descent.
sgd_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # SGD training is stochastic; pinning the seed makes the accuracy reported
    # below reproducible from run to run (and is carried over by clone()).
    ('clf', SGDClassifier(random_state=0)),
])

In [6]:
# Fit the whole pipeline on the raw review text and its labels. Assigning the
# result to `_` keeps the fitted-pipeline repr out of the cell output.
_ = sgd_clf.fit(train['review'], train['label'])

Evaluating model accuracy

In [7]:
from sklearn import metrics

In [8]:
def evaluate(clf):
    """Print the accuracy of a fitted classifier on the test set.

    Predictions are made on the 'review' column of the module-level `test`
    DataFrame and compared against its 'label' column. The result is printed
    as a percentage with one decimal place; nothing is returned.

    Args:
        clf: a fitted estimator exposing `predict` that accepts raw review text.
    """
    predicted_labels = clf.predict(test['review'])
    accuracy = metrics.accuracy_score(test['label'], predicted_labels)
    accuracy_pct = 100*accuracy
    print(f'Accuracy: {accuracy_pct:.1f}%')

In [9]:
# Test-set accuracy of the SGD pipeline fitted on the original (uncleaned) training data.
evaluate(sgd_clf)

Accuracy: 76.3%


Trying another model

In [10]:
# Second baseline: the same count + TF-IDF features as the SGD pipeline, but
# with a multinomial Naive Bayes classifier as the final step.

from sklearn.naive_bayes import MultinomialNB

nb_feature_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]
nb_clf = Pipeline(nb_feature_steps + [('clf', MultinomialNB())])

# Train on the original training split, then report accuracy on the test split.
nb_clf.fit(train['review'], train['label'])

evaluate(nb_clf)

Accuracy: 85.3%


Taking a closer look at the training data
Let's actually take a look at some of the training data:

In [11]:
# Display the first rows of the training data (DataFrame.head shows 5 by default).
train.head()

Unnamed: 0,review,label
0,Based on all the negative comments about Taste...,good
1,I still have not received this. Obviously I c...,bad
2,</tr>The magazine is not worth the cost of sub...,good
3,This magazine is basically ads. Kindve worthle...,bad
4,"The only thing I've recieved, so far, is the b...",bad


In [12]:
# Print the very first training row in full: the table view above truncates
# long review text, a plain dict does not.
first_example = train.iloc[0]
print(first_example.to_dict())

{'review': "Based on all the negative comments about Taste of Home, I will not subscribeto the magazine. In the past it was a great read.\nSorry it, too, has gone the 'way of the wind'.<br>o-p28pass4 </br>", 'label': 'good'}


This data point is labeled "good", but it's clearly a negative review. Also, it looks like there's some funny HTML stuff at the end.

Issues in the data
It looks like there are some stray HTML tags in our dataset, and the data points that contain them have nonsense labels. Maybe this dataset was collected by scraping the internet, and the HTML wasn't parsed correctly in all cases.

In [13]:
def is_bad_data(review: str) -> bool:
    """Return True when a review looks like it contains leftover HTML.

    The check is deliberately crude: a review is flagged as soon as it
    contains a ``<`` character anywhere. Every HTML tag contains that
    character, so no tag is missed, but ordinary text that merely uses it
    (for example "price < $5") is flagged too.

    Args:
        review: the raw review text.

    Returns:
        True if ``<`` occurs in the text, False otherwise.
    """
    tag_opener = '<'
    return tag_opener in review

Creating the cleaned training set

In [14]:
# Boolean mask of the training rows whose review text is flagged by
# is_bad_data; the cleaned training set is every row that is NOT flagged.
flagged_rows = train['review'].map(is_bad_data)
train_clean = train[~flagged_rows]

Evaluating a model trained on the clean training set

In [15]:
from sklearn import clone

In [16]:
# clone() builds a new, unfitted pipeline with the same configuration, so the
# pipeline already fitted on the original data (sgd_clf) is left untouched.
sgd_clf_clean = clone(sgd_clf)

In [17]:
# Same training call as for sgd_clf, but using only the rows kept in train_clean.
# Assigning to `_` keeps the fitted-pipeline repr out of the cell output.
_ = sgd_clf_clean.fit(train_clean['review'], train_clean['label'])

In [18]:
# Test-set accuracy of the pipeline fitted on the cleaned training data; the
# test set itself is the same one used for the earlier models.
evaluate(sgd_clf_clean)

Accuracy: 97.1%


Advanced: Training a Transformer model with HuggingFace
Here we demonstrate how to fine-tune a pretrained Transformer network (a variant of BERT) on the original training dataset. You should be able to run the same code on your cleaned-up version of the training data.

If you experience installation issues with the following dependencies, consider using a Colab notebook instead of your own laptop.

In [21]:
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import datasets
from datasets import Dataset, DatasetDict, ClassLabel

RuntimeError: Failed to import transformers.training_args because of the following error (look up to see its traceback):
Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

We first reformat the data so that it is compatible with the HuggingFace Dataset class.

In [22]:
# Integer encoding of the two string labels.
label_map = {"bad": 0, "good": 1}


def _as_hf_dataset(frame):
    """Convert a reviews DataFrame into a HuggingFace Dataset.

    The result has two columns: "label" (the frame's 'label' column mapped
    through `label_map`) and "text" (the frame's raw 'review' strings).
    The same helper can be applied to a cleaned frame such as `train_clean`.
    """
    return Dataset.from_dict({"label": frame["label"].map(label_map), "text": frame["review"].values})


dataset_train = _as_hf_dataset(train)
dataset_test = _as_hf_dataset(test)

NameError: name 'Dataset' is not defined

Let's set some configurations for our model. You can play with these values to try and improve accuracy.

In [23]:
# Identifier of the pretrained network weights to load for fine-tuning on our data.
# It is passed to both AutoTokenizer.from_pretrained and
# AutoModelForSequenceClassification.from_pretrained below.
# Other options you could try: "bert-base-uncased", "bert-base-cased", "google/electra-small-discriminator"
model_name = "distilbert-base-uncased"

# Number of training steps the network is trained for (passed to TrainingArguments as max_steps).
# Deliberately tiny here to keep runtimes short; set it higher if you have a GPU to run this code on.
max_training_steps = 10

# Folder (not a single file) handed to TrainingArguments as output_dir, i.e. where
# the training run writes its outputs.
model_folder = "test_trainer"

Now we can train our Transformer model with the configuration selected above.

In [24]:
# Tokenizer matching the pretrained checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" entries of a batch of examples.

    Called by `Dataset.map(..., batched=True)`, so `examples["text"]` holds
    several texts at once. Padding to "max_length" and truncation are both
    switched on so every encoded example ends up with the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Both splits get the same treatment: tokenize in batches, then cast the
# integer "label" column to a two-class ClassLabel feature.
train_tokenized_dataset = (
    dataset_train
    .map(tokenize_function, batched=True)
    .cast_column("label", ClassLabel(names = ["0", "1"]))
)

test_tokenized_dataset = (
    dataset_test
    .map(tokenize_function, batched=True)
    .cast_column("label", ClassLabel(names = ["0", "1"]))
)

# Training configuration and the pretrained network with a 2-class head.
training_args = TrainingArguments(max_steps=max_training_steps, output_dir=model_folder)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Only a training set is supplied; evaluation on the test split is done
# separately via trainer.predict.
trainer = Trainer(model=model, args=training_args, train_dataset=train_tokenized_dataset)

NameError: name 'dataset_train' is not defined

In [25]:
# Fine-tune the model; the run length is capped by max_training_steps
# (passed to TrainingArguments as max_steps).
trainer.train()  # may take a while to train (try to run on a GPU if you can access one)

NameError: name 'trainer' is not defined

Finally we evaluate the Transformer network's accuracy on our test data.

In [26]:
# Run the fine-tuned model over the tokenized test split.
# NOTE(review): despite the name, `pred_probs` presumably holds the model's raw
# per-class scores (logits) rather than normalised probabilities - confirm
# against the Trainer.predict docs. The argmax below is the same either way.
test_output = trainer.predict(test_tokenized_dataset)
pred_probs = test_output.predictions

# Predicted class = index of the highest score in each row.
pred_classes = np.argmax(pred_probs, axis=1)

# Fraction of test examples whose predicted class differs from the true label.
error_rate = np.mean(pred_classes != test_tokenized_dataset['label'])
print(f"Error rate of predictions: {error_rate}")

NameError: name 'trainer' is not defined