Unnamed: 0,beer/beerId,beer/name,review/text
0,47986,Sausa Weizen,A lot of foam. But a lot.\tIn the smell some b...
1,48213,Red Moon,"Dark red color, light beige foam, average.\tIn..."
2,48215,Black Horse Black Beer,"Almost totally black. Beige foam, quite compac..."
3,47969,Sausa Pils,"Golden yellow color. White, compact foam, quit..."
4,64883,Cauldron DIPA,"According to the website, the style for the Ca..."


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the review file (expected to be pre-filtered to the top 20 beers), keep
# only the review text and the beer name, and drop rows missing either one.
df = (
    pd.read_csv("../data/beer_reviews_20.csv")
    .loc[:, ['review/text', 'beer/name']]
    .dropna()
)

# Map every beer name to an integer class id; the fitted encoder is kept so the
# ids can be translated back to names later on.
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df['beer/name']))

# Stratified hold-out: 20% of all rows become the test split ...
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
# ... and 10% of the remaining rows become the validation split.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    stratify=train_val_df['label'],
    random_state=42,
)


In [2]:
from torch.utils.data import Dataset

class BeerReviewDataset(Dataset):
    """Torch dataset that tokenizes beer reviews lazily, one row per access.

    Args:
        df: DataFrame with a 'review/text' column (the input text) and a
            'label' column (the class id).
        tokenizer: Callable tokenizer; it is invoked with the review text and
            the keyword arguments truncation, padding, max_length and
            return_tensors, and must return a mapping containing
            'input_ids' and 'attention_mask'.
        max_len: Length every example is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists give cheap positional access in __getitem__.
        self.texts = df['review/text'].tolist()
        self.labels = df['label'].tolist()

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return it together with its label."""
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # squeeze(0) drops the leading size-one dimension of each tensor so a
        # single example is returned rather than a batch of one.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample


In [3]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Size the classification head from the fitted label encoder instead of
# hard-coding 20, so the model always agrees with the labels actually present
# in the data. The id <-> name mappings are stored in the model config so a
# saved checkpoint can report beer names directly.
class_names = [str(name) for name in label_encoder.classes_]
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
)

# Freeze the whole DistilBERT encoder for speed. Only parameters under
# `model.distilbert` are touched here, so the classification head is not frozen.
for param in model.distilbert.parameters():
    param.requires_grad = False

# Unfreeze the last 2 transformer layers (layer.4 and layer.5).
for i in [4, 5]:
    for param in model.distilbert.transformer.layer[i].parameters():
        param.requires_grad = True

# Also unfreeze the embeddings so they are fine-tuned as well.
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = True

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch
from transformers import Trainer, TrainingArguments

# Wrap each split so the Trainer receives tokenized examples.
train_dataset = BeerReviewDataset(train_df, tokenizer)
val_dataset = BeerReviewDataset(val_df, tokenizer)
test_dataset = BeerReviewDataset(test_df, tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    # Restore the best checkpoint once training finishes.
    load_best_model_at_end=True,
    # Mixed precision speeds up training on a CUDA GPU. Enable it only when
    # CUDA is available so the notebook still runs on CPU-only machines
    # instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [5]:
# Fine-tune the model; the returned training summary is kept for inspection.
train_output = trainer.train()

# Score the model on the Trainer's eval dataset (the validation split). Leaving
# the metrics dict as the last expression makes the notebook display it.
eval_metrics = trainer.evaluate()
eval_metrics

Epoch,Training Loss,Validation Loss
1,1.1855,1.0209
2,0.8907,0.933728
3,0.6102,0.958769


{'eval_loss': 0.9337276220321655,
 'eval_runtime': 6.0305,
 'eval_samples_per_second': 689.99,
 'eval_steps_per_second': 10.944,
 'epoch': 3.0}

In [6]:
from sklearn.metrics import classification_report

# Run inference on the held-out test split and take the highest-scoring class
# for each example.
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

# Pass the full list of class ids explicitly: without `labels`, scikit-learn
# infers the classes from the data, and `target_names` would be misaligned (or
# rejected) if any beer were missing from both the test labels and the
# predictions.
print(
    classification_report(
        test_df['label'],
        pred_labels,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    )
)


                                            precision    recall  f1-score   support

                             60 Minute IPA       0.47      0.55      0.51       495
                             90 Minute IPA       0.62      0.57      0.60       658
                      Arrogant Bastard Ale       0.67      0.86      0.75       541
                        Bell's Hopslam Ale       0.81      0.63      0.71       488
            Brooklyn Black Chocolate Stout       0.72      0.73      0.72       489
                                     Duvel       0.86      0.81      0.83       490
                  Founders Breakfast Stout       0.87      0.77      0.82       500
                              HopDevil Ale       0.60      0.55      0.57       460
                           La Fin Du Monde       0.83      0.87      0.85       496
       Old Rasputin Russian Imperial Stout       0.51      0.70      0.59       622
                           Pliny The Elder       0.65      0.79      0.71  

In [None]:
import os

# os._exit() terminates the kernel process immediately, which throws away
# `model`, `tokenizer` and `label_encoder`. Because this cell sits before the
# cells that save them, running the notebook top to bottom would never reach
# the save step. The shutdown is therefore opt-in: flip the flag only after the
# artifacts below have been written.
SHUTDOWN_KERNEL = False
if SHUTDOWN_KERNEL:
    os._exit(0)

: 

In [7]:
# Write the fine-tuned model to disk; safe_serialization=False is kept so the
# weights are stored in the non-safetensors format.
model.save_pretrained(save_directory="../saved_model", safe_serialization=False)

# Store the tokenizer files next to the model. The call returns the written
# file paths, which the notebook displays as the cell output.
tokenizer.save_pretrained(save_directory="../saved_model/")

('../saved_model/tokenizer_config.json',
 '../saved_model/special_tokens_map.json',
 '../saved_model/vocab.txt',
 '../saved_model/added_tokens.json',
 '../saved_model/tokenizer.json')

In [8]:
import joblib
# Persist the fitted label encoder alongside the model so predicted class ids
# can be mapped back to beer names at inference time.
joblib.dump(value=label_encoder, filename="../saved_model/label_encoder.pkl")

['../saved_model/label_encoder.pkl']