# Classification 2. Hierarchical approach, second step

In [None]:
# Install Hugging Face transformers, which provides the BertTokenizer / BertModel imported below.
!pip install transformers

In [None]:
from transformers import BertTokenizer
import torch
import numpy as np
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import confusion_matrix

In [None]:
# gdown is the command-line tool used in the next cell to fetch the dataset from Google Drive.
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Download the review dataset by Google Drive file id; the recorded output shows it
# being written to beer_df_large.csv in the working directory.
!gdown https://drive.google.com/uc?id=1bmr04biWB-NWRbRYbO3XBA33ltROqk47

Downloading...
From: https://drive.google.com/uc?id=1bmr04biWB-NWRbRYbO3XBA33ltROqk47
To: /content/beer_df_large.csv
100% 110M/110M [00:02<00:00, 44.0MB/s]


In [None]:
# Load the downloaded reviews. Despite the .csv extension the file is tab-separated.
df = pd.read_csv("beer_df_large.csv", sep="\t")

In [None]:
# Discard the metadata columns that the classifier does not use; per the frame
# displayed afterwards, only Group, Style and Review remain.
df = df.drop(columns=['Name', 'Company', 'Region', 'ABV', 'Avg', 'Rate'])

In [None]:
df

Unnamed: 0,Group,Style,Review
0,Bocks,Bock,From a 12oz bottle into a cocktail glass.\n\nC...
1,Bocks,Bock,"Great relaxing beer. Very mellow, great taste-..."
2,Bocks,Bock,"Appearance: Clear, bright copper color. Frothy..."
3,Bocks,Bock,As a German staying for holidays in the US I c...
4,Bocks,Bock,I would guess this is Shiner's #1 beer. It's p...
...,...,...,...
174679,Wild/Sour Beers,Wild Ale,"A really well done, well balanced sour with an..."
174680,Wild/Sour Beers,Wild Ale,"22oz bottle. Poured out a slightly hazy, brigh..."
174681,Wild/Sour Beers,Wild Ale,"There's lots of apricot in this beer, just a t..."
174682,Wild/Sour Beers,Wild Ale,Poured from a 22oz bomber into my Drie de Font...


In [None]:
# Second step of the hierarchical approach: the beer's group is written into the
# text itself ("Group: <group>. Review: <review>") so the style classifier can
# condition on it. After that the separate Group column is no longer needed.
# NOTE: this cell is not idempotent - re-running it fails because Group is gone.
df['Review'] = [
    'Group: {}. Review: {}'.format(group, review)
    for group, review in zip(df['Group'], df['Review'])
]
df = df.drop(columns='Group')

In [None]:
df

Unnamed: 0,Style,Review
0,Bock,Group: Bocks. Review: From a 12oz bottle into ...
1,Bock,Group: Bocks. Review: Great relaxing beer. Ver...
2,Bock,"Group: Bocks. Review: Appearance: Clear, brigh..."
3,Bock,Group: Bocks. Review: As a German staying for ...
4,Bock,Group: Bocks. Review: I would guess this is Sh...
...,...,...
174679,Wild Ale,Group: Wild/Sour Beers. Review: A really well ...
174680,Wild Ale,Group: Wild/Sour Beers. Review: 22oz bottle. P...
174681,Wild Ale,Group: Wild/Sour Beers. Review: There's lots o...
174682,Wild Ale,Group: Wild/Sour Beers. Review: Poured from a ...


In [None]:
# Tokenizer for the 'bert-base-cased' checkpoint - the same checkpoint name that
# BertClassifier loads its encoder weights from.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Map every style name to an integer class id. Ids follow the order returned by
# value_counts(), i.e. the most frequent style in the data gets id 0.
labels = {
    style_name: class_id
    for class_id, style_name in enumerate(df['Style'].value_counts().index.tolist())
}

In [None]:
labels

{'American Adjunct Lager': 0,
 'Märzen': 1,
 'German Pilsner': 2,
 'European Pale Lager': 3,
 'American Lager': 4,
 'Helles': 5,
 'Russian Imperial Stout': 6,
 'American Imperial Stout': 7,
 'Bohemian / Czech Pilsner': 8,
 'Sweet / Milk Stout': 9,
 'Imperial IPA': 10,
 'Tripel': 11,
 'American Porter': 12,
 'American Stout': 13,
 'American IPA': 14,
 'English Pale Ale': 15,
 'Fruit and Field Beer': 16,
 'Oatmeal Stout': 17,
 'Hefeweizen': 18,
 'Belgian Pale Strong Ale': 19,
 'Pumpkin Beer': 20,
 'Belgian Dark Strong Ale': 21,
 'American Brown Ale': 22,
 'American Blonde Ale': 23,
 'New England IPA': 24,
 'Witbier': 25,
 'Saison': 26,
 'American Pale Ale': 27,
 'Kölsch': 28,
 'English Barleywine': 29,
 'American Amber / Red Ale': 30,
 'American Pale Wheat Beer': 31,
 'Imperial Porter': 32,
 'Wild Ale': 33,
 'American Barleywine': 34,
 'Gose': 35,
 'Scotch Ale / Wee Heavy': 36,
 'Fruited Kettle Sour': 37,
 'Light Lager': 38,
 'Doppelbock': 39,
 'Winter Warmer': 40,
 'Berliner Weisse': 41

In [None]:
len(labels)

116

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized reviews with integer style ids.

    Everything is prepared eagerly in ``__init__``: each entry of ``df['Style']``
    is looked up in the module-level ``labels`` mapping, and each entry of
    ``df['Review']`` is passed through the module-level ``tokenizer`` (padded /
    truncated to 512 tokens, returned as PyTorch tensors).
    """

    def __init__(self, df):
        # Options applied to every review when it is tokenized.
        encode_options = dict(padding='max_length',
                              max_length=512,
                              truncation=True,
                              return_tensors="pt")

        self.labels = list(map(labels.__getitem__, df['Style']))

        encoded_reviews = []
        for review in df['Review']:
            encoded_reviews.append(tokenizer(review, **encode_options))
        self.texts = encoded_reviews

    def classes(self):
        """Return the full list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/10/10 split on Style. First 10% is held out for testing, then
# 0.88889 of the remaining 90% is kept for training (0.9 * 0.88889 ~= 0.8),
# which leaves ~10% of the full data for validation - matching the recorded
# sizes 139746 / 17469 / 17469. The fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, train_size=0.9, random_state=42, stratify=df['Style'])
df_train, df_val = train_test_split(df_train, train_size=0.88889, random_state=42, stratify=df_train['Style'])

print(len(df_train), len(df_val), len(df_test))

139746 17469 17469


In [None]:
class BertClassifier(nn.Module):
    """BERT encoder ('bert-base-cased') followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output before the
            linear layer.
        num_classes: Number of output classes. Defaults to 116, the value that
            was previously hard-coded (``len(labels)`` in this notebook).
    """

    def __init__(self, dropout=0.2, num_classes=116):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the encoder config (768 for bert-base)
        # instead of repeating the constant.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return per-class scores of shape (batch, num_classes).

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; the second element is the
        # pooled sequence representation used for classification.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # Fix: the linear layer previously consumed pooled_output directly, so
        # the dropout layer above had no effect at all.
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU on the class scores clamps every negative logit to 0
        # before CrossEntropyLoss. It is kept so that already-trained checkpoints
        # behave the same; consider dropping it when retraining from scratch.
        final_layer = self.relu(linear_output)
        return final_layer

In [None]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` on ``train_data`` and report validation metrics after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class scores of shape (batch, num_classes). It is updated in place.
        train_data: Frame with 'Style' and 'Review' columns; wrapped in ``Dataset``.
        val_data: Frame of the same layout used for validation only.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of full passes over ``train_data``.
        batch_size: Mini-batch size for both loaders. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        None. One summary line is printed per epoch. The reported losses are
        per-sample averages. The model is left in eval mode when training ends.
    """
    print('Loading data into a dataset...')

    # Named *_set so the local does not shadow this function's own name.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)
    print('Data loaded')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation pass.
        model.train()

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            # Each sample is tokenized on its own, so the collated tensors carry
            # an extra singleton dimension at position 1; drop it for both inputs.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by the sample count below yields a true
            # per-sample average (it was previously too small by a factor of
            # the batch size).
            total_loss_train += batch_loss.item() * train_label.size(0)

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic and not
        # degraded by randomly dropped activations.
        model.eval()

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        # The leading whitespace on the continuation lines is part of the
        # printed string; it is kept so the log layout stays unchanged.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [None]:
# Training configuration and launch. In the recorded run below each epoch took
# roughly an hour and a half, so re-executing this cell is expensive.
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
# train() prints per-epoch metrics and returns nothing; `model` itself holds the
# fine-tuned weights afterwards.
train(model, df_train, df_val, LR, EPOCHS)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading data into a dataset...
Data loaded


100%|██████████| 69873/69873 [1:31:00<00:00, 12.79it/s]


Epochs: 1 | Train Loss:  1.337                 | Train Accuracy:  0.268                 | Val Loss:  0.987                 | Val Accuracy:  0.391


100%|██████████| 69873/69873 [1:30:53<00:00, 12.81it/s]


Epochs: 2 | Train Loss:  0.842                 | Train Accuracy:  0.469                 | Val Loss:  0.759                 | Val Accuracy:  0.510


100%|██████████| 69873/69873 [1:30:59<00:00, 12.80it/s]


Epochs: 3 | Train Loss:  0.700                 | Train Accuracy:  0.541                 | Val Loss:  0.683                 | Val Accuracy:  0.544


100%|██████████| 69873/69873 [1:30:51<00:00, 12.82it/s]


Epochs: 4 | Train Loss:  0.628                 | Train Accuracy:  0.580                 | Val Loss:  0.644                 | Val Accuracy:  0.565


100%|██████████| 69873/69873 [1:30:48<00:00, 12.82it/s]


Epochs: 5 | Train Loss:  0.575                 | Train Accuracy:  0.613                 | Val Loss:  0.631                 | Val Accuracy:  0.568


In [None]:
# Serialises the whole module object (not just its state_dict); loading this file
# later therefore needs the BertClassifier class definition to be available.
torch.save(model, 'beert_model_simple.pt')

In [None]:
def evaluate(model, test_data, batch_size=2):
    """Run ``model`` over ``test_data``, print the accuracy and return the predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class scores of shape (batch, num_classes).
        test_data: Frame with 'Style' and 'Review' columns; wrapped in ``Dataset``.
        batch_size: Mini-batch size for the loader. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        ``(y_pred, y_true)``: two lists with one tensor per batch, holding the
        predicted class ids and the reference class ids respectively.
    """
    y_pred = []
    y_true = []
    print('Loading data...')
    test_set = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
    print('Data loaded')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0

    # Inference must run with dropout disabled; otherwise predictions are noisy
    # and differ from run to run. The caller's mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:

                test_label = test_label.to(device)
                # Drop the singleton dimension left over from per-sample
                # tokenization, for the mask as well as for the ids.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                batch_pred = output.argmax(dim=1)
                y_pred.append(batch_pred)
                y_true.append(test_label)

                total_acc_test += (batch_pred == test_label).sum().item()
    finally:
        model.train(was_training)

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return y_pred, y_true
    

In [None]:
# Score the held-out test split. Both results are lists with one tensor per batch.
y_pred, y_true = evaluate(model, df_test)

Loading data...
Data loaded
Test Accuracy:  0.571


In [None]:
import torch
# Persist the model trained on the stratified split. torch.save() returns None,
# so its result is no longer bound to a (misleadingly named) `load` variable.
torch.save(model, 'beert_model_strat_hier.pt')