# Classification 1. Hierarchical approach


In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer
import torch
import numpy as np
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import confusion_matrix

In [None]:
# Read the tab-separated beer review table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/beer-reviews/beer_df_large.csv",
    sep="\t",
)

In [None]:
# Peek at five randomly drawn rows (unseeded, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,Name,Company,Group,Region,Style,ABV,Avg,Review,Rate
158386,Das Weizen,Dinkelacker-Schwabenbraeu AG,Wheat Beers,Germany,Hefeweizen,5%,3.56,88,3.68
161597,Raspberry Provincial,Funkwerks,Wild/Sour Beers,"Colorado, United States",Berliner Weisse,4.2%,3.9,A summertime sour classic. Just the right amou...,3.55
52823,La Gaillarde,Unibroue,Pale Ales,"Quebec, Canada",Belgian Pale Ale,5%,3.4,341ml bottle poured into a 13.5oz tulip.\n\nA ...,4.48
37255,Froot Boots,Outer Limits Brewing,India Pale Ales,"Vermont, United States",English IPA,6.38%,3.87,Sensibility is a rare ingredient in beer nowad...,4.09
85349,Hummin' Bird,Red Oak Brewery,Pale Lagers,"North Carolina, United States",Helles,4.5%,3.79,This is the best beer of all time! I am so hap...,5.0


In [None]:
# Show the full review text stored under index label 10.
df.loc[10, 'Review']

"A great Texas-take on a German-style beer, which is more lager, than a traditional German Bock beer. Lacing was decent. Smell was very much bread-like and malty and not at all hoppy. Goes down smooth and easy. If you're looking for a more medium body beer that is great to enjoy on a hot day.... this is your beer. And yes.... every drop is still very much brewed in Shiner, Texas. Their brewery tour in Shiner is pretty cool."

In [None]:
# Shared tokenizer used by the Dataset class when encoding reviews.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-cased'
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Map each beer group name to an integer class id, numbering the groups in
# the order returned by value_counts().
labels = {
    group_name: class_id
    for class_id, group_name in enumerate(df['Group'].value_counts().index.tolist())
}

In [None]:
# Display the group-name to class-index mapping built in the previous cell.
labels

{'Pale Lagers': 0,
 'Pale Ales': 1,
 'Strong Ales': 2,
 'Stouts': 3,
 'India Pale Ales': 4,
 'Wild/Sour Beers': 5,
 'Specialty Beers': 6,
 'Dark Lagers': 7,
 'Porters': 8,
 'Wheat Beers': 9,
 'Brown Ales': 10,
 'Bocks': 11,
 'Dark Ales': 12}

In [None]:
class Dataset(torch.utils.data.Dataset):
    """In-memory dataset pairing tokenized reviews with integer class ids.

    Reads the 'Group' and 'Review' columns of the given frame and relies on
    the notebook-level ``labels`` mapping and ``tokenizer`` object.
    """

    def __init__(self, df):
        """Encode every row of ``df`` up front.

        Args:
            df: frame providing a 'Group' column (keys of ``labels``) and a
                'Review' column (texts handed to ``tokenizer``).
        """
        # Translate each group name into its integer class id.
        class_ids = []
        for group_name in df['Group']:
            class_ids.append(labels[group_name])
        self.labels = class_ids

        # Tokenize each review separately, padded/truncated to 512 tokens.
        encoded_reviews = []
        for review in df['Review']:
            encoded_reviews.append(
                tokenizer(
                    review,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded_reviews

    def classes(self):
        """Return the stored list of class ids."""
        return self.labels

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a numpy array."""
        selected_labels = self.labels[idx]
        return np.array(selected_labels)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output(s) selected by ``idx``."""
        selected_texts = self.texts[idx]
        return selected_texts

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
from sklearn.model_selection import train_test_split
# Seed numpy and torch: torch's global RNG drives the classifier-head
# initialisation, dropout and DataLoader shuffling later in the notebook.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Stratified split: hold out 10% of all rows for testing, then carve a
# validation set of exactly the same size out of the remainder. Deriving the
# validation size from len(df_test) replaces the hand-tuned 0.88889 (~8/9)
# fraction, so the two hold-out sets stay equal whatever the row count.
df_train, df_test = train_test_split(
    df, train_size=0.9, random_state=SEED, stratify=df['Group']
)
df_train, df_val = train_test_split(
    df_train, test_size=len(df_test), random_state=SEED, stratify=df_train['Group']
)

# Sizes are printed in the order: train, test, validation.
print(len(df_train),len(df_test), len(df_val))


139746 17469 17469


In [None]:
class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear class head.

    Args:
        dropout: dropout probability applied to BERT's pooled output.
        num_classes: number of output classes (13 by default, matching the
            13 beer groups used in this notebook).
        pretrained_name: name of the pretrained BERT checkpoint to load.
    """

    def __init__(self, dropout=0.2, num_classes=13, pretrained_name='bert-base-cased'):

        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from the loaded config instead of
        # hard-coding 768, so other BERT sizes work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return raw (unnormalised) class logits for a batch.

        Args:
            input_id: token ids passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.

        Returns:
            Output of the linear head, one score per class.
        """
        # With return_dict=False the second returned element is the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        # No ReLU here: the scores are fed to nn.CrossEntropyLoss, which
        # expects unnormalised logits. Clamping negative logits to zero
        # blocks their gradients and distorts the softmax.
        return self.linear(dropout_output)

In [None]:
def _run_batch(model, batch_input, batch_label, criterion, device):
    """Run one forward pass and return ``(loss, n_correct, n_samples)``."""
    batch_label = batch_label.to(device).long()
    # Every sample is tokenized on its own with return_tensors="pt", so the
    # collated tensors carry a singleton dim at position 1. Squeeze it on
    # both the ids and the mask so they reach BERT with matching shapes.
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)
    loss = criterion(output, batch_label)
    n_correct = (output.argmax(dim=1) == batch_label).sum().item()
    return loss, n_correct, batch_label.size(0)


def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)`` and
            returning per-class scores.
        train_data: frame wrapped in ``Dataset`` for training.
        val_data: frame wrapped in ``Dataset`` for validation.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of passes over the training data.
        batch_size: samples per batch (defaults to the previous fixed value 2).

    Notes:
        * The reported losses are true per-sample means. Each batch loss is
          weighted by its batch size before dividing by the sample count;
          earlier the sum of per-batch means was divided by the sample
          count, which understated the loss by roughly the batch size.
        * The model is switched to train mode for the optimisation loop and
          to eval mode for validation, so dropout no longer perturbs the
          validation metrics. It is left in eval mode on return.
    """
    print('Loading data into a dataset...')

    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)
    print('Data loaded')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()  # enable dropout for the optimisation pass
        for train_input, train_label in tqdm(train_dataloader):

            batch_loss, n_correct, n_samples = _run_batch(
                model, train_input, train_label, criterion, device)

            # criterion returns the batch mean; weight it by the batch size
            # so the epoch total is a sum over samples.
            total_loss_train += batch_loss.item() * n_samples
            total_acc_train += n_correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()  # disable dropout while validating
        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                batch_loss, n_correct, n_samples = _run_batch(
                    model, val_input, val_label, criterion, device)

                total_loss_val += batch_loss.item() * n_samples
                total_acc_val += n_correct

        train_loss = total_loss_train / len(train_data)
        train_acc = total_acc_train / len(train_data)
        val_loss = total_loss_val / len(val_data)
        val_acc = total_acc_val / len(val_data)

        # The leading whitespace on the continuation lines is part of the
        # emitted string; it is kept so the log format stays unchanged.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
                | Train Accuracy: {train_acc: .3f} \
                | Val Loss: {val_loss: .3f} \
                | Val Accuracy: {val_acc: .3f}')
                  

In [None]:
# Fine-tuning hyperparameters.
LR = 1e-6
EPOCHS = 3

# Build a fresh classifier and fine-tune it on the train/validation frames.
model = BertClassifier()
train(model, df_train, df_val, learning_rate=LR, epochs=EPOCHS)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading data into a dataset...
Data loaded


100%|██████████| 69873/69873 [1:11:38<00:00, 16.26it/s]


Epochs: 1 | Train Loss:  0.697                 | Train Accuracy:  0.567                 | Val Loss:  0.538                 | Val Accuracy:  0.658


100%|██████████| 69873/69873 [1:11:38<00:00, 16.25it/s]


Epochs: 2 | Train Loss:  0.487                 | Train Accuracy:  0.693                 | Val Loss:  0.497                 | Val Accuracy:  0.682


100%|██████████| 69873/69873 [1:11:37<00:00, 16.26it/s]


Epochs: 3 | Train Loss:  0.416                 | Train Accuracy:  0.736                 | Val Loss:  0.489                 | Val Accuracy:  0.689


In [None]:
# Persist the whole fine-tuned model object (pickled module, not just weights).
# NOTE(review): this is a Colab Drive path while the data is read from
# /kaggle/input — confirm the target directory exists where this runs.
torch.save(obj=model, f='/content/drive/MyDrive/baby.pt')

In [None]:
def evaluate(model, test_data, batch_size=2):
    """Score ``model`` on ``test_data`` and print the test accuracy.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)`` and
            returning per-class scores.
        test_data: frame wrapped in ``Dataset`` for evaluation.
        batch_size: samples per batch (defaults to the previous fixed value 2).

    Returns:
        ``(y_pred, y_true)``: two lists with one tensor per batch, holding
        the predicted and the true class ids (on the evaluation device).

    Notes:
        The model is put into eval mode for the duration of the call so that
        dropout does not perturb the predictions; its previous train/eval
        mode is restored afterwards.
    """
    y_pred = []
    y_true = []
    print('Loading data...')
    test_set = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
    print('Data loaded')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    total_acc_test = 0

    was_training = model.training
    model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():
        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            # Squeeze the singleton dim on both tensors so the mask and the
            # ids reach the model with matching shapes.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            predictions = output.argmax(dim=1)
            y_pred.append(predictions)
            y_true.append(test_label)

            total_acc_test += (predictions == test_label).sum().item()
    model.train(was_training)

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return y_pred, y_true
    

In [None]:
y_pred, y_true = evaluate(model, df_test)

Loading data...
Data loaded
Test Accuracy:  0.687
