# Flat, dropdown=0.5

In [None]:
!pip install transformers

In [2]:
from transformers import BertTokenizer
import torch
import numpy as np
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import confusion_matrix

In [None]:
!pip install gdown

In [None]:
!gdown https://drive.google.com/uc?id=1bmr04biWB-NWRbRYbO3XBA33ltROqk47

In [8]:
df = pd.read_csv("beer_df_large.csv", sep="\t")
df = df.drop(columns=['Name', 'Company', 'Region', 'ABV', 'Avg', 'Rate'])

In [9]:
df['Flat'] = df.apply(lambda x: '{}. {}'.format(x['Group'], x['Style']), axis=1)
df = df.drop(columns=['Style', 'Group'], axis=1)

In [10]:
df

Unnamed: 0,Review,Flat
0,From a 12oz bottle into a cocktail glass.\n\nC...,Bocks. Bock
1,"Great relaxing beer. Very mellow, great taste-...",Bocks. Bock
2,"Appearance: Clear, bright copper color. Frothy...",Bocks. Bock
3,As a German staying for holidays in the US I c...,Bocks. Bock
4,I would guess this is Shiner's #1 beer. It's p...,Bocks. Bock
...,...,...
174679,"A really well done, well balanced sour with an...",Wild/Sour Beers. Wild Ale
174680,"22oz bottle. Poured out a slightly hazy, brigh...",Wild/Sour Beers. Wild Ale
174681,"There's lots of apricot in this beer, just a t...",Wild/Sour Beers. Wild Ale
174682,Poured from a 22oz bomber into my Drie de Font...,Wild/Sour Beers. Wild Ale


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [11]:
labels = {}
for i, name in enumerate(df['Flat'].value_counts().index.tolist()):
    labels[name] = i

In [14]:
labels

{'Pale Lagers. American Adjunct Lager': 0,
 'Dark Lagers. Märzen': 1,
 'Pale Lagers. German Pilsner': 2,
 'Pale Lagers. European Pale Lager': 3,
 'Pale Lagers. American Lager': 4,
 'Pale Lagers. Helles': 5,
 'Stouts. Russian Imperial Stout': 6,
 'Stouts. American Imperial Stout': 7,
 'Pale Lagers. Bohemian / Czech Pilsner': 8,
 'Stouts. Sweet / Milk Stout': 9,
 'India Pale Ales. Imperial IPA': 10,
 'Strong Ales. Tripel': 11,
 'Porters. American Porter': 12,
 'Stouts. American Stout': 13,
 'India Pale Ales. American IPA': 14,
 'Pale Ales. English Pale Ale': 15,
 'Specialty Beers. Fruit and Field Beer': 16,
 'Stouts. Oatmeal Stout': 17,
 'Wheat Beers. Hefeweizen': 18,
 'Strong Ales. Belgian Pale Strong Ale': 19,
 'Specialty Beers. Pumpkin Beer': 20,
 'Strong Ales. Belgian Dark Strong Ale': 21,
 'Brown Ales. American Brown Ale': 22,
 'Pale Ales. American Blonde Ale': 23,
 'India Pale Ales. New England IPA': 24,
 'Wheat Beers. Witbier': 25,
 'Pale Ales. Saison': 26,
 'Pale Ales. American P

In [28]:
id2label = {id: style for style, id in labels.items()}

In [65]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['Flat']]
        self.texts = [tokenizer(text, 
                                padding='max_length', 
                                max_length = 512, 
                                truncation=True, 
                                return_tensors="pt") 
                      for text in df['Review']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [62]:
np.random.seed(42)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

print(len(df_train), len(df_val), len(df_test))

139747 17468 17469


In [4]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 116)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [91]:
def train(model, train_data, val_data, learning_rate, epochs):
    print('Loading data into a dataset...')

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
    print('Data loaded')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):
            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [92]:
EPOCHS = 3
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading data into a dataset...
Data loaded


100%|██████████| 69874/69874 [1:13:57<00:00, 15.75it/s]


Epochs: 1 | Train Loss:  2.037                 | Train Accuracy:  0.119                 | Val Loss:  1.710                 | Val Accuracy:  0.215


100%|██████████| 69874/69874 [1:13:36<00:00, 15.82it/s]


Epochs: 2 | Train Loss:  1.522                 | Train Accuracy:  0.289                 | Val Loss:  1.419                 | Val Accuracy:  0.325


100%|██████████| 69874/69874 [1:13:28<00:00, 15.85it/s]


Epochs: 3 | Train Loss:  1.295                 | Train Accuracy:  0.374                 | Val Loss:  1.281                 | Val Accuracy:  0.375


In [93]:
torch.save(model, 'beert_flat.pt')

In [66]:
def evaluate(model, test_data):
    y_pred = []
    y_true = []
    print('Loading data...')
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)
    print('Data loaded')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    
    with torch.no_grad():
        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            y_pred.append(output.argmax(dim=1))
            y_true.append(test_label)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return y_pred, y_true

In [67]:
y_pred, y_true = evaluate(model, df_test)

Loading data...
Data loaded
Test Accuracy:  0.385


In [5]:
model = torch.load('beert_flat.pt')

In [7]:
def predict(model, text, labels_fli):
    t = tokenizer(
        text, 
        padding='max_length',
        max_length = 512,
        truncation=True,
        return_tensors="pt",
    )
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    with torch.no_grad():
        mask = t['attention_mask'].to(device)
        input_id = t['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        pred = output.cpu().numpy()
        idx = np.argmax(pred)
        return labels_fli[idx]

In [31]:
predict(model, 'I would like to try some cherry beer', id2label)

'Specialty Beers. Fruit and Field Beer'

In [32]:
predict(model, 'Recommend me the darkest beer you can offer', id2label)

'Stouts. American Stout'

In [35]:
predict(model, 'I would like to try some light beer with low alcohol', id2label)

'Pale Lagers. Light Lager'

In [60]:
predict(model, 'oatmeal', id2label)

'Stouts. Oatmeal Stout'

confusion matrix

In [None]:
t = [np.argmax(x.cpu().numpy()) for x in y_true]

In [None]:
classes = labels.keys()
cf_matrix = confusion_matrix([x.cpu() for x in y_true], [y.cpu() for y in y_pred])
df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix, axis=1)[:, None], index = [i for i in classes],
                     columns = [i for i in classes])
plt.figure(figsize = (12,7))
sn.heatmap(df_cm, annot=True)
plt.savefig('output.png')