<a href="https://colab.research.google.com/github/szymonrucinski/bert-knows-categories/blob/master/BertMultiLabelClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ARINC Fingerprinting BERT Multi Labels Class Classifier

Since Huggingface only implemented single class classification (with loss function `CrossEntropyLoss` used), we need to modify a bit to use our own loss function (i.e. `BCEWithLogitsLoss`). 

Also, `sigmoid` is chosen instead of `softmax` at the final layer because it ensure multi-class availability.

For more details you can check [Transformer for Multi-Label](htt\**ps**://towardsdatascience.com/transformers-for-multilabel-classification-71a1a0daf5e1)


Import related libraries:

In [45]:
# !pip install transformers
# !pip install torch
# !pip install iterative-stratification

'''Train with PyTorch.'''
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import torch.utils.data as data

# BERT Related Libraries
from transformers import BertTokenizer, BertForSequenceClassification

#ITERATIVE splitter
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.model_selection import KFold



# Python
import pandas as pd
import numpy as np
import os
import time

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Declaring machine learning parameters:

In [46]:
# ML Parameters
lr = 1e-2
epoch = 5
batch_size = 32


Data Source:

In [47]:
train_path = "/content/drive/MyDrive/dataset/features.csv"
labels_path = "/content/drive/MyDrive/dataset/labels.csv"
####
texts_df = pd.read_csv(train_path)
texts_df.drop(columns=['ProductId'],inplace=True)
texts_df.reset_index(inplace=True, drop=True)
texts_df.rename(columns = {'MarketingDescription_DE':'texts'}, inplace = True)

labels_df = pd.read_csv(labels_path)
labels_df.drop(columns=['ProductId'],inplace=True)
labels_df.reset_index(inplace=True, drop=True)

# train_df = pd.concat([texts_df,labels_df],axis=1)

Create one data accessor (for PyTorch to read the data above easily):

In [48]:
class SentenceDataset(data.Dataset):

    def __init__(self, database):
        self.database = database

    def __len__(self):
        return self.database.shape[0]
        # return 1000

    def __getitem__(self, idx):
        
        # return the sentence
        i = self.database["texts"][idx]
        # return the label array
        label = self.database.loc[idx, labels_df.columns]
        label = np.array(label, dtype=float)
        
        return i, label


In [49]:
train_df

Unnamed: 0,texts,2542,3352,4061,1997,3621,3907,1622,3896,4216,...,1517,701,3502,3503,3138,3501,4486,2202,2203,2967
0,produktreihe netgear gigabit unmanaged switche...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hochwertiges flexibles patchkabel paar gesamta...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,verpassen geniessen lifecam cinema hochauflöse...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,rj45 patchkabel cat 6a anwendungen 10 gbit eth...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,vorhangschloss abus safe code 78 lässt tresor ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5978,44mm chalk link bracelet small,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5979,wechsel armbändern kompatible armband probleml...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5980,silikon case magsafe apple speziell iphone 12 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5981,silikon case magsafe apple speziell iphone 12 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Prepare Data Training Set and Testing Set:

In [50]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

#Iterative split
# x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size=0.2,stratify=labels,random_state=1)
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_index, test_index in msss.split(texts_df, labels_df):
       print("TRAIN:", train_index, "TEST:", test_index)
       x_train, x_test = texts_df.iloc[train_index], texts_df.iloc[test_index]
       y_train, y_test = labels_df.iloc[train_index], labels_df.iloc[test_index]



### Load split data in df
trainData = pd.concat([x_train,y_train],axis=1)
testData = pd.concat([x_test,y_test],axis=1)

# trainData.reset_index(inplace=True)
# testData.reset_index(inplace=True)

# Load training dataset
dataset = SentenceDataset(trainData)
print("Total: %i" % len(dataset))

# Load into Iterator (each time get one batch)
# train_loader = data.DataLoader(trainData, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)
# test_loader = data.DataLoader(testData, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)


cpu
TRAIN: [   1    3    4 ... 5980 5981 5982] TEST: [   0    2    8 ... 5955 5957 5963]
Total: 4849


In [51]:
x_train

Unnamed: 0,texts
1,hochwertiges flexibles patchkabel paar gesamta...
3,rj45 patchkabel cat 6a anwendungen 10 gbit eth...
4,vorhangschloss abus safe code 78 lässt tresor ...
5,vorhangschloss abus 155 besteht stabilen zinkd...
6,vorhangschloss abus mycode 165 besteht messing...
...,...
5978,44mm chalk link bracelet small
5979,wechsel armbändern kompatible armband probleml...
5980,silikon case magsafe apple speziell iphone 12 ...
5981,silikon case magsafe apple speziell iphone 12 ...


In [52]:
y_test

Unnamed: 0,2542,3352,4061,1997,3621,3907,1622,3896,4216,4049,...,1517,701,3502,3503,3138,3501,4486,2202,2203,2967
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
testData

Unnamed: 0,texts,2542,3352,4061,1997,3621,3907,1622,3896,4216,...,1517,701,3502,3503,3138,3501,4486,2202,2203,2967
0,produktreihe netgear gigabit unmanaged switche...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,verpassen geniessen lifecam cinema hochauflöse...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,vorhangschloss abus 145 besteht massiven alumi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,hd videogespräche internetportale geniessen fl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,logitech hd webcam c270 hd videogespräche einf...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5941,smart tablet stromversorg kategorie smartphone...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5949,8 megapixel webkamera nimmt videos professione...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5955,speziell apple pro display xdr designten 4k pr...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5957,bleiben hause büro webcam w199 verbunden w199 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
train_loader = data.DataLoader(trainData, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)
test_loader = data.DataLoader(testData, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)

Create model instance:

In [55]:
from pandas.core.dtypes.common import classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# hard code the label dimension to be 6 (because the data has 6 classes)
num_labels = 99

# Define model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define optimizer
#optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
optimizer = optim.AdamW(model.parameters(), lr=lr)

# Define Loss function
criterion = nn.BCEWithLogitsLoss()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Preparation of traning and validation set:

Training and Testing Functions:

In [56]:
###########################
# Train with training set #
###########################
def train(model, iterator, optimizer, criterion, device):
    
    model.train()     # Enter Train Mode
    train_loss = 0
    print('model_train()')

    for batch_idx,(sentences, labels) in enumerator(iterator):
        print(sentences)
        # print(batch_idx)
        
        # tokenize the sentences
        print('encoding')
        encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        # move to GPU if necessary
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        # generate prediction
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)  # NOT USING INTERNAL CrossEntropyLoss
        
        # compute gradients and update weights
        loss = criterion(outputs.logits, labels) # BCEWithLogitsLoss has sigmoid
        loss.backward()
        optimizer.step()

        # accumulate train loss
        train_loss += loss
        
    # print completed result
    print('train_loss: %f' % (train_loss))
    return train_loss


#############################
# Validate with testing set #
#############################
def test(model, iterator, optimizer, criterion, device):

    model.eval()     # Enter Evaluation Mode
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_idx, (sentences, labels) in enumerate(iterator):
            
            # tokenize the sentences
            encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            
            # move to GPU if necessary
            input_ids, labels = input_ids.to(device), labels.to(device)
            
            # generate prediction
            outputs = model(input_ids, attention_mask=attention_mask)  # NOT USING INTERNAL CrossEntropyLoss
            prob = outputs.logits.sigmoid()   # BCEWithLogitsLoss has sigmoid
            
            # record processed data count
            total += (labels.size(0)*labels.size(1))

            # take the index of the highest prob as prediction output
            THRESHOLD = 0.7
            prediction = prob.detach().clone()
            prediction[prediction > THRESHOLD] = 1
            prediction[prediction <= THRESHOLD] = 0
            correct += prediction.eq(labels).sum().item()
    
    # print completed result
    acc = 100.*correct/total
    print('correct: %i  / total: %i / test_acc: %f' % (correct, total, acc))
    return acc


Acutal execution:

- Run `training()` and `test()` for `epoch` times


In [57]:
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)
# test_loader = data.DataLoader(ValidationData1, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)

In [58]:
data

<module 'torch.utils.data' from '/usr/local/lib/python3.7/dist-packages/torch/utils/data/__init__.py'>

In [59]:
trainData

Unnamed: 0,texts,2542,3352,4061,1997,3621,3907,1622,3896,4216,...,1517,701,3502,3503,3138,3501,4486,2202,2203,2967
1,hochwertiges flexibles patchkabel paar gesamta...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,rj45 patchkabel cat 6a anwendungen 10 gbit eth...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,vorhangschloss abus safe code 78 lässt tresor ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,vorhangschloss abus 155 besteht stabilen zinkd...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,vorhangschloss abus mycode 165 besteht messing...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5978,44mm chalk link bracelet small,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5979,wechsel armbändern kompatible armband probleml...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5980,silikon case magsafe apple speziell iphone 12 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5981,silikon case magsafe apple speziell iphone 12 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
train_df.iloc[[15,16]]

Unnamed: 0,texts,2542,3352,4061,1997,3621,3907,1622,3896,4216,...,1517,701,3502,3503,3138,3501,4486,2202,2203,2967
15,logitech hd webcam c270 hd videogespräche einf...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,massiver aluminium schlosskörper hervorragende...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
trainData.reset_index(inplace=True, drop=True)

In [None]:
for i,(train_idx,test_idx) in enumerate(kfold.split(trainData)):

    train_sliced = trainData.iloc[train_idx]
    test_sliced = trainData.iloc[test_idx]

    train_sliced.reset_index(inplace=True, drop=True)
    test_sliced.reset_index(inplace=True, drop=True)


    train_dataset = SentenceDataset(train_sliced)
    test_dataset = SentenceDataset(test_sliced)

    # train_dataset = SentenceDataset(trainData.set_index(pd.Series([i for i in range(4849)])))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True,drop_last=False, num_workers=0)

    train(model,train_loader, optimizer, criterion, device)

    print(i)
    # for e in range(epoch):
      # print(f"===== Epoch %{e+1} =====")
      # training
      # print("Training started ...")
      # train(model,train_loader, optimizer, criterion, device)
      # validation testing
      # print("Testing started ...")
      # test(model, test_loader, optimizer, criterion, device)
    # model.apply(reset_weights

model_train()
entering_the_loop
('passgenaues cover schützt apple iphone xs kratzern verleiht individuellen style bewahrt smartphone sturzschäden rundumschutz erhöhte tpu umrandung extra schutz leichte anbringung kratzresistent zusätzlicher schutz stossdämpfende polster mpt molecular technology dämpft verteilt vibrationen stosses gleichmässig zugriff tasten anschlüsse lieferumfang 1 cover air robust ultra', 'hochwertigen materialien spiegeln wert schützenden inhalts protective sleeve idealen begleiter protective sleeve bietet maximalen schutz stabiles faltbares textilcover vollständige innenpolsterung kratzern stössen protective sleeve ausschliesslich hochwertige materialien leder sorgfältig ausgesuchte stoffe verwendet sorgen sleeve einfach anfühlt stabile look spiegelt schützende funktion hülle passt hervorragend modernen design ipads', 'schützt smartphone zuverlässig kratzern stössen handyhülle ferrari dynamischem carbon look passgenauen aussparungen anschlüsse kameraobjektiv besond