In [77]:
## Data Preparation
import pandas as pd
# Load the labelled titles; index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv("Data/Labelled_data_LDA.csv",index_col=0)
# Preview the first rows (columns: topic_indices, title, thumbnail).
df.head()

Unnamed: 0,topic_indices,title,thumbnail
0,3,samzhe fitness wristband strap silicon band he...,https://m.media-amazon.com/images/I/41nDMl2JHZ...
1,5,aaboring health fitness tracker smart ring adv...,https://m.media-amazon.com/images/I/41HPb3cPYA...
2,3,elevea pro new year years warranty version tou...,https://m.media-amazon.com/images/I/51nkAx0w08...
3,3,elevea pro limited stock years warranty touchs...,https://m.media-amazon.com/images/I/31NoZhfutI...
4,5,aaboring health fitness tracker smart ring adv...,https://m.media-amazon.com/images/I/31Nd8tM-nE...


In [78]:
# Count the rows per topic label to inspect the class balance before sampling.
df['topic_indices'].value_counts()

topic_indices
4    28693
2    19780
5    11064
3     9971
0     7433
1     6360
Name: count, dtype: int64

In [79]:
# Tidy the frame for modelling: 'topic_indices' becomes 'label', the image-URL
# column is discarded, and every title is coerced to a Python string.
df = (
    df.rename(columns={'topic_indices': 'label'})
      .drop(columns=['thumbnail'])
      .assign(title=lambda frame: frame['title'].astype(str))
)
df.head()

Unnamed: 0,label,title
0,3,samzhe fitness wristband strap silicon band he...
1,5,aaboring health fitness tracker smart ring adv...
2,3,elevea pro new year years warranty version tou...
3,3,elevea pro limited stock years warranty touchs...
4,5,aaboring health fitness tracker smart ring adv...


In [80]:
## Subsample 40,000 rows -- roughly 48% of the 83,301 rows counted above.
## random_state pins the draw so the same subset is selected on every run.
df = df.sample(n=40000, random_state=17)

In [81]:
## Splitting the Dataset - Each class should have good split in train and val
from sklearn.model_selection import train_test_split

# Stratified 85/15 split over the row index, so every label keeps its share
# in both the training and the validation subset.
row_ids = df.index.values
row_labels = df.label.values
X_train, X_val, y_train, y_val = train_test_split(
    row_ids,
    row_labels,
    test_size=0.15,
    random_state=42,
    stratify=row_labels,
)

# Tag each row with the subset it was assigned to.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Row counts per (label, subset) pair as a sanity check on the stratification.
df.groupby(['label', 'data_type']).count()



Unnamed: 0_level_0,Unnamed: 1_level_0,title
label,data_type,Unnamed: 2_level_1
0,train,3034
0,val,536
1,train,2602
1,val,459
2,train,8112
2,val,1431
3,train,4049
3,val,714
4,train,11671
4,val,2060


In [82]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_titles(titles, max_length=256):
    """Tokenize titles into fixed-length BERT inputs.

    Args:
        titles: iterable of title strings.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer's batch encoding holding 'input_ids' and
        'attention_mask' as PyTorch tensors.
    """
    return tokenizer.batch_encode_plus(
        list(titles),
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding='max_length',
        # Explicit truncation: same 'longest_first' strategy the tokenizer
        # previously fell back to, but without the runtime warning.
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Select each subset once so titles and labels are guaranteed to stay aligned.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_titles(train_rows.title.values)
encoded_data_val = _encode_titles(val_rows.title.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

# Each dataset item is the triple (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## BERT Model

In [83]:
# Pretrained BERT encoder with a 6-way classification head (one output per label 0-5).
# The classifier weights are newly initialised (see the load warning) and need fine-tuning.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
# NOTE(review): the model is not moved to a device here; `device` is only
# defined later, in the training cell.
#model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data Loader

In [84]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order; validation batches keep dataset order.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, sampler=val_sampler, batch_size=batch_size)

In [89]:
# Number of training batches per epoch (training rows / batch_size, rounded up).
print(len(dataloader_train))

1063


### Optimizer and Scheduler

In [68]:
# `transformers.AdamW` is deprecated in favour of PyTorch's implementation;
# importing it from torch keeps the `AdamW` name available in the notebook.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 5

# NOTE(review): run this cell after the model cell -- the optimizer captures the
# parameters of whichever `model` object exists at the time it is constructed.
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8,
                  # torch defaults to 0.01; 0.0 matches the transformers.AdamW default.
                  weight_decay=0.0)

# Learning rate decays linearly to zero over every optimisation step, with no warmup.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_training_steps)




### Performance Metrics

In [90]:
from sklearn.metrics import f1_score
import numpy as np

def f1_score_func(preds, labels):
    """Weighted F1 between the arg-max class of each score row and the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids (flattened before scoring).

    Returns:
        Whatever `f1_score` returns for average='weighted'.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    weighted_f1 = f1_score(true_classes, predicted_classes, average='weighted')
    return weighted_f1

def accuracy_per_class(preds, labels, label_dict=None):
    """Print, for every class present in `labels`, how many examples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids (flattened before use).
        label_dict: optional mapping of display name -> class id. Defaults to
            the six classes '0'..'5' used in this notebook.

    Returns:
        None. One "Class: <name>" line and one "Accuracy: <correct>/<total>"
        line are printed per class.
    """
    if label_dict is None:
        label_dict = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
    label_dict_inverse = {class_id: name for name, class_id in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        in_class = labels_flat == label
        n_correct = int(np.sum(preds_flat[in_class] == label))
        n_total = int(np.sum(in_class))
        # Fall back to the raw id so a label missing from the mapping is still
        # reported instead of raising KeyError.
        class_name = label_dict_inverse.get(label.item(), str(label))
        print(f'Class: {class_name}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [70]:
# Check whether PyTorch can see a CUDA GPU; the training cell falls back to the CPU when this is False.
torch.cuda.is_available() 

False

### Training Loop

In [71]:
import random

# Seed every RNG in play so the stochastic steps from here on are repeatable.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Every batch is sent to `device` in the evaluation and training loops, so the
# model has to live there as well; otherwise a CUDA run fails with a
# device-mismatch error.
model.to(device)

def evaluate(dataloader_val):
    """Run the global `model` over a dataloader without tracking gradients.

    Args:
        dataloader_val: sized iterable of batches whose first three items are
            (input_ids, attention_mask, labels) tensors.

    Returns:
        Tuple of (sum of batch losses / number of batches, logits of all
        batches concatenated along axis 0, labels of all batches concatenated
        along axis 0); the last two are numpy arrays.
    """
    model.eval()

    batch_losses = []
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        # With labels supplied, the output is indexed as (loss, logits, ...).
        batch_losses.append(outputs[0].item())
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    mean_loss = sum(batch_losses) / len(dataloader_val)

    stacked_logits = np.concatenate(logit_chunks, axis=0)
    stacked_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, stacked_logits, stacked_labels
    
import os

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs('checkpoints', exist_ok=True)

# Fine-tune for `epochs` passes; after each pass save a checkpoint and report
# the training loss, validation loss and weighted F1.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # `batch` is the 3-tuple (input_ids, attention_mask, labels), so
        # len(batch) is 3, not the batch size; dividing by it under-reported
        # the loss. Show the batch loss as returned by the model.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    torch.save(model.state_dict(), f'checkpoints/finetuned_BERT_LDA_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')


KeyboardInterrupt: 

### Loading and Evaluating

In [1]:
# Rebuild the same architecture used for training, then overwrite its weights
# with a saved checkpoint and report per-class accuracy on the validation set.
# NOTE(review): this cell relies on names created by earlier cells
# (BertForSequenceClassification, torch, device, evaluate, dataloader_validation,
# accuracy_per_class); on a fresh kernel it fails with the NameError shown below.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

# NOTE(review): the training loop writes 'checkpoints/finetuned_BERT_LDA_epoch_{epoch}.model';
# the un-numbered file loaded here is not produced by that loop -- confirm which epoch it holds.
model.load_state_dict(torch.load('checkpoints/finetuned_BERT_LDA_epoch.model', map_location=torch.device('cpu')))

# Only the logits and true labels are needed; the validation loss is discarded.
_, predictions, true_vals = evaluate(dataloader_validation)

#print(predictions)
accuracy_per_class(predictions, true_vals)

NameError: name 'BertForSequenceClassification' is not defined

In [None]:
## TODO: build a confusion matrix for the multi-class classifier

### Testing

In [21]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
import torch.nn.functional as F

device = "cpu"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Define the title text you want to test
title_text = "Xiaomi 11i 5G Hypercharge (Stealth Black, 6GB RAM, 128GB Storage)"

# Encode to a fixed length of 256 tokens. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`, and `truncation=True` makes the
# previously implicit truncation explicit (silencing the tokenizer warning).
encoded_data_test = tokenizer.encode_plus(
    title_text,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

batch = (input_ids_test, attention_masks_test)
batch = tuple(b.to(device) for b in batch)

inputs = {'input_ids':      batch[0],
          'attention_mask': batch[1],
          }

# Rebuild the 6-class architecture and load the fine-tuned weights into it.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('checkpoints/finetuned_BERT_LDA_epoch.model', map_location=torch.device('cpu')))

# Make inference mode explicit so dropout cannot perturb the prediction.
model.eval()

predictions = []
with torch.no_grad():
    outputs = model(**inputs)

    # No labels were passed, so the first output element is the logits.
    logits = outputs[0]

    logits = logits.detach().cpu().numpy()

    predictions.append(logits)

# Turn the logits into per-class probabilities (one row per input title).
softmax_output = F.softmax(torch.tensor(logits), dim=1)

print(softmax_output)

#A/c LDA labelling result
# NOTE(review): the sample rows displayed at the top of the notebook with
# label 3 are fitness wristbands, while this map names class 3 "Desktop" --
# confirm the id -> name assignment against the LDA topics.
map_to_output = {0:"Laptop",1:"Mobile",2:"Printers",3:"Desktop", 4:"Smartwatch",5:"Others"}

# Pair each probability with its class name by class index rather than by the
# dict's insertion order.
final_dict = {map_to_output[class_idx]: prob.item()
              for class_idx, prob in enumerate(softmax_output[0])}

print(final_dict)

# Get the Maximum Probability Position (arg-max along the class axis, not over
# the flattened tensor).
max_position = torch.argmax(softmax_output, dim=1).item()
Product_Type = map_to_output[max_position]
Product_Type_conf_score = final_dict[map_to_output[max_position]]
# Holds the full distribution, including the predicted class itself.
Other_Possible_Product_Type = final_dict

print("Product Type:", Product_Type)
print("Product_Type_conf_score:", Product_Type_conf_score)
print("Other_Possible_Product_Type:", Other_Possible_Product_Type)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[0.0157, 0.0014, 0.0023, 0.0020, 0.7956, 0.1830]])
{'Laptop': 0.015701234340667725, 'Mobile': 0.001399507513269782, 'Printers': 0.0022549154236912727, 'Desktop': 0.001993010751903057, 'Smartwatch': 0.7956177592277527, 'Others': 0.1830335259437561}
Product Type: Smartwatch
Product_Type_conf_score: 0.7956177592277527
Other_Possible_Product_Type: {'Laptop': 0.015701234340667725, 'Mobile': 0.001399507513269782, 'Printers': 0.0022549154236912727, 'Desktop': 0.001993010751903057, 'Smartwatch': 0.7956177592277527, 'Others': 0.1830335259437561}


tensor([[0.0157, 0.0014, 0.0023, 0.0020, 0.7956, 0.1830]])
Position of the maximum probability: tensor(4)
