In [9]:
import numpy as np
import pandas as pd
import glob
import os
import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Hugging Face checkpoint shared by the tokenizer and the encoder.
# bert-tiny: 2 hidden layers (L=2), hidden size 128 (H=128).
checkpoint = "prajjwal1/bert-tiny"

# The two loads are independent of each other; both only need the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Smoke test: tokenize one book description and look at the shape of the
# encoder's pooled output.
#
# NOTE: this cell depends on state from other cells:
#   * `book_rating` is built by the CSV-loading cell further down; on a fresh
#     kernel that cell must run first, otherwise this raises NameError.
#   * `model` must still be the bare encoder loaded from `checkpoint`; a later
#     cell rebinds `model` to the BERTClassifier, which has no `pooler_output`.
e = book_rating['Description'].iloc[0]  # positional: first row, whatever the index labels are
e = e.replace("\n", " ")

encoded_input = tokenizer.encode_plus(
    e,
    add_special_tokens=True,
    padding='max_length',
    # Without truncation a long description yields more than max_length tokens,
    # which exceeds the encoder's 512 position embeddings.
    truncation=True,
    max_length=512,
    return_attention_mask=True,
    return_tensors='pt'
)

# Inference only: no need to build the autograd graph.
with torch.no_grad():
    pooled_output = model(**encoded_input).pooler_output
pooled_output.shape

NameError: name 'book_rating' is not defined

In [24]:
# Load every raw book CSV that has a 'Description' column, drop the rows whose
# description is missing, and stack everything into one frame.
df_list = []
# sorted(): glob returns matches in arbitrary order, which would make the row
# order of `book_rating` (and therefore "row 0") differ between runs/machines.
for file in sorted(glob.glob("../data/raw/book*.csv")):
    df = pd.read_csv(file)
    # Some files carry no description at all; they are of no use here.
    if 'Description' not in df.columns:
        continue
    df = df.dropna(subset=['Description'])
    df_list.append(df)

if not df_list:
    # pd.concat([]) would raise a ValueError anyway; say what actually went wrong.
    raise ValueError("no '../data/raw/book*.csv' file with a 'Description' column was found")

book_rating = pd.concat(df_list, ignore_index=True)
book_rating.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'PagesNumber', 'Description',
       'pagesNumber', 'Count of text reviews'],
      dtype='object')

In [29]:
# Frequency of each distinct value in the 'Count of text reviews' column
# (value_counts lists the most frequent value first).
text_review_counts = book_rating['Count of text reviews']
text_review_counts.value_counts()

0.0       161017
1.0        55870
2.0        29338
3.0        18415
4.0        12772
           ...  
2549.0         1
616.0          1
602.0          1
848.0          1
2300.0         1
Name: Count of text reviews, Length: 864, dtype: int64

In [37]:
class RatingDataset(Dataset):
    """Book descriptions paired with their rating, read from ``book*.csv`` files.

    Every CSV in ``data_dir`` whose name matches ``book*.csv`` and that has a
    'Description' column is loaded; rows with a missing description are
    dropped and the remaining rows are concatenated into ``self.data``.

    Args:
        data_dir: Directory holding the CSV files (with or without a trailing
            path separator).
        tokenizer: Object exposing ``encode_plus`` in the style of a Hugging
            Face tokenizer; it is used to encode each description.
        max_len: Length every encoded description is padded/truncated to.

    Raises:
        ValueError: If no matching CSV with a 'Description' column is found.
    """

    def __init__(self, data_dir, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len

        # os.path.join works whether or not data_dir ends with a separator
        # (plain string concatenation silently matched nothing without one).
        pattern = os.path.join(data_dir, 'book*.csv')

        df_list = []
        # sorted(): glob order is arbitrary; sorting keeps index -> row stable.
        for file in sorted(glob.glob(pattern)):
            df = pd.read_csv(file)
            if 'Description' not in df.columns:
                continue
            df_list.append(df.dropna(subset=['Description']))

        if not df_list:
            # pd.concat([]) would raise a ValueError too, with a vaguer message.
            raise ValueError(f"no CSV with a 'Description' column matches {pattern!r}")

        self.data = pd.concat(df_list, ignore_index=True)

    def __len__(self):
        """Number of rows that have a description."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the description at ``index`` and return it with its rating.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            flattened to 1-D) and 'target' (the row's 'Rating' as a float tensor).
        """
        # Look the row up once and reuse it for both the text and the target.
        row = self.data.iloc[index]
        text = row['Description'].replace("\n", " ")

        # Use the tokenizer handed to the constructor, not a notebook global.
        encoded_input = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() removes it
            # so the DataLoader can stack items into a batch.
            'input_ids': encoded_input['input_ids'].flatten(),
            'attention_mask': encoded_input['attention_mask'].flatten(),
            'target': torch.tensor(row['Rating'], dtype=torch.float)
        }
    
# Dataset over the raw book CSVs, encoded with the checkpoint's tokenizer
# (max_len is left at the class default).
dataset = RatingDataset(
    data_dir='../data/raw/',
    tokenizer=tokenizer,
)

In [38]:
class BERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single linear head.

    The encoder is loaded from the notebook-level ``checkpoint``. With
    ``num_classes=1`` the head outputs one value per input, which is how the
    notebook uses it for rating regression.

    Args:
        num_classes: Output size of the linear head.
        freeze_bert: If True, the encoder weights are excluded from gradient
            updates so that only the linear head is trained.
    """

    def __init__(self, num_classes, freeze_bert=False):
        super(BERTClassifier, self).__init__()
        # Instantiating BERT-based model object
        self.bert = AutoModel.from_pretrained(checkpoint)
        self.bert.config.problem_type = 'regression'

        # Honor the freeze_bert flag (it used to be accepted but ignored).
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Defining layers like dropout and linear
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        # NOTE: not applied in forward(); kept so the attribute stays available.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the raw outputs of the linear head.

        Args:
            input_ids: Token ids, as produced by the tokenizer.
            attention_mask: Matching attention mask.

        Returns:
            The head's output (no activation applied) — presumably of shape
            (batch, num_classes); confirm against the caller.
        """
        # Follow the module's own device instead of a notebook global, so the
        # model keeps working after it is moved with .to()/.cpu()/.cuda().
        target_device = self.classifier.weight.device
        input_ids = input_ids.to(target_device)
        attention_mask = attention_mask.to(target_device)

        # Feeding the input to BERT-based model to obtain contextualized representations
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Pooled sequence representation exposed by the encoder as `pooler_output`
        pooled_output = outputs.pooler_output

        x = self.dropout(pooled_output)

        # Feeding the pooled representation to the linear head
        logits = self.classifier(x)

        return logits
    
# One output unit: the head predicts a single value per input (regression).
# NOTE: this rebinds `model`, which until now referred to the bare encoder.
model = BERTClassifier(num_classes=1)
model = model.to(device)

loading configuration file config.json from cache at /home/yinong/.cache/huggingface/hub/models--prajjwal1--bert-tiny/snapshots/6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837/config.json
Model config BertConfig {
  "_name_or_path": "prajjwal1/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/yinong/.cache/huggingface/hub/models--prajjwal1--bert-tiny/snapshots/6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837/pytorch_model.bin
Some weights of the model checkpoint at prajjwal1/bert-

In [41]:
# One pass over the dataset, minimising the mean squared error between the
# model output and the book rating.
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Make training mode explicit (dropout active), e.g. if the model was
# switched to eval() in an earlier cell.
model.train()

# shuffle=True: the dataset is a file-by-file concatenation, so without
# shuffling every batch comes from consecutive rows of the same CSV.
with tqdm.tqdm(DataLoader(dataset, batch_size=64, shuffle=True)) as pbar:
    for batch in pbar:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients before the forward/backward pass, so anything left
        # behind by an interrupted run cannot leak into this step.
        optimizer.zero_grad()

        logits = model(batch['input_ids'], batch['attention_mask'])
        # unsqueeze(1) adds a trailing axis to the targets so their shape
        # lines up with the model's one-output-per-sample logits.
        loss_value = loss(logits, batch['target'].unsqueeze(1))
        loss_value.backward()
        optimizer.step()

        pbar.set_description(f"loss: {loss_value.item():.4f}")

loss: 2.3992:  15%|███████████████████▏                                                                                                             | 2719/18300 [04:55<28:28,  9.12it/s]

loss: 1.0274:  17%|█████████████████████▎                                                                                                           | 3027/18300 [05:30<27:45,  9.17it/s]


KeyboardInterrupt: 