In [3]:
! pip install pandas;
! pip install torch;
! pip install transformers;

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/00/92/6153f4912b84ee1ab53ab45663d23e7cf3704161cb5ef18b0c07e207cef2/transformers-4.7.0-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 5.4MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 31.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# projectFolder = "./drive/My Drive/Bert/"

In [5]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import  BertModel, BertTokenizer
from torch.utils.data import DataLoader
import torch.optim as optim
import os
from torch.utils.data import Dataset

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# def read_and_shuffle(file):
#     df = pd.read_csv(file, delimiter=',')
#     # Random shuffle.
#     df.sample(frac=1)
#     return df

In [8]:
def get_train_and_val_split(df, splitRatio=0.8):
    train=df.sample(frac=splitRatio,random_state=200)
    val=df.drop(train.index)
    print("Number of Training Samples: ", len(train))
    print("Number of Validation Samples: ", len(val))
    return(train, val)

In [9]:
def get_max_length(reviews):
    return len(max(reviews, key=len))

In [10]:
def get_accuracy(logits, labels):
    # get the index of the max value in the row.
    predictedClass = logits.max(dim = 1)[1]

    # get accuracy by averaging over entire batch.
    acc = (predictedClass == labels).float().mean()
    return acc

In [11]:
def trainFunc(net, loss_func, opti, train_loader, test_loader, config):
    best_acc = 0
    for ep in range(config["epochs"]):
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            opti.zero_grad()
            #seq, attn_masks, labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu)
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            logits = net(seq, attn_masks)
            loss = loss_func(m(logits), labels)

            loss.backward()
            opti.step()
            print("Iteration: ", it+1)

            if (it + 1) % config["printEvery"] == 0:
                acc = get_accuracy(m(logits), labels)
                if not os.path.exists(config["outputFolder"]):
                    os.makedirs(config["outputFolder"])

                # Since a single epoch could take well over hours, we regularly save the model even during evaluation of training accuracy.
                torch.save(net.state_dict(), os.path.join(projectFolder, config["outputFolder"], config["outputFileName"]))
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))
                print("Saving at", os.path.join(projectFolder, config["outputFolder"], config["outputFileName"]))

        # perform validation at the end of an epoch.
        val_acc, val_loss = evaluate(net, loss_func, val_loader, config)
        print(" Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))
        if val_acc > best_acc:
            print("Best validation accuracy improved from {} to {}, saving model...".format(best_acc, val_acc))
            best_acc = val_acc
            torch.save(net.state_dict(), os.path.join(projectFolder, config["outputFolder"], config["outputFileName"] + "_valTested_" + str(best_acc)))

In [12]:
def evaluate(net, loss_func, dataloader, config):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            #seq, attn_masks, labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu)
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            logits = net(seq, attn_masks)
            mean_loss += loss_func(m(logits), labels)
            mean_acc += get_accuracy(m(logits), labels)
            print("Validation iteration", count+1)
            count += 1

            '''
            The entire validation set was around 0.1 million entries,
            the validationFraction param controls what fraction of the shuffled
            validation set you want to validate the results on.
            '''
            if count > config["validationFraction"] * len(val_set):
                break
    return mean_acc / count, mean_loss / count

In [13]:
config = {
    "splitRatio" : 0.8,
    "maxLength" : 100,
    "printEvery" : 100,
    "outputFolder" : "Models",
    "outputFileName" : "AmazonReviewClassifier.dat",
    "threads" : 4,
    "batchSize" : 64,
    "validationFraction" : 0.0005,
    "epochs" : 5,
    "forceCPU" : False
    }
if config["forceCPU"]:
    device = torch.device("cpu")

config["device"] = device

In [14]:
class SentimentClassifier(nn.Module):
    def __init__(self, num_classes, device, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.device = device

        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.cls_layer = nn.Linear(768, num_classes)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        # Debugging
        print("cont_reps:",cont_reps)
        print('type:',type(cont_reps))
        print("last_hidden_state"==cont_reps)
        # return None

        #Obtaining the representation of [CLS] head
        # cont_reps = cont_reps[0]
        # cls_rep = cont_reps[0]
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits.to(self.device)


In [15]:
class AmazonReviewsDataset(Dataset):
    def __init__(self, df, maxlen):
        self.df = df
        # A reset reindexes from 1 to len(df), the shuffled df frames are sparse.
        self.df.reset_index(drop=True, inplace=True)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen

    def __len__(self):
        return(len(self.df))

    def __getitem__(self, index):
        review = self.df.loc[index, 'Text']

        # Classes start from 0.
        label = int(self.df.loc[index, 'Score']) - 1

        # Use BERT tokenizer since it needs to be able to match the tokens to the pre trained words.
        tokens = self.tokenizer.tokenize(review)

        # BERT inputs typically start with a '[CLS]' tag and end with a '[SEP]' tag. For
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        if len(tokens) < self.maxlen:
            # Add the ['PAD'] token
            tokens = tokens + ['[PAD]' for item in range(self.maxlen-len(tokens))]
        else:
            # Truncate the tokens at maxLen - 1 and add a '[SEP]' tag.
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        # BERT tokenizer converts the string tokens to their respective IDs.
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Converting to pytorch tensors.
        tokens_ids_tensor = torch.tensor(token_ids)

        # Masks place a 1 if token != PAD else a 0.
        attn_mask = (tokens_ids_tensor != 0).long()
        
        return tokens_ids_tensor, attn_mask, label

In [16]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Appliances.json.gz

--2021-06-21 17:50:19--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Appliances.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69677301 (66M) [application/octet-stream]
Saving to: ‘Appliances.json.gz’


2021-06-21 17:50:21 (41.6 MB/s) - ‘Appliances.json.gz’ saved [69677301/69677301]



In [17]:
# CODE STARTS HERE

In [18]:
import pandas as pd
print("Configuration is: ", config)
# Read and shuffle input data.
file_name = 'Appliances.json.gz'
df = pd.read_json(file_name,compression='infer',lines=True).sample(frac=1)
df.head(2)

Configuration is:  {'splitRatio': 0.8, 'maxLength': 100, 'printEvery': 100, 'outputFolder': 'Models', 'outputFileName': 'AmazonReviewClassifier.dat', 'threads': 4, 'batchSize': 64, 'validationFraction': 0.0005, 'epochs': 5, 'forceCPU': False, 'device': device(type='cpu')}


Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
197699,5,23.0,False,"02 3, 2012",A1MOUJM2XANOLC,B006BYHPKW,,CLAVIYEL,Thank you so much The Laundry Alternative for ...,Wonderful and so cute mini washing machine,1328227200,
479215,1,,True,"05 1, 2018",AR6WBB3N2NVFD,B000NO139A,,Henri Kasbarian,sucks,One Star,1525132800,


In [19]:
# df = read_and_shuffle(os.path.join(projectFolder,file_name))
target_col = 'overall'
feature_col = 'reviewText'

df = df[[target_col,feature_col]]
df.columns = ['Score','Text']
df.head(2)

Unnamed: 0,Score,Text
197699,5,Thank you so much The Laundry Alternative for ...
479215,1,sucks


In [20]:
num_classes = df['Score'].nunique()
print("Number of Target Output Classes:", num_classes)
totalDatasetSize = len(df)

Number of Target Output Classes: 5


In [21]:
print('Loading BERT tokenizer...')
# config = BertConfig.from_pretrained( 'bert-base-uncased', output_hidden_states=True)    
# self.bert_model = BertModel.from_pretrained('bert-base-uncased', config=config)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                          # output_hidden_states=True
                                          )

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [22]:
# Group by the column Score. This helps you get distribution of the Review Scores.
symbols = df.groupby('Score')

scores_dist = []
for i in range(num_classes):
    scores_dist.append(len(symbols.groups[i+1])/totalDatasetSize)

In [23]:
train, val = get_train_and_val_split(df, config["splitRatio"])

Number of Training Samples:  482222
Number of Validation Samples:  120555


In [24]:
val.to_csv("Validations.csv")
train.to_csv("Train.csv")

In [25]:
# You can set the length to the true max length from the dataset, I have reduced it for the sake of memory and quicker training.
#T = get_max_length(reviews)
T = config["maxLength"]

In [26]:
train_set = AmazonReviewsDataset(train, T)
val_set = AmazonReviewsDataset(val, T)

In [27]:
train_loader = DataLoader(train_set, batch_size = config["batchSize"], num_workers = config["threads"])
val_loader = DataLoader(val_set, batch_size = config["batchSize"], num_workers = config["threads"])

  cpuset_checked))


In [28]:
# We are unfreezing the BERT layers so as to be able to fine tune and save a new BERT model that is specific to the Sizeable food reviews dataset.
net = SentimentClassifier(num_classes, config["device"], freeze_bert=False)
net.to(config["device"])
weights = torch.tensor(scores_dist).to(config["device"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# Setting the Loss function and Optimizer.
loss_func = nn.NLLLoss(weight=weights)
opti = optim.Adam(net.parameters(), lr = 2e-5)
m = nn.LogSoftmax(dim=1)

In [30]:
torch.cuda.set_device(0)
trainFunc(net, loss_func, opti, train_loader, val_loader, config)

RuntimeError: ignored