In [1]:
import pandas as pd
import gzip

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
df = getDF('data/reviews_Musical_Instruments_5.json.gz')

In [4]:
df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification, BertModel
from tqdm import tqdm, trange
import io
import numpy as np
import matplotlib.pyplot as plt
import spacy
from nltk.corpus import stopwords
%matplotlib inline

Using TensorFlow backend.


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

AssertionError: Torch not compiled with CUDA enabled

In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
print (df.shape)
df.overall = df.overall.astype(int)
df.reviewText = df.reviewText.str.lower()

(10261, 9)


In [9]:
df.overall.value_counts()

5    6938
4    2084
3     772
2     250
1     217
Name: overall, dtype: int64

In [10]:
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~.,'

df['clean_text'] = df.reviewText.apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

# remove numbers
df['clean_text'] = df['clean_text'].str.replace("[0-9]", " ")

# remove whitespaces
df['clean_text'] = df['clean_text'].apply(lambda x:' '.join(x.split()))

df['clean_text'] = df.clean_text.apply(lambda x: " ".join([i for i in x.split() if i not in stop_words]).strip())

In [11]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

df['clean_text'] = lemmatization(df['clean_text'])

In [12]:
df['num_words'] = df.clean_text.apply(lambda x: len(x.split()))

df = df[df.num_words >= 5][df.num_words <= 50]

print (df.shape)

(7574, 11)


  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
from pytorch_transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig

In [14]:
sentences = df.clean_text.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = to_categorical(df.overall.values)[:,1:]

In [15]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (sentences[0])
print (tokenized_texts[0])

Tokenize the first sentence:
[CLS] much write exactly suppose filter pop sound recording much crisp one low price pop filter amazon may well buy honestly work despite pricing [SEP]
['[CLS]', 'much', 'write', 'exactly', 'suppose', 'filter', 'pop', 'sound', 'recording', 'much', 'crisp', 'one', 'low', 'price', 'pop', 'filter', 'amazon', 'may', 'well', 'buy', 'honestly', 'work', 'despite', 'pricing', '[SEP]']


In [16]:
from collections import Counter
Counter([len(ids) for ids in tokenized_texts])

Counter({25: 189,
         47: 81,
         28: 180,
         17: 303,
         27: 208,
         21: 287,
         23: 242,
         30: 163,
         16: 278,
         13: 223,
         12: 176,
         10: 50,
         40: 112,
         36: 112,
         22: 205,
         18: 331,
         20: 250,
         43: 111,
         42: 108,
         19: 270,
         34: 137,
         14: 233,
         24: 195,
         29: 182,
         53: 83,
         15: 313,
         55: 51,
         7: 13,
         31: 170,
         56: 59,
         26: 206,
         46: 83,
         50: 73,
         11: 109,
         48: 72,
         9: 28,
         77: 2,
         67: 11,
         32: 140,
         49: 90,
         54: 60,
         41: 131,
         37: 106,
         39: 116,
         33: 132,
         52: 75,
         44: 106,
         63: 13,
         35: 121,
         51: 93,
         45: 78,
         38: 122,
         69: 3,
         64: 14,
         61: 40,
         66: 12,
         58: 46,
 

In [17]:
MAX_LEN = 64

In [18]:
tokenizer.convert_tokens_to_ids(tokenized_texts[0])

[101,
 2172,
 4339,
 3599,
 6814,
 11307,
 3769,
 2614,
 3405,
 2172,
 15594,
 2028,
 2659,
 3976,
 3769,
 11307,
 9733,
 2089,
 2092,
 4965,
 9826,
 2147,
 2750,
 20874,
 102]

In [19]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [20]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [21]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [35]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs,dtype=torch.long)
validation_inputs = torch.tensor(validation_inputs,dtype=torch.long)
train_labels = torch.tensor(train_labels,dtype=torch.float)
validation_labels = torch.tensor(validation_labels,dtype=torch.float)
train_masks = torch.tensor(train_masks,dtype=torch.long)
validation_masks = torch.tensor(validation_masks,dtype=torch.long)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  


In [36]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [24]:
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [25]:
for param in model.parameters():
    param.requires_grad = False

In [26]:
from torch import nn
import torch.nn.functional as F

class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
    
class finetuneBERT(Flatten,nn.Module):
    def __init__(self, bert_output_size, output_size):
        super(finetuneBERT, self).__init__()

        self.bertmodel = model

        self.flatten = Flatten()
        
        self.out = nn.Linear(in_features=bert_output_size,out_features=output_size)

    def forward(self, input_token, input_mask):
        hidden = self.bertmodel(input_token, input_mask)
        flatten = torch.flatten(torch.Tensor(hidden[0]),start_dim=1)
        output = nn.Softmax()(self.out(flatten))
        return output

In [27]:
model2 = finetuneBERT(MAX_LEN*768,5)

In [28]:
model2.parameters

<bound method Module.parameters of finetuneBERT(
  (bertmodel): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout)

In [55]:
#param_optimizer = list(model.named_parameters())
#param_optimizer = [p for n,p in param_optimizer]

In [29]:
optimizer = torch.optim.Adam(model2.out.parameters(),lr=.001)

In [30]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.argmax(axis=1).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [55]:
criterion = nn.BCELoss()

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model2.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    output = model2(b_input_ids,b_input_mask)
    #output = output.reshape(output.shape[0])
    loss = criterion(output, b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model2.eval()

  # Tracking variables 
  eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model2(b_input_ids,b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    #tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    tmp_eval_accuracy = accuracy_score(label_ids.argmax(axis=1),logits.argmax(axis=1)) #np.dot(logits.argmax(axis=1),label_ids)*1.0/logits.shape[0]
    eval_accuracy += tmp_eval_accuracy
    
    tmp_eval_f1 = f1_score(label_ids.argmax(axis=1),logits.argmax(axis=1),average='macro') #np.dot(logits.argmax(axis=1),label_ids)*1.0/logits.shape[0]
    eval_f1 += tmp_eval_f1
    nb_eval_steps += 1

  print("Validation Accuracy: {}, F1: {}".format(eval_accuracy/nb_eval_steps,eval_f1/nb_eval_steps))




Train loss: 0.3175483157097454


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)

Epoch:  20%|██        | 1/5 [07:08<28:34, 428.53s/it][A

Validation Accuracy: 0.4560842803030303, F1: 0.26666737877039803
Train loss: 0.23608664858718992



Epoch:  40%|████      | 2/5 [14:12<21:21, 427.30s/it][A

Validation Accuracy: 0.5604876893939393, F1: 0.32385742014131685
Train loss: 0.1992092912255878



Epoch:  60%|██████    | 3/5 [21:46<14:30, 435.06s/it][A

Validation Accuracy: 0.6749526515151515, F1: 0.20961234705717544
Train loss: 0.19267635537145283



Epoch:  80%|████████  | 4/5 [28:47<07:10, 430.82s/it][A

Validation Accuracy: 0.6131628787878788, F1: 0.24821807708214969
Train loss: 0.1347185633726803



Epoch: 100%|██████████| 5/5 [35:42<00:00, 426.19s/it][A
[A

Validation Accuracy: 0.6028645833333334, F1: 0.3166064514449794


In [56]:
val_out = model2(validation_inputs,validation_masks)



In [57]:
val_out = val_out.detach().cpu().numpy()

In [43]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [58]:
accuracy_score(validation_labels.detach().cpu().numpy().argmax(axis=1),val_out.argmax(axis=1))

0.604221635883905

In [59]:
confusion_matrix(validation_labels.detach().cpu().numpy().argmax(axis=1),val_out.argmax(axis=1))

array([[  3,   0,   2,   3,   7],
       [  0,   0,   1,   3,   5],
       [  3,   0,   6,  18,  18],
       [  1,   0,  17,  63,  82],
       [  8,   0,  26, 106, 386]])

In [60]:
f1_score(validation_labels.detach().cpu().numpy().argmax(axis=1),val_out.argmax(axis=1),average='macro')

0.2863100348951697

In [61]:
f1_score(validation_labels.detach().cpu().numpy().argmax(axis=1),val_out.argmax(axis=1),average='micro')

0.604221635883905