## Product Sentiment Data - Learning Rates

Data (public domain): https://data.world/crowdflower/brands-and-product-emotions

Notebook code based on IMDB notebook from bert-sklearn/other_examples

In [9]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import re
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from ftfy import fix_text
 
from bert_sklearn import BertClassifier
from bert_sklearn import load_model

print(os.getcwd())

DATAFILE = "./data/judge-expanded2.csv"

/Users/joseph.porter/Data/nas2019/NAS2019


In [10]:
# Load Data

    
data = pd.read_csv(DATAFILE)
print(len(data))
data = data[data['text'].notnull()]
print(len(data))
data.head(10)

8918
8917


Unnamed: 0,text,company,label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,-1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,1
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,1
3,@sxsw I hope this year's festival isn't as cra...,Apple,-1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1
6,"#SXSW is just starting, #CTIA is around the co...",Google,1
7,Beautifully smart and simple idea RT @madebyma...,Apple,1
8,Counting down the days to #sxsw plus strong Ca...,Apple,1
9,Excited to meet the @samsungmobileus at #sxsw ...,Google,1
10,Find & Start Impromptu Parties at #SXSW With @...,Google,1


In [11]:
# Split into training and test data

msk = np.random.rand(len(data)) < 0.8
train = data[msk]
test = data[~msk]
print('Training data size: ' + str(train.shape))
print('Test data size: ' + str(test.shape))

Training data size: (7167, 3)
Test data size: (1750, 3)


In [12]:
from collections import Counter

def print_dist(dataset, label='label'):
    
    dist = Counter(dataset[label])
    total = len(dataset)
    for k,v in sorted(dist.items(), key=lambda x: x[0]):
        pct = 100.0 * (float(v)/float(total))
        print(f'{k}: {v} ({pct:5.2f}%)')
    

In [13]:
print('Train dist:')
print(print_dist(train))
print('Test dist:')
print(print_dist(test))

Train dist:
-1: 2381 (33.22%)
0: 2384 (33.26%)
1: 2402 (33.51%)
None
Test dist:
-1: 589 (33.66%)
0: 585 (33.43%)
1: 576 (32.91%)
None


In [14]:
train[:1].values

array([['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
        'Apple', -1]], dtype=object)

As you can see, each review is much longer than a sentence or two. The Google AI BERT models were trained on sequences of max length 512. Lets look at the performance for max_seq_length equal to  128, 256, and 512.

### max_seq_length = 128

In [15]:
## Set up data for the classifier

train = train.sample(1400)
test = test.sample(700)

print("Train data size: %d "%(len(train)))
print("Test data size: %d "%(len(test)))

X_train = train['text']
y_train = train['label']

X_test = test['text']
y_test = test['label']

Train data size: 1400 
Test data size: 700 


In [16]:
print('Train dist:')
print(print_dist(train))
print('Test dist:')
print(print_dist(test))

Train dist:
-1: 460 (32.86%)
0: 449 (32.07%)
1: 491 (35.07%)
None
Test dist:
-1: 237 (33.86%)
0: 224 (32.00%)
1: 239 (34.14%)
None


In [17]:
## DECREASE THE LEARNING RATE 10X

model = BertClassifier(bert_model='bert-base-uncased', label_list=[-1,0,1], validation_fraction=0.5)
model.max_seq_length = 128
model.learning_rate = 2e-06
model.epochs = 3

print(model)


Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=[-1, 0, 1], learning_rate=2e-06,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.5, warmup_proportion=0.1)


In [18]:
%%time
## Train the model using our data (this could take a while)

model.fit(X_train, y_train)

accy = model.score(X_test, y_test)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 700, validation data size: 700


Training  : 100%|██████████| 22/22 [08:58<00:00, 24.47s/it, loss=1.08]
Validating: 100%|██████████| 88/88 [02:23<00:00,  1.63s/it]

Epoch 1, Train loss: 1.0790, Val loss: 0.9657, Val accy: 59.57%



Training  : 100%|██████████| 22/22 [08:35<00:00, 23.42s/it, loss=0.942]
Validating: 100%|██████████| 88/88 [02:17<00:00,  1.57s/it]

Epoch 2, Train loss: 0.9425, Val loss: 0.8823, Val accy: 64.86%



Training  : 100%|██████████| 22/22 [08:28<00:00, 23.10s/it, loss=0.867]
Validating: 100%|██████████| 88/88 [02:17<00:00,  1.56s/it]

Epoch 3, Train loss: 0.8672, Val loss: 0.8498, Val accy: 66.00%



Testing: 100%|██████████| 88/88 [02:17<00:00,  1.57s/it]


Loss: 0.8809, Accuracy: 64.29%
CPU times: user 1h 33min 14s, sys: 7min 35s, total: 1h 40min 49s
Wall time: 35min 21s





In [19]:
## SNOOPING !!!

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, labels=[-1,0,1])
print(report)

Predicting: 100%|██████████| 88/88 [02:17<00:00,  1.57s/it]

              precision    recall  f1-score   support

          -1       0.90      0.74      0.81       237
           0       0.55      0.69      0.61       224
           1       0.54      0.50      0.52       239

    accuracy                           0.64       700
   macro avg       0.66      0.64      0.65       700
weighted avg       0.66      0.64      0.65       700






In [20]:
## INCREASE THE LEARNING RATE 10X

model2 = BertClassifier(bert_model='bert-base-uncased', label_list=[-1,0,1], validation_fraction=0.5)
model2.max_seq_length = 128
model2.learning_rate = 2e-04
model2.epochs = 3

print(model)

Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=True, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=[-1, 0, 1], learning_rate=2e-06,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.5, warmup_proportion=0.1)


In [21]:
%%time
## Train the model using our data (this could take a while)

model2.fit(X_train, y_train)

accy2 = model2.score(X_test, y_test)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 700, validation data size: 700


Training  : 100%|██████████| 22/22 [08:35<00:00, 23.43s/it, loss=1.13]
Validating: 100%|██████████| 88/88 [02:20<00:00,  1.60s/it]

Epoch 1, Train loss: 1.1251, Val loss: 1.1127, Val accy: 31.14%



Training  : 100%|██████████| 22/22 [08:35<00:00, 23.44s/it, loss=1.17]
Validating: 100%|██████████| 88/88 [02:20<00:00,  1.59s/it]

Epoch 2, Train loss: 1.1650, Val loss: 1.1747, Val accy: 31.14%



Training  : 100%|██████████| 22/22 [08:37<00:00, 23.51s/it, loss=1.12]
Validating: 100%|██████████| 88/88 [02:22<00:00,  1.61s/it]

Epoch 3, Train loss: 1.1179, Val loss: 1.1036, Val accy: 33.00%



Testing: 100%|██████████| 88/88 [02:27<00:00,  1.67s/it]


Loss: 1.1037, Accuracy: 32.00%
CPU times: user 1h 32min 52s, sys: 7min 36s, total: 1h 40min 28s
Wall time: 35min 22s





In [22]:
## SNOOPING !!!

y_pred2 = model2.predict(X_test)
report2 = classification_report(y_test, y_pred2, labels=[-1,0,1])
print(report2)

Predicting: 100%|██████████| 88/88 [02:22<00:00,  1.62s/it]

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       237
           0       0.32      1.00      0.48       224
           1       0.00      0.00      0.00       239

    accuracy                           0.32       700
   macro avg       0.11      0.33      0.16       700
weighted avg       0.10      0.32      0.16       700




  'precision', 'predicted', average, warn_for)
