## Product Sentiment Data - Imbalance 2

Data (public domain): https://data.world/crowdflower/brands-and-product-emotions

Notebook code based on IMDB notebook from bert-sklearn/other_examples

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import re
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from ftfy import fix_text
 
from bert_sklearn import BertClassifier
from bert_sklearn import load_model

# Show the working directory so the relative DATAFILE path below can be checked.
# NOTE(review): the saved output of this line exposes an absolute local path;
# consider clearing it before sharing the notebook.
print(os.getcwd())

# Location of the input CSV, relative to the working directory printed above.
DATAFILE = "./data/judge-expanded2.csv"

/Users/joseph.porter/Data/nas2019/NAS2019


In [2]:
# Load Data
# Read the CSV, report the row count, discard rows whose 'text' is missing,
# report the row count again, then preview the first ten rows.

data = pd.read_csv(DATAFILE)
print(len(data))
data = data.dropna(subset=['text'])
print(len(data))
data.head(10)

8918
8917


Unnamed: 0,text,company,label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,-1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,1
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,1
3,@sxsw I hope this year's festival isn't as cra...,Apple,-1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1
6,"#SXSW is just starting, #CTIA is around the co...",Google,1
7,Beautifully smart and simple idea RT @madebyma...,Apple,1
8,Counting down the days to #sxsw plus strong Ca...,Apple,1
9,Excited to meet the @samsungmobileus at #sxsw ...,Google,1
10,Find & Start Impromptu Parties at #SXSW With @...,Google,1


In [3]:
# Split into training and test data
#
# Each row is assigned to the training set with probability TRAIN_FRACTION, so
# the resulting sizes are approximate rather than exact. The mask is drawn from
# a dedicated, seeded generator so that Restart & Run All reproduces the same
# split instead of a different one on every execution.

SPLIT_SEED = 42       # fixed seed -> identical train/test membership on every run
TRAIN_FRACTION = 0.8  # expected share of rows that land in the training set

split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(data)) < TRAIN_FRACTION
train = data[msk]
test = data[~msk]
print('Training data size: ' + str(train.shape))
print('Test data size: ' + str(test.shape))

Training data size: (7138, 3)
Test data size: (1779, 3)


In [4]:
from collections import Counter

def print_dist(dataset, label='label'):
    """Print how often each value of a column occurs, in ascending value order.

    One line is printed per distinct value, formatted as
    ``value: count (percent%)``, where percent is the value's share of
    ``len(dataset)``.

    Args:
        dataset: Object indexable by ``label`` and supporting ``len()``
            (the notebook passes pandas DataFrames).
        label: Key of the column whose values are tallied.

    Returns:
        None. The distribution is written to stdout only.
    """
    counts = Counter(dataset[label])
    n_rows = len(dataset)
    for value in sorted(counts):
        count = counts[value]
        share = 100.0 * (count / n_rows)
        print(f'{value}: {count} ({share:5.2f}%)')
    

In [5]:
# print_dist() writes the distribution to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
print('Train dist:')
print_dist(train)
print('Test dist:')
print_dist(test)

Train dist:
-1: 2388 (33.45%)
0: 2379 (33.33%)
1: 2371 (33.22%)
None
Test dist:
-1: 582 (32.72%)
0: 590 (33.16%)
1: 607 (34.12%)
None


In [6]:
# Peek at the first training row as a raw NumPy array, so the text is shown
# in full instead of the truncated DataFrame preview.
train.iloc[:1].values

array([["@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
        'Apple', 1]], dtype=object)

As you can see, each example here is a single short tweet, not a long multi-sentence review like those in the IMDB notebook this code is based on. The Google AI BERT models were trained on sequences of max length 512. Let's look at the performance for max_seq_length equal to 128, 256, and 512.

### max_seq_length = 128

In [7]:
## Set up data for the classifier
#
# Down-sample both splits (presumably to keep training time manageable). The
# draws are seeded so the same rows are selected on every run; without a
# random_state each execution trained and evaluated on a different subset.
# Note that `train` and `test` are rebound to the reduced frames here.

SAMPLE_SEED = 42        # fixed seed for reproducible sub-sampling
N_TRAIN_SAMPLES = 1400  # rows kept from the training split
N_TEST_SAMPLES = 700    # rows kept from the test split

train = train.sample(N_TRAIN_SAMPLES, random_state=SAMPLE_SEED)
test = test.sample(N_TEST_SAMPLES, random_state=SAMPLE_SEED)

print("Train data size: %d "%(len(train)))
print("Test data size: %d "%(len(test)))

# Model inputs are the raw tweet texts; targets are the sentiment labels.
X_train = train['text']
y_train = train['label']

X_test = test['text']
y_test = test['label']

Train data size: 1400 
Test data size: 700 


In [8]:
# print_dist() writes the distribution to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
print('Train dist:')
print_dist(train)
print('Test dist:')
print_dist(test)

Train dist:
-1: 460 (32.86%)
0: 462 (33.00%)
1: 478 (34.14%)
None
Test dist:
-1: 253 (36.14%)
0: 218 (31.14%)
1: 229 (32.71%)
None


In [9]:
## Create the model
# Build a BERT classifier over the three labels -1/0/1. All hyperparameters are
# supplied to the constructor rather than assigned on the instance afterwards.
# validation_fraction=0.5 holds out half of the data passed to fit() for validation.

model = BertClassifier(
    bert_model='bert-base-uncased',
    label_list=[-1, 0, 1],
    validation_fraction=0.5,
    max_seq_length=128,
    learning_rate=2e-05,
    epochs=4,
)

print(model)


Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=4, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=[-1, 0, 1], learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.5, warmup_proportion=0.1)


In [10]:
%%time
## Train the model using our data (this could take a while)
# fit() fine-tunes on the training texts/labels; per the log below it holds out
# part of that data for validation (validation_fraction was set at construction).

model.fit(X_train, y_train)

# Evaluate on the test split; the returned value is kept in `accy`.
accy = model.score(X_test, y_test)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 700, validation data size: 700


Training  : 100%|██████████| 22/22 [08:56<00:00, 24.40s/it, loss=0.931]
Validating: 100%|██████████| 88/88 [02:28<00:00,  1.68s/it]

Epoch 1, Train loss: 0.9310, Val loss: 0.8605, Val accy: 56.29%



Training  : 100%|██████████| 22/22 [09:02<00:00, 24.67s/it, loss=0.687]
Validating: 100%|██████████| 88/88 [02:30<00:00,  1.71s/it]

Epoch 2, Train loss: 0.6868, Val loss: 0.6790, Val accy: 62.71%



Training  : 100%|██████████| 22/22 [08:55<00:00, 24.35s/it, loss=0.584]
Validating: 100%|██████████| 88/88 [02:31<00:00,  1.73s/it]

Epoch 3, Train loss: 0.5842, Val loss: 0.6348, Val accy: 70.43%



Training  : 100%|██████████| 22/22 [08:57<00:00, 24.45s/it, loss=0.509]
Validating: 100%|██████████| 88/88 [02:23<00:00,  1.63s/it]

Epoch 4, Train loss: 0.5088, Val loss: 0.5916, Val accy: 71.43%



Testing: 100%|██████████| 88/88 [02:27<00:00,  1.68s/it]


Loss: 0.5943, Accuracy: 71.86%
CPU times: user 2h 4min 31s, sys: 10min 27s, total: 2h 14min 58s
Wall time: 48min 19s





In [11]:
## SNOOPING !!!
# NOTE(review): "snooping" presumably flags that this inspects test-set results,
# so any tuning based on this report leaks test information; confirm intent.

# Predict labels for the test texts and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, labels=[-1,0,1])
print(report)

Predicting: 100%|██████████| 88/88 [02:25<00:00,  1.65s/it]

              precision    recall  f1-score   support

          -1       0.93      0.85      0.89       253
           0       0.63      0.64      0.63       218
           1       0.60      0.65      0.62       229

    accuracy                           0.72       700
   macro avg       0.72      0.71      0.72       700
weighted avg       0.73      0.72      0.72       700






In [12]:
# Run fit() again with epochs set to 2, then re-score on the test split.
# NOTE(review): load_at_start=False appears to continue from the weights already
# held by `model` rather than reloading the pretrained checkpoint (the log below
# shows no "Loading ..." lines); confirm against the bert-sklearn documentation.
model.epochs = 2
model.fit(X_train, y_train, load_at_start=False)
accy2 = model.score(X_test, y_test)

train data size: 700, validation data size: 700


Training  : 100%|██████████| 22/22 [09:04<00:00, 24.75s/it, loss=0.615]
Validating: 100%|██████████| 88/88 [02:30<00:00,  1.71s/it]

Epoch 1, Train loss: 0.6152, Val loss: 0.5757, Val accy: 74.00%



Training  : 100%|██████████| 22/22 [08:55<00:00, 24.32s/it, loss=0.391]
Validating: 100%|██████████| 88/88 [02:23<00:00,  1.63s/it]

Epoch 2, Train loss: 0.3912, Val loss: 0.5940, Val accy: 75.29%



Testing: 100%|██████████| 88/88 [02:20<00:00,  1.59s/it]


Loss: 0.6589, Accuracy: 73.71%





In [13]:
## SNOOPING !!!
# NOTE(review): "snooping" presumably flags that this inspects test-set results
# again after further training, which leaks test information; confirm intent.

# Predict labels for the test texts and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, labels=[-1,0,1])
print(report)

Predicting: 100%|██████████| 88/88 [02:27<00:00,  1.68s/it]

              precision    recall  f1-score   support

          -1       0.90      0.90      0.90       253
           0       0.65      0.57      0.61       218
           1       0.64      0.72      0.68       229

    accuracy                           0.74       700
   macro avg       0.73      0.73      0.73       700
weighted avg       0.74      0.74      0.74       700






In [52]:
# Persist the fine-tuned classifier. The target directory is created first so
# the save does not fail when 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
model.save('models/model1_128_bb_uncased.mdl')

### max_seq_length = 256

In [None]:
%%time
## Don't use this one - it will take a very long time!

model = BertClassifier(bert_model='bert-base-uncased', label_list=[-1,0,1])
model.max_seq_length = 256
model.train_batch_size = 32
model.learning_rate = 2e-05
model.epochs = 4

print(model)

model.fit(X_train, y_train)

accy = model.score(X_test, y_test)

### max_seq_length = 512

In [None]:
%%time
## Don't use this one - it will take the longest of all!

model = BertClassifier(bert_model='bert-base-uncased', label_list=[-1,0,1])
model.max_seq_length = 512

# max_seq_length=512 will use a lot more GPU mem, so I am turning down batch size 
# and adding gradient accumulation steps
model.train_batch_size = 16
model_gradient_accumulation_steps = 4

model.learning_rate = 2e-05
model.epochs = 4

print(model)

model.fit(X_train, y_train)

accy = model.score(X_test, y_test)