# Sentiment analysis of IMDB reviews with BERT
Based on the tutorial: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671

In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
import os 
import shutil

In [2]:
# Switch the working directory to the data folder; later cells use relative
# paths ('.' and 'aclImdb/train') that are resolved against it.
data_root = '/home/bettyliao/sentiment/data/'
os.chdir(data_root)

In [3]:
# 'bert-base-uncased' is the uncased checkpoint, i.e. it is not case-sensitive.
checkpoint = 'bert-base-uncased'

# Sequence-classification model and its matching tokenizer, both loaded
# from the same pretrained checkpoint.
model = TFBertForSequenceClassification.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Print the layer/parameter overview of the loaded model.
model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [4]:
URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# tf.keras.utils.get_file downloads a resource from the web.
# cache_dir='.' with an empty cache_subdir targets the current working
# directory, and untar=True asks for the archive to be unpacked.
download_options = {
    'fname': 'aclImdb_v1.tar.gz',
    'origin': URL,
    'untar': True,
    'cache_dir': '.',
    'cache_subdir': '',
}
dataset = tf.keras.utils.get_file(**download_options)


In [5]:
# Dataset root (.../aclImdb), located next to the path returned by get_file.
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

# Training directory (.../aclImdb/train) as an absolute path.
# os.path.abspath resolves a relative path against the current working
# directory, so no machine-specific prefix or string surgery (which would
# also strip legitimate dots from the path) is needed.
train_dir = os.path.abspath(os.path.join(main_dir, 'train'))

# Drop the 'unsup' sub-folder so that only the remaining class folders are
# left for text_dataset_from_directory. The existence check keeps this cell
# re-runnable: a second run no longer raises FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

print(os.listdir(train_dir))

['unsupBow.feat', 'urls_pos.txt', 'urls_neg.txt', 'urls_unsup.txt', 'neg', 'pos', 'labeledBow.feat']


In [6]:
# Both subsets are carved out of the same 'aclImdb/train' folder. Identical
# seed and validation_split are passed to both calls so that the two subsets
# come from the same split. batch_size=30000 exceeds the number of files, so
# each subset arrives as a single batch.
split_options = dict(
    batch_size = 30000,
    validation_split = 0.2,  # share of the data held out for validation
    seed = 123,
)

train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset = 'training', **split_options)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset = 'validation', **split_options)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [7]:
# Take the first batch of the training dataset and convert its
# (texts, labels) pair to NumPy arrays.
for text_batch, label_batch in train.take(1):
    train_feat = text_batch.numpy()
    train_lab = label_batch.numpy()
    print('train_feat: ', len(train_feat),
          'train_lab: ', len(train_lab))

# Two rows (texts, labels) named up front, then transposed so that each
# review becomes one row with a 'data_column' and a 'label_column'.
train = pd.DataFrame([train_feat, train_lab],
                     index = ['data_column', 'label_column']).T
# The texts arrive as bytes; decode them to str.
train['data_column'] = train['data_column'].str.decode('utf-8')
train.head()

train_feat:  20000 train_lab:  20000


Unnamed: 0,data_column,label_column
0,Canadian director Vincenzo Natali took the art...,1
1,I gave this film 10 not because it is a superb...,1
2,I admit to being somewhat jaded about the movi...,1
3,"For a long time, 'The Menagerie' was my favori...",1
4,A truly frightening film. Feels as if it were ...,0


In [8]:
# Same conversion as for the training data: take the first batch of the
# validation dataset and convert it to NumPy arrays.
for text_batch, label_batch in test.take(1):
    test_feat = text_batch.numpy()
    test_lab = label_batch.numpy()

# Two rows (texts, labels) named up front, then transposed into one row per review.
test = pd.DataFrame([test_feat, test_lab],
                    index = ['data_column', 'label_column']).T
# The texts arrive as bytes; decode them to str.
test['data_column'] = test['data_column'].str.decode('utf-8')
test.head()

Unnamed: 0,data_column,label_column
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


In [9]:
# Minimal demonstration of an InputExample: a single text (no second
# segment, no id) together with its label.
InputExample(guid=None, text_a='Hello world', text_b=None, label=1)

InputExample(guid=None, text_a='Hello world', text_b=None, label=1)

In [10]:
def convert_data_to_examples(train, test, data_column, label_column):
    """Turn the rows of two DataFrames into InputExample objects.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the validation rows.
        data_column: name of the column containing the text.
        label_column: name of the column containing the label.

    Returns:
        A pair ``(train_examples, validation_examples)``; each element is the
        result of a row-wise ``DataFrame.apply`` and contains one
        InputExample per input row.
    """
    def to_example(row):
        # Single-sentence example: no guid and no second text segment.
        return InputExample(guid=None,
                            text_a=row[data_column],
                            text_b=None,
                            label=row[label_column])

    train_examples = train.apply(to_example, axis=1)
    validation_examples = test.apply(to_example, axis=1)
    return train_examples, validation_examples

# Wrap every row of the train / validation DataFrames in an InputExample.
train_input, test_input = convert_data_to_examples(
    train,
    test,
    data_column = 'data_column',
    label_column = 'label_column',
)

In [11]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length = 128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects with ``text_a`` (the text) and
            ``label`` attributes, e.g. InputExample instances.
        tokenizer: tokenizer that is called on each text.
        max_length: every sequence is truncated and padded to exactly this
            many tokens.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)`` pairs.
        ``features`` is a dict with the int32 vectors ``input_ids``,
        ``attention_mask`` and ``token_type_ids``; ``label`` is an int64
        scalar.
    """
    features = []  # one InputFeatures per example, consumed by gen() below

    for e in examples:
        # Calling the tokenizer with padding='max_length' replaces the
        # deprecated encode_plus(..., pad_to_max_length=True) form.
        input_dict = tokenizer(e.text_a,
                               add_special_tokens = True,
                               max_length = max_length,
                               padding = 'max_length',  # pad every sequence up to max_length
                               truncation = True,       # cut sequences longer than max_length
                               return_token_type_ids = True,
                               return_attention_mask = True
                              )
        features.append(InputFeatures(
            input_ids = input_dict['input_ids'],
            attention_mask = input_dict['attention_mask'],
            token_type_ids = input_dict['token_type_ids'],
            label = e.label
        ))

    def gen():
        # Yield (features, label) in the structure declared to from_generator.
        for f in features:
            yield (
                {
                    'input_ids': f.input_ids,
                    'attention_mask': f.attention_mask,
                    'token_type_ids': f.token_type_ids
                },
                f.label
            )

    return tf.data.Dataset.from_generator(
        gen,
        output_types = ({'input_ids': tf.int32,
                         'attention_mask': tf.int32,
                         'token_type_ids': tf.int32
                        },
                        tf.int64
                       ),
        output_shapes = ({'input_ids': tf.TensorShape([None]),
                          'attention_mask': tf.TensorShape([None]),
                          'token_type_ids': tf.TensorShape([None])
                         },
                         tf.TensorShape([])
                        )
    )

In [12]:
# Training pipeline: tokenize, shuffle with a 100-element buffer, batch by 32
# and repeat the whole dataset twice.
train_examples = list(train_input)
train_data = convert_examples_to_tf_dataset(train_examples, tokenizer)
train_data = train_data.shuffle(100)
train_data = train_data.batch(32)
train_data = train_data.repeat(2)

# Validation pipeline: tokenize and batch only (no shuffling, no repetition).
validation_examples = list(test_input)
validation_data = convert_examples_to_tf_dataset(validation_examples, tokenizer)
validation_data = validation_data.batch(32)



## Configuring the BERT model and fine-tuning

In [13]:
# Adam optimizer; clipnorm=1.0 clips the gradient norm.
optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5, epsilon = 1e-08, clipnorm = 1.0)
# Integer class labels; from_logits=True because the loss receives
# un-normalised scores rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer = optimizer, loss = loss, metrics = [accuracy])

# Fine-tune for two epochs, evaluating on the validation data.
model.fit(train_data, epochs = 2, validation_data = validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3778101908>

## Making predictions

In [14]:
# Two hand-written reviews to classify.
pred_sentences = [
    'This was an awesome movie. I watch it twice my time watching this beautiful movie if I known it was this good',
    'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie',
]

# Tokenize the whole list at once and return TensorFlow tensors.
tf_batch = tokenizer(pred_sentences,
                     max_length = 128,
                     padding = True,
                     truncation = True,
                     return_tensors = 'tf')
tf_outputs = model(tf_batch)

# The first element of the model output holds the class scores; softmax
# turns them into probabilities and argmax picks the winning class index.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis = -1)
labels = ['Negative', 'Positive']
label = tf.argmax(tf_predictions, axis = 1).numpy()

for sentence, class_index in zip(pred_sentences, label):
    print(sentence, ":\n", labels[class_index])

This was an awesome movie. I watch it twice my time watching this beautiful movie if I known it was this good :
 Positive
One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie :
 Negative
