In [1]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import tensorflow as tf
from transformers import BertTokenizer,TFBertModel,TFBertForSequenceClassification,BertConfig,TFBertForPreTraining

In [7]:
friend_data_path = r'/home/ning/dataset'
bert_path = r"/home/ning/bert_conf"
cache_dir=r'/home/ning/bert_conf/bert-base-uncased-cache'

ckpt_path = r'/home/ning/models.ckpt'

train_df = pd.read_pickle(os.path.join(friend_data_path,'friends82_train.pkl'))
test_df = pd.read_pickle(os.path.join(friend_data_path,'friends82_test.pkl'))

train_df = train_df[train_df['emotion'].isin([ 'neutral', 'joy', 'sadness', 'anger'])]
test_df = test_df[test_df['emotion'].isin([ 'neutral', 'joy', 'sadness', 'anger'])]

#shuffle
train_df = train_df.sample(frac=1)

train_df_emo = train_df[train_df['emotion'].isin([ 'neutral', 'joy', 'sadness', 'anger'])]
test_df_emo = test_df[test_df['emotion'].isin([ 'neutral', 'joy', 'sadness', 'anger'])]

tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_path,'vocab_friends.txt'))

# model = TFBertForSequenceClassification.from_pretrained(os.path.join(ckpt_path,'friendsbert_no_pretrained.h5'),config = os.path.join(ckpt_path,'config.json'),num_labels=4)
model = TFBertForSequenceClassification.from_pretrained(os.path.join(ckpt_path,'FriendsBert_pretrain_finetune'),config = os.path.join(ckpt_path,'FriendsBert_pretrain_finetune'),num_labels=4)
# model = TFBertForSequenceClassification.from_pretrained(os.path.join(bert_path,'tf_model.h5'),config = os.path.join(bert_path,'config.json'),num_labels=4)



Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Some weights of the model checkpoint at /home/ning/models.ckpt/FriendsBert_pretrain_finetune were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at /home/ning/models.ckpt/FriendsBert_pretrain_finetune and are newly initialized: ['dropout_37']
You should probably TRAIN this model on a dow

In [8]:
def encode_sentence_with_speaker(speaker,utterance,tokenizer,sos):
    if sos:
        tokens = ['[CLS]']
    else:
        tokens = []

    if speaker not in ['other','None']:
        spk_token = '['+speaker+']'
        tokens.append(spk_token)
        tokens.append('[says]')
        tokens.extend(list(tokenizer.tokenize(utterance)))
        tokens.append('[SEP]')
    
    else:
        tokens.extend(list(tokenizer.tokenize(utterance)))
    
    return tokenizer.convert_tokens_to_ids(tokens)
    
    


def bert_encode(dataframe, tokenizer,single=False):
    
    if not single:
        num_examples = len(dataframe.index)
        sentence1 = tf.ragged.constant([encode_sentence_with_speaker(s[0],s[1],tokenizer,True) for s in dataframe.values])

        sentence2 = tf.ragged.constant([encode_sentence_with_speaker(s[2],s[3],tokenizer,False) for s in dataframe.values])

        input_word_ids = tf.concat([sentence1, sentence2], axis=-1)

        input_mask = tf.ones_like(input_word_ids).to_tensor()

        type_s1 = tf.zeros_like(sentence1)
        type_s2 = tf.ones_like(sentence2)
        input_type_ids = tf.concat([type_s1, type_s2], axis=-1).to_tensor()


        inputs = {
            'input_ids': input_word_ids.to_tensor(),
            'attention_mask': input_mask,
            'token_type_ids': input_type_ids}
    
    else:
        num_examples = len(dataframe.index)
        input_word_ids = tf.ragged.constant([encode_sentence_with_speaker(s[0],s[1],tokenizer,True) for s in dataframe.values])
        
        input_mask = tf.ones_like(input_word_ids).to_tensor()
        
        input_type_ids = tf.zeros_like(input_word_ids).to_tensor()
        
        inputs = {
            'input_ids': input_word_ids.to_tensor(),
            'attention_mask': input_mask,
            'token_type_ids': input_type_ids}
        
        
    return inputs




In [9]:
train_target = train_df['emotion']
test_target = test_df['emotion']
train_target_emo = train_df_emo['emotion']


train_features = bert_encode(train_df, tokenizer,single = False)
test_features = bert_encode(test_df, tokenizer,single = False)
train_features_emo= bert_encode(train_df_emo, tokenizer,single = False)


ems = train_target.unique()
def convert(emotion):
    return np.where(ems == emotion)[0][0]

train_labels = np.array(list(map(convert,train_target)))
test_labels = np.array(list(map(convert,test_target)))
train_labels_emo = np.array(list(map(convert,train_target_emo)))


In [9]:
def NLL(y_true,y_pred):
    
    sample_weight = np.array([ 1.44266613,  5.78943089, 18.78891821, 12.36284722])
    node1 = tf.nn.softmax(y_pred)
    node2 = sample_weight*node1
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    return loss(y_true,node2)/sample_weight.sum()


optimizer = tf.keras.optimizers.Adam(learning_rate=2.5e-8)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=NLL,
    metrics=['accuracy'])
history = model.fit(
      train_features,train_labels,
    
#       validation_data=(test_features,test_labels),
      batch_size=24,
      epochs=1)



In [13]:
m,p = test(model,test_features,test_labels,batch_size = 16,num_labels = 4)

[[1183   66   23   21]
 [  42    3   56    5]
 [  26   13    2   98]
 [  77  264    5   20]]
            Neutral       Joy   Sadness     Anger   Overall
Precision  0.890813  0.008671  0.023256  0.138889       NaN
Recall     0.914927  0.028302  0.014388  0.054645       NaN
F1-score   0.902709  0.013274  0.017778  0.078431  0.634454


In [20]:
model.save_pretrained('/home/ning/models.ckpt/FriendsBert_pretrain_finetune.h5')

In [11]:
p

Unnamed: 0,Neutral,Joy,Sadness,Anger,Overall
Precision,0.882483,0.766082,0.702703,0.666667,
Recall,0.923434,0.715847,0.490566,0.647482,
F1-score,0.902494,0.740113,0.577778,0.656934,0.839286


In [11]:


epochs = 1
batch_size = 16
num_labels = 4
def calc_confusion_matrix(matrix,predict,true):
    
    for i in range(len(predict)):
        matrix[true[i]][predict[i]] += 1
    return matrix

def calc_acc(matrix):
    r = 0
    for i,label in enumerate(matrix):
        r += matrix[i][i]
    
    return r/sum(sum(matrix))
    
def calc_performance(matrix):
    rights = matrix[[range(4)],[range(4)]]
    
    Ps = rights/matrix.sum(axis=0)
    Rs = rights/matrix.sum(axis=1)
    
    f1_scores = 2*Ps*Rs/(Ps+Rs)
    
    micro_f1_scores = rights.sum()/sum(matrix.sum(axis=0))

    overall_p = np.array([[np.nan],[np.nan],[micro_f1_scores]])
    
    perfromence_ = np.concatenate((Ps,Rs,f1_scores),axis=0)
    perfromence = np.concatenate((perfromence_,overall_p),axis=1)
    
    df = pd.DataFrame(perfromence,columns=['Neutral','Joy','Sadness','Anger','Overall'],index=['Precision','Recall','F1-score'])
    
    
    return df
    
def NLL(y_true,y_pred):
    
    sample_weight = np.array([ 1.44266613,  5.78943089, 18.78891821, 12.36284722])
    node1 = tf.nn.softmax(y_pred)
    node2 = sample_weight*node1
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    return loss(y_true,node2)/sample_weight.sum()

loss_object= tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# loss_object = NLL

# train_data_size = len(train_labels)
# steps_per_epoch = int(train_data_size / batch_size)
# num_train_steps = steps_per_epoch * epochs
# warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# optimizer = nlp.optimization.create_optimizer(
#     2.5e-6, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)



def train(model,train_f,train_l,test_f,test_l,one_step = False):

    performance = pd.DataFrame()
    batch_size = 16
    num_labels = 4
    epochs = 1
    weight = np.array([[sum(train_target==e) for e in ems]]*batch_size)
    weight = weight[0].sum()/weight
    sample_weight = tf.constant(weight,dtype=tf.float32)
    
    confusion_matrix = np.zeros((num_labels,num_labels),dtype=np.float32)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2.5e-7)
    
    for i in range(epochs):
        print("Training for epoch {}".format(i))
        for step in range(len(train_l)//batch_size):
            train_f_batch = {'input_ids': train_f['input_ids'][batch_size*step:batch_size*(step+1)],
                    'attention_mask': train_f['attention_mask'][batch_size*step:batch_size*(step+1)],
                    'token_type_ids': train_f['token_type_ids'][batch_size*step:batch_size*(step+1)]}
            train_l_batch = train_l[batch_size*step:batch_size*(step+1)]

            with tf.GradientTape() as tape:
                prediction = model(train_f_batch,training = True)
                pred_label = np.argmax(prediction[0].numpy(),axis=1)
                
                loss_value  = loss_object(y_true=train_l_batch, y_pred=prediction)
                
                
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            
            if one_step:
                return None,None
            
            
            if not step%50:
                print("Epoch {:03d}: Step:{:03d}//{:03d} Loss: {:.3f} ".format(i,step,len(train_l)//batch_size,loss_value))
                confusion_matrix,performance = test(model,test_features,test_labels,batch_size = 16,num_labels = 4)
                
    return confusion_matrix,performance



def test(model,test_f,test_l,batch_size = 16,num_labels = 4):
    
    confusion_matrix = np.zeros((num_labels,num_labels),dtype=np.float32)
    
    for step_ in range(len(test_l)//batch_size):
        test_f_batch = {'input_ids': test_f['input_ids'][batch_size*step_:batch_size*(step_+1)],
                'attention_mask': test_f['attention_mask'][batch_size*step_:batch_size*(step_+1)],
                'token_type_ids': test_f['token_type_ids'][batch_size*step_:batch_size*(step_+1)]}
        test_l_batch = test_l[batch_size*step_:batch_size*(step_+1)]

        prediction = model(test_f_batch,training = False)
        pred_label = np.argmax(prediction[0].numpy(),axis=1)

        confusion_matrix = calc_confusion_matrix(confusion_matrix,pred_label,test_l_batch)

    print(confusion_matrix.astype(np.int32))
    performance = calc_performance(confusion_matrix)

    print(performance)    
    return confusion_matrix.astype(np.int32),performance


In [12]:
confusion_matrix,performance = train(model,train_features,train_labels,test_features,test_labels)


Training for epoch 0
Epoch 000: Step:000//445 Loss: 0.102 
[[1438   93   33   22]
 [  99  348    5   28]
 [  49    4   62    4]
 [  38   19    5  121]]
            Neutral       Joy   Sadness     Anger   Overall
Precision  0.885468  0.750000  0.590476  0.691429       NaN
Recall     0.906684  0.725000  0.521008  0.661202       NaN
F1-score   0.895950  0.737288  0.553571  0.675978  0.831503
Epoch 000: Step:050//445 Loss: 0.253 
[[1429   92   37   28]
 [  99  347    5   29]
 [  49    4   62    4]
 [  35   18    5  125]]
            Neutral       Joy   Sadness     Anger  Overall
Precision  0.886476  0.752711  0.568807  0.672043      NaN
Recall     0.901009  0.722917  0.521008  0.683060      NaN
F1-score   0.893684  0.737513  0.543860  0.677507  0.82897
Epoch 000: Step:100//445 Loss: 0.176 
[[1430   91   37   28]
 [  98  347    5   30]
 [  49    4   61    5]
 [  35   18    5  125]]
            Neutral       Joy   Sadness     Anger  Overall
Precision  0.887097  0.754348  0.564815  0.664894  

In [None]:
confusion_matrix,performance = train(model,train_features_emo,train_labels_emo,test_features,test_labels)

In [7]:
a,b = test(model,test_features,test_labels,batch_size = 16,num_labels = 4)

TypeError: '_TupleWrapper' object is not callable

In [14]:
model2 = TFBertForSequenceClassification.from_pretrained(os.path.join(ckpt_path,'friendsbert_no_pretrained.h5'),config = os.path.join(ckpt_path,'config.json'),num_labels=4)
a,b = test(model2,test_features,test_labels,batch_size = 16,num_labels = 4)

Some weights of the model checkpoint at /home/ning/models.ckpt/friendsbert_no_pretrained.h5 were not used when initializing TFBertForSequenceClassification: ['dropout_112']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at /home/ning/models.ckpt/friendsbert_no_pretrained.h5 and are newly initialized: ['dropout_75']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[[1415.   96.   39.   36.]
 [  96.  352.    5.   27.]
 [  55.    3.   51.   10.]
 [  32.   23.    2.  126.]]
            Neutral       Joy   Sadness     Anger   Overall
Precision  0.885482  0.742616  0.525773  0.633166       NaN
Recall     0.892182  0.733333  0.428571  0.688525       NaN
F1-score   0.888819  0.737945  0.472222  0.659686  0.820946


In [9]:
model.save_pretrained('/home/ning/models.ckpt/')

In [14]:
model.layers[0].weights()

[<tf.Variable 'tf_bert_for_sequence_classification/bert/embeddings/word_embeddings/weight:0' shape=(30522, 768) dtype=float32, numpy=
 array([[-0.01018257, -0.06154883, -0.02649689, ..., -0.01985357,
         -0.03720997, -0.00975152],
        [-0.01153367, -0.06003894, -0.03249861, ..., -0.01657651,
         -0.03986622, -0.01054887],
        [-0.01963159, -0.06275437, -0.03288998, ..., -0.01668699,
         -0.04193879, -0.00309247],
        ...,
        [-0.02176224, -0.0556396 , -0.01346345, ..., -0.00432698,
         -0.0151355 , -0.02489496],
        [-0.04617237, -0.05647721, -0.00192082, ...,  0.01568751,
         -0.01387033, -0.00945213],
        [ 0.00145601, -0.08208051, -0.01597912, ..., -0.00811687,
         -0.04746607,  0.07527421]], dtype=float32)>,
 <tf.Variable 'tf_bert_for_sequence_classification/bert/embeddings/position_embeddings/embeddings:0' shape=(512, 768) dtype=float32, numpy=
 array([[ 0.01772893, -0.02547974, -0.03696872, ..., -0.00027289,
          0.00069

In [16]:
len(train_df)/len(test_df)

3.0004212299915753