In [1]:
emotion = "sadness"
emotion = "joy"
emotion = "anger"

emotion = "fear"

In [2]:
data_folder = "E:\data\emoint"
#data_file = "E:\\data\\emoint\\tweets_sm.xlsx"
data_file = "E:\\data\\emoint\\tweets_all.xlsx"

In [3]:
import os
import re
import csv
import time
import keras
import numpy
import random
import tensorflow

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers

In [5]:
from keras import *
from keras.utils import *
from keras.models import *
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import KFold

In [6]:
max_features = 20000
sequence_length = 100 # max number of words in one tweet 

df_list = []

for f in os.listdir(data_folder):
    if bool(re.search(r'\.txt$', f)):
        f1 = '{}\{}'.format(data_folder, f)
        df = pd.read_csv(f1, 
            header = 0,
            names = ['tweet_id', 'text', 'emotion', 'intensity'],
            delimiter = '\t',
            )
        df_list.append(df)

df_all = pd.concat(df_list)

#df_all = df_all.head(1000)

df_all.to_excel(
    '{}/tweets_all.xlsx'.format(data_folder),
    index = False,
    )


# data conversion functions

## function to convert text file to text and tag list

In [7]:
def convert_file_to_text_and_tag_list(
    emotion_tag,
    data_file,
    ):
    data = pd.read_excel(data_file)
    data['label'] = data['emotion'].apply(lambda x: 1 if x == emotion else 0)
    texts = data['text'].to_list()
    tags = data['label'].to_list()
    return texts, tags

## function to convert text file to text and intensentiy score list

In [8]:
def str_to_float(
    x):
    try:
        return float(x)
    except:
        return None
    
def convert_file_to_text_and_score_list(
    emotion_tag,
    data_file,
    ):
    data = pd.read_excel(data_file)
    data = data[data['emotion'] == emotion]
    data['intensity'] = data['intensity'].apply(str_to_float)
    data = data[data.intensity.notnull()]
    texts = data['text'].to_list()
    scores = data['intensity'].to_list()
    return texts, scores

# build the model

# parameters

## convert text list to a input format of deep learning model

In [9]:
tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True
    )

In [10]:
def texts_to_input(
    texts,
    sequence_length = 100,
    ):
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=sequence_length,
        return_attention_mask=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_tensors="tf",
        )
    # Convert batch of encoded features to numpy array.
    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
    return input_ids, attention_masks, token_type_ids

texts_to_input(["Never dull moment here"])

texts_to_input(["Never dull moment here"], method = 'one_hot')

## function of building the model of text emotion tag 

In [11]:
embedding_dim = 300
dropout_rate = 0.2
filters = 128

In [12]:
def emotion_tagger_model_building(
    embedding_dim = 300,
    filters = 128,
    kernel_size = 2,
    dropout_rate = 0.2,
    sequence_length = 100,
    ):
    ### input layers
    input_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="token_type_ids"
    )

    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False
    ###load the bert model 
    bert_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state   
    x = layers.Dropout(dropout_rate)(sequence_output)
    # Conv1D + global max pooling
    x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu")(x)
    x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    # We add a vanilla hidden layer:
    x = layers.Dense(filters, activation="relu")(x)
    x = layers.Dropout(dropout_rate)(x)
    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(2, 
        activation="softmax",
         name="predictions")(x)
    model = keras.Model([
        input_ids,
        attention_masks,
        token_type_ids,
    ], predictions)
    model.compile(
        loss="categorical_crossentropy", 
        optimizer="adam", 
        metrics=["accuracy"])
    return model

## function of train the model of emotion tagger with data

In [13]:
def train_tagger(
    texts,
    tags,
    tagger_model_path = None,
    tagger_model_weight_path = None,
    tagger_model_json_path = None,
    epochs = 10,
    validation_split = 0.1,
    dropout_rate = 0.2,
    ):
    tagger_model = emotion_tagger_model_building(
        dropout_rate = dropout_rate,
        )
    '''
    prepare the text input

    texts = [
        "i feel so fear",
        "nothing is wrong"
        ]
    '''
    x_ids, x_attention, x_type = texts_to_input(texts)
    '''
    prepare the output
    '''
    y = numpy.array(tags)
    y = to_categorical(y)
    print(x_ids.shape)
    print(x_attention.shape)
    print(x_type.shape)
    print(y.shape)
    print(numpy.sum(y, axis = 0))
    # Fit the model using the train and test datasets.
    tagger_model.fit(
        [x_ids, x_attention, x_type], y, 
        validation_split=validation_split, 
        epochs=epochs)
    if tagger_model_path is not None:
        tagger_model.save(tagger_model_path)
    return tagger_model

# function of intensity score model building

In [14]:
def emotion_scorer_model_building(
    embedding_dim = 300,
    filters = 128,
    kernel_size = 2,
    dropout_rate = 0.2,
    sequence_length = 100,
    ):   
    ### input layers
    input_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False
    ###load the bert model 
    bert_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state   
    x = layers.Dropout(dropout_rate)(sequence_output)
    # Conv1D + global max pooling
    x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu")(x)
    x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    # We add a vanilla hidden layer:
    x = layers.Dense(filters, activation="relu")(x)
    x = layers.Dropout(dropout_rate)(x)
    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, 
        activation="sigmoid",
        name="predictions")(x)
    model = keras.Model([
        input_ids,
        attention_masks,
        token_type_ids,
    ], predictions)
    model.compile(
        loss="mse", 
        optimizer="adam", 
        metrics=[metrics.mean_absolute_error])
    return model

# function of training intensity score model training

In [15]:
def train_scorer(
    texts,
    scores,
    scorer_model_path,
    epochs = 10,
    validation_split=0.1,
    ):
    scorer_model = emotion_scorer_model_building()
    '''
    prepare the text input
    texts = [
        "i feel so fear",
        "nothing is wrong"
        ]
    '''
    x_ids, x_attention, x_type = texts_to_input(texts)
    '''
    prepare the output
    '''
    y = numpy.array(scores)
    print(x_ids.shape)
    print(x_attention.shape)
    print(x_type.shape)
    print(y.shape)
    # Fit the model using the train and test datasets.
    scorer_model.fit(
        [x_ids, x_attention, x_type], 
        y, 
        validation_split=0.1, 
        epochs=epochs)
    scorer_model.save(
        scorer_model_path,
        save_format='h5')
    return scorer_model

## training the model

### K-fold cross validation function of tagger

In [16]:
def tagger_model_fold_cross_validation(
    texts,
    tags,
    n_splits = 5,
    ):
    #convert data to numpy arrays
    x_ids, x_attention, x_type = texts_to_input(texts)
    y = numpy.array(tags)
    y = to_categorical(y)
    #make the k folds
    kfold = KFold(n_splits = n_splits, shuffle=True)
    #
    acc_per_fold = []
    loss_per_fold = []
    #
    fold_no = 1
    for train, test in kfold.split(x_ids, y):
        tagger_model = emotion_tagger_model_building()
        tagger_model.fit(
            [x_ids[train], x_attention[train], x_type[train]],
            y[train], 
            epochs = 5,
            verbose = 1)
        scores = tagger_model.evaluate(
            [x_ids[test], x_attention[test], x_type[test]],
            y[test], 
            verbose = 1)
        print('accuracy of the {}-th fold:{}'.format(
            fold_no,
            scores[1]))
        acc_per_fold.append(scores[1])
        loss_per_fold.append(scores[0])
        # Increase fold number
        fold_no = fold_no + 1
    ###
    acc_10_fold_cross_validation = numpy.mean(numpy.array(acc_per_fold))
    print('accuracy of {}-fold cross validation:\t{}'.format(
        n_splits,
        acc_10_fold_cross_validation,
        ))

### K-fold cross validation function of scorer

In [17]:
def scorer_model_fold_cross_validation(
    texts,
    scores,
    n_splits = 5,
    ):
    #convert data to numpy arrays
    x_ids, x_attention, x_type = texts_to_input(texts)
    y = numpy.array(scores)
    #make the k folds
    kfold = KFold(n_splits = n_splits, shuffle=True)
    #
    acc_per_fold = []
    loss_per_fold = []
    #
    fold_no = 1
    for train, test in kfold.split(x_ids, y):
        tagger_model = emotion_scorer_model_building()
        tagger_model.fit(
            [x_ids[train], x_attention[train], x_type[train]],
            y[train], 
            epochs = 5,
            verbose = 1)
        scores = tagger_model.evaluate(
            [x_ids[test], x_attention[test], x_type[test]],
            y[test], 
            verbose = 1)
        print('mse of the {}-th fold:{}'.format(
            fold_no,
            scores[1]))
        acc_per_fold.append(scores[1])
        loss_per_fold.append(scores[0])
        # Increase fold number
        fold_no = fold_no + 1
    ###
    acc_10_fold_cross_validation = numpy.mean(numpy.array(acc_per_fold))
    print('mse of {}-fold cross validation:\t{}'.format(
        n_splits,
        acc_10_fold_cross_validation,
        ))

# training of each emotion

### tagger training 

#### load data

In [18]:
fear_texts, fear_tags = convert_file_to_text_and_tag_list(
    emotion_tag = emotion,
    data_file = data_file)

#### 5-fold cross validation

In [19]:
tagger_model_fold_cross_validation(
    fear_texts,
    fear_tags,
    )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were init

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy of the 1-th fold:0.9087470173835754


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy of the 2-th fold:0.901135265827179


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy of the 3-th fold:0.9025543928146362


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy of the 4-th fold:0.900189220905304


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy of the 5-th fold:0.9063386917114258
accuracy of 5-fold cross validation:	0.9037929177284241


#### train single model

In [20]:
start_time = time.time()

fear_tagger = train_tagger(
    texts = fear_texts,
    tags = fear_tags,
    tagger_model_path = '/data/emoint/{}_tagger_bert.h5'.format(emotion),
    epochs = 10,
    validation_split = 0.1,
    )

end_time = time.time()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


(10571, 100)
(10571, 100)
(10571, 100)
(10571, 2)
[7219. 3352.]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
print('training time of scorer:\t{}'.format(end_time - start_time))

training time of scorer:	5203.104001760483


### scorer training

#### laod data

In [22]:
fear_texts, fear_scores = convert_file_to_text_and_score_list(
    emotion_tag = emotion,
    data_file = data_file)

#### 5-fold cross validation

In [23]:
scorer_model_fold_cross_validation(
    fear_texts,
    fear_scores,
    )

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mse of the 1-th fold:0.10018211603164673


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

#### train single model

In [None]:
start_time = time.time()

fear_tagger = train_scorer(
    texts = fear_texts,
    scores = fear_scores,
    scorer_model_path = '/data/emoint/{}_scorer_bert.h5'.format(emotion),
    epochs = 10,
    validation_split = 0.1,
    )

end_time = time.time()

In [None]:
print('training time of scorer:\t{}'.format(end_time - start_time))