In [1]:
#######################################
### -------- Load libraries ------- ###

# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# And pandas for data import + sklearn because you allways need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [2]:
data_path = '/home/Danny/Data-Mining/lab2/kaggle/data/'

json_list = list()
with open(data_path + 'tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))
        
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    tweet_list.append([tweet_id, text])

In [3]:
emotion_df = pd.read_csv(data_path + 'emotion.csv')
emotion_df

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [4]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(emotion_df['emotion'])
emotion_df['label'] = label_encoder.transform(emotion_df['emotion'])
emotion_df

Unnamed: 0,tweet_id,emotion,label
0,0x3140b1,sadness,5
1,0x368b73,disgust,2
2,0x296183,anticipation,1
3,0x2bd6e1,joy,4
4,0x2ee1dd,anticipation,1
...,...,...,...
1455558,0x38dba0,joy,4
1455559,0x300ea2,joy,4
1455560,0x360b99,fear,3
1455561,0x22eecf,joy,4


In [5]:
identification_df = pd.read_csv(data_path + 'data_identification.csv')
identification_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [6]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])
text_df

Unnamed: 0,tweet_id,text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,0x28b412,"Confident of your obedience, I write to you, k..."
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us..."


In [7]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
train_df

Unnamed: 0,tweet_id,text,emotion,label
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,1
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,5
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,3
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,4
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,1
...,...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,4
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy,4
1455560,0x2cbca6,there's currently two girls walking around the...,joy,4
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy,4


In [8]:
train_df['label2'] = pd.Categorical(train_df['emotion'])
train_df

Unnamed: 0,tweet_id,text,emotion,label,label2
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,1,anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,5,sadness
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,3,fear
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,4,joy
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,1,anticipation
...,...,...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,4,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy,4,joy
1455560,0x2cbca6,there's currently two girls walking around the...,joy,4,joy
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy,4,joy


In [9]:
train_df['label3'] = train_df['label2'].cat.codes
train_df

Unnamed: 0,tweet_id,text,emotion,label,label2,label3
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,1,anticipation,1
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,5,sadness,5
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,3,fear,3
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,4,joy,4
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,1,anticipation,1
...,...,...,...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,4,joy,4
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy,4,joy,4
1455560,0x2cbca6,there's currently two girls walking around the...,joy,4,joy,4
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy,4,joy,4


In [10]:
predict_df = identification_df[identification_df['identification'] == 'test']
predict_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
3,0x2db41f,test
15,0x2466f6,test
23,0x23f9e9,test
31,0x1fb4e1,test
...,...,...
1867495,0x2c4dc2,test
1867496,0x31be7c,test
1867500,0x1ca58e,test
1867515,0x35c8ba,test


In [11]:
predict_df = predict_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
predict_df

Unnamed: 0,tweet_id,identification,text
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne..."
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...
...,...,...,...
411967,0x2c4dc2,test,6 year old walks in astounded. Mum! Look how b...
411968,0x31be7c,test,Only one week to go until the #inspiringvolunt...
411969,0x1ca58e,test,"I just got caught up with the manga for ""My He..."
411970,0x35c8ba,test,Speak only when spoken to and make hot ass mus...


In [12]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(train_df.shape)
print(test_df.shape)

(1164450, 6)
(291113, 6)


In [13]:
#######################################
### --------- Setup BERT ---------- ###

# Name of the BERT model to use
model_name = 'bert-base-uncased'

# Max length of tokens
max_length = 48

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)

# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
#######################################
### ------- Build the model ------- ###

# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}

# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# Then build your model output
output_id = Dense(units=len(train_df.label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='output_id')(pooled_output)
outputs = {'output_id': output_id}

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiClass')

# Take a look at the model
model.summary()

Model: "BERT_MultiClass"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 48)]              0         
_________________________________________________________________
bert (TFBertMainLayer)       TFBaseModelOutputWithPool 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
output_id (Dense)            (None, 8)                 6152      
Total params: 109,488,392
Trainable params: 109,488,392
Non-trainable params: 0
_________________________________________________________________


In [15]:
#######################################
### ------- Train the model ------- ###

# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics
loss = {'output_id': CategoricalCrossentropy(from_logits = True), 'label': CategoricalCrossentropy(from_logits = True)}
metric = {'output_id': CategoricalAccuracy('accuracy'), 'label': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Ready output data for the model
y_issue = to_categorical(train_df['label'])

# Tokenize the input (takes some time)
x = tokenizer(
    text=train_df['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Fit the model
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'emotion': y_issue},
    validation_split=0.2,
    batch_size=64,
    epochs=10)


Epoch 1/10


ValueError: in user code:

    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:543 train_step  **
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/compile_utils.py:391 update_state
        self._build(y_pred, y_true)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/compile_utils.py:322 _build
        self._metrics, y_true, y_pred)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/nest.py:1118 map_structure_up_to
        **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/nest.py:1200 map_structure_with_tuple_paths_up_to
        expand_composites=expand_composites)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/nest.py:835 assert_shallow_structure
        input_length=len(input_tree), shallow_length=len(shallow_tree)))

    ValueError: The two structures don't have the same sequence length. Input structure has length 2, while shallow structure has length 1.
