<a href="https://colab.research.google.com/github/zwycl/flink-twitter-senti/blob/master/tf_models/valence_regression_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

In [0]:
import numpy as np
import tensorflow as tf
import pandas as pd
import re
import os
from google.colab import drive
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from transformers import (TFDistilBertForSequenceClassification, 
                          TFDistilBertModel,
                          DistilBertTokenizerFast,
                          DistilBertConfig)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm

# Mount Google Drive; the dataset paths below live under the mounted folder.
drive.mount('/content/drive/')

# Read the three tab-delimited splits (train, dev, gold test) in a single pass.
train_dataset, validation_dataset, test_dataset = [
    pd.read_csv(split_path, delimiter='\t')
    for split_path in (
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-train.txt',
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-dev.txt',
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-test-gold.txt',
    )
]

def preprocess(df):
  """Return a cleaned copy of a raw split, leaving `df` itself untouched.

  Args:
    df: DataFrame with at least the columns 'ID', 'Tweet' and
      'Intensity Score'. 'ID' holds dash-separated strings whose third field
      is numeric.

  Returns:
    A new DataFrame with exactly the columns ['ID', 'Tweet',
    'Intensity Score'], where @mentions are removed from 'Tweet', the text is
    lower-cased, and 'ID' is the numeric third field of the original id.
  """
  # Work on a copy so the caller's frame is not modified and the function can
  # be called again on the same input.
  cleaned = df[['ID', 'Tweet', 'Intensity Score']].copy()

  # `_` is part of the mention pattern: without it "@some_user" would leave
  # "_user" behind in the text.
  cleaned['Tweet'] = (
      cleaned['Tweet']
      .str.replace(r"@[A-Za-z0-9_]+", "", regex=True)
      .str.lower()
  )

  # Keep only the third dash-separated field of the id and make it numeric.
  cleaned['ID'] = pd.to_numeric(cleaned['ID'].str.split('-').str[2])
  return cleaned

# Run the same cleaning routine over every split; each name is rebound to the
# cleaned frame returned by preprocess().
train_dataset, validation_dataset, test_dataset = [
    preprocess(split)
    for split in (train_dataset, validation_dataset, test_dataset)
]

def create_tf_example_continous_label(features,label):
  """Serialize one (id, tweet, score) record as a `tf.train.Example`.

  Args:
    features: Ordered container (list, tuple or pandas Series) whose first
      element is the integer id and whose second element is the tweet text.
    label: Continuous target, stored in the float feature 'label'.

  Returns:
    The serialized Example with the features 'idx' (int64), 'tweet'
    (UTF-8 encoded bytes) and 'label' (float).
  """
  # Unpack by iteration order instead of `features[0]` / `features[1]`:
  # integer subscripts on a label-indexed pandas Series only work through a
  # deprecated positional fallback, while iteration is positional for lists,
  # tuples and Series alike.
  idx, tweet = list(features)[:2]
  tf_example = tf.train.Example(features = tf.train.Features(feature = {
      'idx': tf.train.Feature(int64_list=tf.train.Int64List(value=[idx])),
      'tweet': tf.train.Feature(bytes_list=tf.train.BytesList(value=[tweet.encode('utf-8')])),
      'label': tf.train.Feature(float_list=tf.train.FloatList(value=[label]))
      }))
  return tf_example.SerializeToString()

def convert_df_to_tfrecord(df, file_name):
  """Write every row of `df` to the TFRecord file `file_name`.

  The last column of each row is used as the label; all preceding columns are
  passed, in order, as the features of create_tf_example_continous_label.

  Args:
    df: DataFrame to export, one record per row.
    file_name: Destination path of the TFRecord file.
  """
  # The context manager closes the writer even if a row fails to serialize.
  with TFRecordWriter(file_name) as writer:
    for _, row in df.iterrows():
      # .iloc makes the positional split explicit; the row is indexed by
      # column names, so bare integer subscripts would be label lookups.
      features, label = row.iloc[:-1], row.iloc[-1]
      writer.write(create_tf_example_continous_label(features, label))

# Make sure the output folder exists before writing; exist_ok keeps this cell
# safe to re-run, unlike a bare os.mkdir that fails on the second run.
RECORDS_DIR = 'drive/My Drive/dataset/records'
os.makedirs(RECORDS_DIR, exist_ok=True)

# One TFRecord file per split.
for split_frame, record_name in (
    (train_dataset, 'valence_train.tfrecord'),
    (validation_dataset, 'valence_validate.tfrecord'),
    (test_dataset, 'valence_test.tfrecord'),
):
  convert_df_to_tfrecord(split_frame, RECORDS_DIR + '/' + record_name)

In [0]:
# Tokenizer and model are both loaded from the same pretrained checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# num_labels=1 configures the sequence-classification head with one output,
# which this notebook trains as a regression target.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
)

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
  """Encode every sentence into a list of token ids.

  Args:
    sentences: Iterable of strings to encode.
    tokenizer: Object exposing `encode(text, add_special_tokens=...,
      max_length=...)`.
    max_seq_len: Value forwarded to the tokenizer as `max_length`.

  Returns:
    A list with one encoded sequence per input sentence, in input order.
    Progress is reported through tqdm while encoding.
  """
  return [
      tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_len)
      for text in tqdm(sentences)
  ]

def create_attention_masks(sentences):
  """Build a 0/1 mask for each sequence of token ids.

  Args:
    sentences: Iterable of token-id sequences.

  Returns:
    numpy array holding, for every sequence, 1 where the token id is greater
    than 0 and 0 elsewhere.
  """
  masks = [[int(tok > 0) for tok in token_ids] for token_ids in sentences]
  return np.asarray(masks)

In [0]:
# Single source of truth for the padded sequence length.
MAX_SEQ_LEN = 30

def _encode_split(tweets):
  """Tokenize, pad and mask one split of tweets.

  Args:
    tweets: Iterable of tweet strings.

  Returns:
    (ids, masks): `ids` is the array returned by pad_sequences with
    MAX_SEQ_LEN columns (zero-padded at the end); `masks` is the matching
    0/1 array from create_attention_masks.
  """
  # Ask the tokenizer itself for at most MAX_SEQ_LEN ids, so that when it
  # truncates it can keep the closing special token; cutting afterwards with
  # pad_sequences simply drops whatever sits past position MAX_SEQ_LEN.
  token_ids = tokenize_sentences(tweets, tokenizer, max_seq_len=MAX_SEQ_LEN)
  # pad_sequences still truncates as a fallback and pads short tweets with 0.
  padded_ids = pad_sequences(token_ids, maxlen=MAX_SEQ_LEN, dtype="long",
                             value=0, truncating="post", padding="post")
  return padded_ids, create_attention_masks(padded_ids)

train_ids, train_masks = _encode_split(train_dataset['Tweet'])
validation_ids, validation_masks = _encode_split(validation_dataset['Tweet'])
test_ids, test_masks = _encode_split(test_dataset['Tweet'])

def create_dataset(ids, labels, masks=None):
  """Build an unbatched tf.data.Dataset of (inputs, label) examples.

  Args:
    ids: 2-D array of padded token ids, one row per example.
    labels: Sequence of float targets, paired with `ids` by position.
    masks: Optional array with the same shape as `ids`. When given, the
      inputs of each example are the dict
      {'input_ids': ids_row, 'attention_mask': mask_row}; when omitted, the
      inputs are the bare id row.

  Returns:
    A tf.data.Dataset yielding one example at a time (int32 inputs, float32
    label). Callers are expected to batch it before training.

  Raises:
    ValueError: if `ids`, `labels` (and `masks`, when given) differ in length.
  """
  if len(ids) != len(labels) or (masks is not None and len(masks) != len(ids)):
    raise ValueError("ids, labels and masks must have the same length")

  # Per-example shape comes from the data, so it follows whatever padding
  # length produced `ids`.
  seq_shape = tf.TensorShape(np.shape(ids)[1:])
  label_shape = tf.TensorShape([])

  if masks is None:
    def gen():
      # zip pairs rows and labels by position, independent of any pandas index.
      for token_ids, target in zip(ids, labels):
        yield (token_ids, target)
    return tf.data.Dataset.from_generator(gen,
                                          (tf.int32, tf.float32),
                                          (seq_shape, label_shape))

  def gen_with_masks():
    for token_ids, mask, target in zip(ids, masks, labels):
      yield ({'input_ids': token_ids, 'attention_mask': mask}, target)
  return tf.data.Dataset.from_generator(
      gen_with_masks,
      ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.float32),
      ({'input_ids': seq_shape, 'attention_mask': seq_shape}, label_shape))

# Pass the attention masks along with the ids so the model can ignore padding.
train_dataset_tf = create_dataset(train_ids, train_dataset['Intensity Score'], train_masks)
validation_dataset_tf = create_dataset(validation_ids, validation_dataset['Intensity Score'], validation_masks)
test_dataset_tf = create_dataset(test_ids, test_dataset['Intensity Score'], test_masks)

In [0]:
# Keras treats every element of a tf.data.Dataset as one batch, and the
# datasets built by create_dataset yield single examples, so batch them here.
# NOTE(review): 16 is a reviewer-chosen default, not a tuned value.
BATCH_SIZE = 16

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.MeanSquaredError()
metric = tf.keras.metrics.MeanSquaredError()
# Use the `metric` object defined on the previous line.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# steps_per_epoch / validation_steps are left unset: both datasets are finite,
# so each epoch is one full pass over the training data followed by one full
# pass over the validation data, whatever the split sizes are.
distilbert_history = model.fit(
  train_dataset_tf.batch(BATCH_SIZE),
  epochs=5,
  validation_data=validation_dataset_tf.batch(BATCH_SIZE)
)


In [0]:
# evaluate() consumes one dataset element per step and treats it as a batch,
# so the single-example test dataset is batched first.
# NOTE(review): the batch size only affects evaluation speed/memory; 64 is a
# reviewer-chosen default.
EVAL_BATCH_SIZE = 64
output = model.evaluate(test_dataset_tf.batch(EVAL_BATCH_SIZE))

# Create the target folder if needed; exist_ok keeps the cell re-runnable.
os.makedirs('drive/My Drive/saved_models/', exist_ok=True)
model.save('drive/My Drive/saved_models/valence_regression')

tf.Tensor(0.141, shape=(), dtype=float32)
tf.Tensor(0.317, shape=(), dtype=float32)
