<a href="https://colab.research.google.com/github/zwycl/flink-twitter-senti/blob/master/tf_models/valence_regression_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

In [0]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from tqdm.notebook import tqdm
from transformers import (TFDistilBertForSequenceClassification,
                          TFDistilBertModel,
                          DistilBertTokenizerFast)

# Mount Google Drive so the dataset files below can be read from this runtime.
drive.mount('/content/drive/')

# Each split is a tab-separated text file; load every one into its own DataFrame.
train_dataset = pd.read_csv(
    'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-train.txt',
    sep='\t',
)
validation_dataset = pd.read_csv(
    'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-dev.txt',
    sep='\t',
)
test_dataset = pd.read_csv(
    'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-test-gold.txt',
    sep='\t',
)

def preprocess(df):
    """Return a cleaned copy of ``df`` restricted to ID, Tweet and Intensity Score.

    The caller's DataFrame is left untouched: all cleaning happens on a copy.

    Args:
        df: DataFrame with at least the columns ``'ID'``, ``'Tweet'`` and
            ``'Intensity Score'``. ``'ID'`` values must be strings with at
            least three ``'-'``-separated fields; ``'Tweet'`` values must be
            strings.

    Returns:
        A new DataFrame with columns ``['ID', 'Tweet', 'Intensity Score']``
        where ``'Tweet'`` has ``@mentions`` removed and is lower-cased, and
        ``'ID'`` is the numeric value of the third ``'-'``-separated field.
    """
    # Copy first so the column assignments below never write into the
    # caller's frame (or into a view of it).
    cleaned = df[['ID', 'Tweet', 'Intensity Score']].copy()

    # NOTE(review): the character class has no underscore, so a handle such as
    # "@some_user" is only stripped up to the underscore. Kept as-is so the
    # cleaned text stays identical to what was produced before; confirm intent.
    cleaned['Tweet'] = cleaned['Tweet'].apply(
        lambda text: re.sub(r"@[A-Za-z0-9]+", "", text).lower()
    )

    # Keep only the third '-'-separated field of the ID and parse it as a number.
    cleaned['ID'] = pd.to_numeric(cleaned['ID'].apply(lambda raw: raw.split('-')[2]))
    return cleaned

# Clean every split with the same routine and rebind each name to its cleaned frame.
train_dataset, validation_dataset, test_dataset = [
    preprocess(split)
    for split in (train_dataset, validation_dataset, test_dataset)
]

def create_tf_example_continous_label(features, label):
    """Serialize one record as a ``tf.train.Example`` with a float label.

    Args:
        features: Indexable whose item 0 is stored as the int64 ``'idx'``
            feature and whose item 1 (a ``str``) is UTF-8 encoded and stored
            as the bytes ``'tweet'`` feature.
        label: Value stored as the single-element float ``'label'`` feature.

    Returns:
        The Example serialized with ``SerializeToString()``.
    """
    idx_values = tf.train.Int64List(value=[features[0]])
    tweet_values = tf.train.BytesList(value=[features[1].encode('utf-8')])
    label_values = tf.train.FloatList(value=[label])

    feature_map = {
        'idx': tf.train.Feature(int64_list=idx_values),
        'tweet': tf.train.Feature(bytes_list=tweet_values),
        'label': tf.train.Feature(float_list=label_values),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()

def convert_df_to_tfrecord(df, file_name):
    """Write every row of ``df`` to a TFRecord file at ``file_name``.

    For each row, the last column is used as the label and the preceding
    columns (in order) as the features handed to
    ``create_tf_example_continous_label``.

    Args:
        df: DataFrame whose last column is the label.
        file_name: Path of the TFRecord file to create.
    """
    # The context manager closes the writer even when serialization of a row
    # raises, so a failure cannot leave the file handle open.
    with TFRecordWriter(file_name) as writer:
        # Plain tuples give true positional access; indexing a label-indexed
        # row Series with integers relies on a deprecated pandas fallback.
        for *features, label in df.itertuples(index=False, name=None):
            writer.write(create_tf_example_continous_label(features, label))

# Create the output directory when it is missing; exist_ok keeps re-runs of
# this cell from failing once the directory is already there.
os.makedirs('drive/My Drive/dataset/records', exist_ok=True)

# Persist each cleaned split as its own TFRecord file.
convert_df_to_tfrecord(train_dataset, "drive/My Drive/dataset/records/valence_train.tfrecord")
convert_df_to_tfrecord(validation_dataset, "drive/My Drive/dataset/records/valence_validate.tfrecord")
convert_df_to_tfrecord(test_dataset, "drive/My Drive/dataset/records/valence_test.tfrecord")

In [0]:
# Open one TFRecord dataset per split.
tr_ds = tf.data.TFRecordDataset(
    "drive/My Drive/dataset/records/valence_train.tfrecord"
)
val_ds = tf.data.TFRecordDataset(
    "drive/My Drive/dataset/records/valence_validate.tfrecord"
)
test_ds = tf.data.TFRecordDataset(
    "drive/My Drive/dataset/records/valence_test.tfrecord"
)

# Parsing schema: each of the three fields is a fixed-length feature of shape [].
feature_spec = dict(
    idx=tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
    tweet=tf.io.FixedLenFeature(shape=[], dtype=tf.string),
    label=tf.io.FixedLenFeature(shape=[], dtype=tf.float32),
)
def parse_example(example_proto):
    """Parse one serialized ``tf.Example`` according to ``feature_spec``.

    Args:
        example_proto: A single serialized Example.

    Returns:
        Whatever ``tf.io.parse_single_example`` produces for ``feature_spec``
        (one entry per key: ``'idx'``, ``'tweet'``, ``'label'``).
    """
    parsed = tf.io.parse_single_example(
        serialized=example_proto,
        features=feature_spec,
    )
    return parsed

# Decode every serialized record into its 'idx' / 'tweet' / 'label' tensors.
tr_parse_ds = tr_ds.map(parse_example)
val_parse_ds = val_ds.map(parse_example)
test_parse_ds = test_ds.map(parse_example)

print(tr_parse_ds)

# The target is one continuous score trained with a mean-squared-error loss,
# so the head must emit a single value per example. Without num_labels=1 the
# pretrained config's default label count is used and the head has more than
# one output.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode each sentence into a list of token ids.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing a Hugging Face style ``encode`` method.
        max_seq_len: Upper bound on the number of ids per sentence,
            special tokens included.

    Returns:
        A list with one list of token ids per input sentence, in input order.
        The lists are not padded, so their lengths may differ.
    """
    # truncation=True makes the max_length cap explicit; passing max_length
    # alone only truncates through a deprecated fallback that logs a warning.
    return [
        tokenizer.encode(
            sentence,
            add_special_tokens=True,
            max_length=max_seq_len,
            truncation=True,
        )
        for sentence in tqdm(sentences)  # tqdm only adds a progress bar
    ]

def create_attention_masks(sentences):
    """Build 0/1 attention masks for sequences of token ids.

    A position gets 1 when its token id is greater than 0 and 0 otherwise,
    so id 0 is treated as padding.

    Args:
        sentences: Iterable of token-id sequences (e.g. a padded 2-D array or
            a list of equal-length lists).

    Returns:
        ``numpy.ndarray`` with the same layout as ``sentences`` holding the
        integer mask values.
    """
    masks = [
        [int(token_id > 0) for token_id in sentence]
        for sentence in sentences
    ]
    return np.asarray(masks)

# Token ids for the training tweets (one variable-length list per tweet).
train_ids = tokenize_sentences(train_dataset['Tweet'], tokenizer)

# Pad/truncate at the end of every sequence with id 0 so all rows share one length.
# The previous code passed the undefined name `input_ids` here and below.
# NOTE(review): maxlen=300 exceeds tokenize_sentences' default cap of 128, so
# every row carries at least 172 padding positions; confirm the intended length.
train_ids = pad_sequences(train_ids, maxlen=300, dtype="long", value=0, truncating="post", padding="post")

# Masks are derived from the padded ids: 1 for real tokens, 0 for padding.
train_masks = create_attention_masks(train_ids)

In [0]:
# Optimizer, loss and metric for fine-tuning with a mean-squared-error objective.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.MeanSquaredError()
metric = tf.keras.metrics.MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


def _build_keras_dataset(frame, batch_size, max_seq_len=128):
    """Turn a preprocessed split into batched ``(inputs, target)`` pairs.

    Args:
        frame: DataFrame with ``'Tweet'`` (text) and ``'Intensity Score'``
            (target) columns.
        batch_size: Number of examples per batch.
        max_seq_len: Every tweet is truncated/padded to exactly this many
            token ids.

    Returns:
        A finite ``tf.data.Dataset`` yielding
        ``({'input_ids', 'attention_mask'}, target)`` batches.
    """
    encoded = tokenizer(
        frame['Tweet'].tolist(),
        max_length=max_seq_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    targets = frame['Intensity Score'].to_numpy(dtype='float32')
    return tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(batch_size)


# The parsed TFRecord datasets (tr_parse_ds / val_parse_ds) yield unbatched
# dicts holding the raw tweet string and the label together, which Keras
# cannot train on: there are no token ids and no separate target. Build
# tokenized (inputs, target) batches from the preprocessed frames instead.
# NOTE(review): the batch sizes (32 train / 64 validation) are a choice made
# here, not something the earlier cells define; tune as needed.
# steps_per_epoch / validation_steps are left unset so each epoch covers the
# whole finite dataset; the former fixed counts (115 / 7) were not derived
# from these datasets' sizes.
distilbert_history = model.fit(
    _build_keras_dataset(train_dataset, batch_size=32),
    epochs=4,
    validation_data=_build_keras_dataset(validation_dataset, batch_size=64),
)
