In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 3.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 51.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 39.6MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K    

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import re
import os
from google.colab import drive
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from transformers import (TFDistilBertForSequenceClassification, 
                          TFDistilBertModel,
                          DistilBertTokenizerFast)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm

# Make Google Drive visible to the runtime before any file is read.
drive.mount('/content/drive/')

# Load the three tab-separated valence-regression splits in one pass; the
# generator yields frames in (train, dev, test) order for tuple unpacking.
train_dataset, validation_dataset, test_dataset = (
    pd.read_csv(split_path, delimiter='\t')
    for split_path in (
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-train.txt',
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-dev.txt',
        'drive/My Drive/dataset/semeval-english/V-reg/2018-Valence-reg-En-test-gold.txt',
    )
)

def preprocess(df):
  """Return a cleaned three-column copy of a dataset frame.

  The input frame is left untouched, so calling this twice on the same raw
  frame gives the same result instead of failing on an already-numeric 'ID'.

  Args:
    df: DataFrame with string columns 'ID' and 'Tweet' and a column
      'Intensity Score'. Each 'ID' must contain at least three '-'-separated
      fields, the third of which is numeric.

  Returns:
    A new DataFrame with columns ['ID', 'Tweet', 'Intensity Score'] where
    'Tweet' has @-mentions removed and is lower-cased, and 'ID' holds the
    numeric value of the third '-'-separated field.
  """
  # Underscore is included so handles such as "@user_name" are removed whole
  # rather than leaving a "_name" fragment behind.
  tweets = df['Tweet'].str.replace(r"@[A-Za-z0-9_]+", "", regex=True).str.lower()
  ids = pd.to_numeric(df['ID'].str.split('-').str[2])
  # assign() builds a new frame, so the caller's data is never mutated.
  return df.assign(ID=ids, Tweet=tweets)[['ID', 'Tweet', 'Intensity Score']]

# Clean every split with the same routine; map() preserves the
# (train, validation, test) order for the unpacking on the left.
train_dataset, validation_dataset, test_dataset = map(
    preprocess,
    (train_dataset, validation_dataset, test_dataset),
)

def create_tf_example_continous_label(features,label):
  """Serialise one (id, tweet, score) record as a tf.train.Example.

  Args:
    features: indexable whose element 0 is stored as the int64 'idx' feature
      and whose element 1 is a str stored UTF-8 encoded as the 'tweet' feature.
    label: value stored as the float 'label' feature.

  Returns:
    The serialised Example as returned by SerializeToString().
  """
  idx_feature = tf.train.Feature(
      int64_list=tf.train.Int64List(value=[features[0]]))
  tweet_bytes = features[1].encode('utf-8')
  tweet_feature = tf.train.Feature(
      bytes_list=tf.train.BytesList(value=[tweet_bytes]))
  label_feature = tf.train.Feature(
      float_list=tf.train.FloatList(value=[label]))

  feature_map = {
      'idx': idx_feature,
      'tweet': tweet_feature,
      'label': label_feature,
  }
  example = tf.train.Example(features=tf.train.Features(feature=feature_map))
  return example.SerializeToString()

def convert_df_to_tfrecord(df, file_name):
  """Write every row of `df` to a TFRecord file.

  Args:
    df: DataFrame whose last column is the label and whose first two columns
      are the record id and the tweet text, in that order.
    file_name: destination path of the TFRecord file.
  """
  # The context manager closes the writer even if serialising a row raises,
  # so a failure does not leave an open file handle behind.
  with TFRecordWriter(file_name) as writer:
    for _, row in df.iterrows():
      # .iloc makes the positional split explicit; tolist() hands the
      # serialiser a plain list so that features[0] / features[1] are
      # positional lookups rather than label lookups on a Series.
      features = row.iloc[:-1].tolist()
      label = row.iloc[-1]
      writer.write(create_tf_example_continous_label(features, label))

# Create the output directory when it is missing (and do nothing when it is
# already there) so this cell works both on a fresh Drive and on re-runs.
os.makedirs('drive/My Drive/dataset/records', exist_ok=True)
convert_df_to_tfrecord(train_dataset, "drive/My Drive/dataset/records/valence_train.tfrecord")
convert_df_to_tfrecord(validation_dataset, "drive/My Drive/dataset/records/valence_validate.tfrecord")
convert_df_to_tfrecord(test_dataset, "drive/My Drive/dataset/records/valence_test.tfrecord")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# The model is trained below with a mean-squared-error loss against a single
# continuous score per tweet, so the head must emit one value per example.
# Without num_labels=1 the classification head defaults to two logits.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
  """Encode each sentence into a list of token ids.

  Args:
    sentences: iterable of sentences to encode; wrapped in a tqdm progress bar.
    tokenizer: object exposing encode(text, add_special_tokens=..., max_length=...).
    max_seq_len: value forwarded to the tokenizer as `max_length`.

  Returns:
    A list with one encoded sequence per input sentence, in input order.
  """
  return [
      tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_len)
      for text in tqdm(sentences)
  ]

def create_attention_masks(sentences):
  """Build 0/1 masks marking which token ids are greater than zero.

  Args:
    sentences: iterable of token-id sequences.

  Returns:
    np.ndarray built from one mask row per sequence, holding 1 where the
    token id is > 0 and 0 elsewhere.
  """
  mask_rows = [
      [int(token_id > 0) for token_id in token_ids]
      for token_ids in sentences
  ]
  return np.asarray(mask_rows)

In [31]:
# Final sequence length fed to the model. The tokenizer is asked to truncate
# to this length itself: encoding to a longer length and then cutting with
# pad_sequences(truncating="post") would chop the trailing special token off
# every tweet longer than MAX_SEQ_LEN. pad_sequences is kept only for padding.
MAX_SEQ_LEN = 30

train_ids = tokenize_sentences(train_dataset['Tweet'], tokenizer, max_seq_len=MAX_SEQ_LEN)
train_ids = pad_sequences(train_ids, maxlen=MAX_SEQ_LEN, dtype="long", value=0, truncating="post", padding="post")
train_masks = create_attention_masks(train_ids)

validation_ids = tokenize_sentences(validation_dataset['Tweet'], tokenizer, max_seq_len=MAX_SEQ_LEN)
validation_ids = pad_sequences(validation_ids, maxlen=MAX_SEQ_LEN, dtype="long", value=0, truncating="post", padding="post")
validation_masks = create_attention_masks(validation_ids)

test_ids = tokenize_sentences(test_dataset['Tweet'], tokenizer, max_seq_len=MAX_SEQ_LEN)
test_ids = pad_sequences(test_ids, maxlen=MAX_SEQ_LEN, dtype="long", value=0, truncating="post", padding="post")
test_masks = create_attention_masks(test_ids)

def create_dataset(ids, masks, labels):
  """Wrap aligned ids, masks and labels in an unbatched tf.data.Dataset.

  Args:
    ids: sequence of token-id rows.
    masks: sequence of attention-mask rows, aligned with `ids`.
    labels: sequence of scalar targets, aligned with `ids`. A pandas Series
      is accepted whatever its index looks like.

  Returns:
    A tf.data.Dataset whose elements are
    ({"input_ids": int32 vector, "attention_mask": int32 vector}, float32 scalar).

  Raises:
    ValueError: if the three inputs do not have the same length.
  """
  # Convert once to a positional array: indexing a pandas Series with
  # labels[i] is a *label* lookup and breaks as soon as the index is not
  # exactly 0..n-1 (for example after filtering rows).
  label_values = np.asarray(labels, dtype=np.float32)
  if not (len(ids) == len(masks) == len(label_values)):
    raise ValueError(
        "ids, masks and labels must have the same length, got "
        "%d, %d and %d" % (len(ids), len(masks), len(label_values)))

  def gen():
    for token_ids, mask, target in zip(ids, masks, label_values):
      yield ({
          "input_ids": token_ids,
          "attention_mask": mask
          },
          target)
  return tf.data.Dataset.from_generator(gen,
                                        ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.float32),
                                        ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
                                         tf.TensorShape([])))

# One tf.data pipeline per split, each pairing the padded ids and masks with
# that split's 'Intensity Score' column as the regression target.
train_dataset_tf = create_dataset(
    ids=train_ids,
    masks=train_masks,
    labels=train_dataset['Intensity Score'],
)
validation_dataset_tf = create_dataset(
    ids=validation_ids,
    masks=validation_masks,
    labels=validation_dataset['Intensity Score'],
)
test_dataset_tf = create_dataset(
    ids=test_ids,
    masks=test_masks,
    labels=test_dataset['Intensity Score'],
)

HBox(children=(IntProgress(value=0, max=1181), HTML(value='')))




HBox(children=(IntProgress(value=0, max=449), HTML(value='')))




HBox(children=(IntProgress(value=0, max=937), HTML(value='')))




In [36]:
# Keras expects a tf.data.Dataset passed to fit() to yield *batches*. The
# datasets built above yield one example per element, so they are batched
# here (and the training split shuffled) before training. Both values are
# tunable.
BATCH_SIZE = 16
SHUFFLE_BUFFER = 2048  # larger than the training split, i.e. a full shuffle

train_batches = train_dataset_tf.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
validation_batches = validation_dataset_tf.batch(BATCH_SIZE)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.MeanSquaredError()
metric = tf.keras.metrics.MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# No steps_per_epoch / validation_steps: with finite batched datasets Keras
# runs through each split exactly once per epoch, so no example is skipped.
distilbert_history = model.fit(
 train_batches,
 epochs=5,
 validation_data=validation_batches
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError once the directory has been created by an earlier run.
os.makedirs('drive/My Drive/saved_models/', exist_ok=True)
model.save('drive/My Drive/saved_models/valence_regression')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: drive/My Drive/saved_models/valence_regression/assets
