In [1]:
# Use the NPS Chat Corpus dataset from nltk,
# which consists of posts from instant messaging sessions. These posts have all been labeled with one of 15 dialogue act types, such as "Statement", "Emotion", "ynQuestion", and "Continuer".
# The goal is to train an intent classifier that identifies questions.
# code based on https://stackoverflow.com/questions/49100615/nltk-detecting-whether-a-sentence-is-interogative-or-not/50583762
import nltk
import pandas as pd
# Make sure the 'nps_chat' corpus is present in the local nltk data directory
# (the recorded run reports it as already up to date, so nothing was fetched).
nltk.download('nps_chat')
# Load every post of the corpus; the loop below reads each post's 'class'
# attribute and its .text. The trailing commented-out slice is a leftover
# switch for capping the number of posts while experimenting.
posts = nltk.corpus.nps_chat.xml_posts()  # [:10000]

# encode labels as int
# `classes` maps integer label id -> dialogue-act name, in order of first
# appearance in the corpus; `df` gets one row per kept post with the columns
# 'text' (raw post text) and 'labels' (index into `classes`).
classes = []
records = []
for post in posts:
    c = post.get('class')
    # posts labeled 'System' are excluded from the dataset
    if c == 'System':
        continue
    if c not in classes:
        classes.append(c)
    records.append({'text': post.text, 'labels': classes.index(c)})

# Build the frame in one go: DataFrame.append copied the whole frame on every
# row (quadratic) and was removed in pandas 2.0.
df = pd.DataFrame(records, columns=['text', 'labels'])

# number of labeled posts collected in df
print(len(df))
# check classes: a label id in df['labels'] is a position in this list
n_classes = len(classes)
print("%d distinct classes:" % n_classes)
print(classes)

# prepare the dataset for training classifier:
# the first 10% of the rows are held out for evaluation, the rest is used for training
# NOTE(review): rows are split in corpus order without shuffling - confirm this is intended
size = int(0.1 * len(df))
test_set = df.iloc[:size]
train_set = df.iloc[size:]

[nltk_data] Downloading package nps_chat to /home/zola/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


7935
14 distinct classes:
['Statement', 'Emotion', 'Greet', 'Accept', 'Reject', 'whQuestion', 'Continuer', 'ynQuestion', 'yAnswer', 'Bye', 'Clarify', 'Emphasis', 'nAnswer', 'Other']


In [2]:
# sanity-check the training split: row count and column count
print(len(train_set))
print(len(train_set.columns))

# show the first training example and decode its integer label back to the class name
first_row = train_set.iloc[0]
print(first_row)
# Select the label by column name: integer keys on a string-indexed Series
# (first_row[1]) rely on a positional fallback that pandas has deprecated.
print(classes[first_row['labels']])

7142
2
text      sexy is just a bonus at this point in my life....
labels                                                    0
Name: 793, dtype: object
Statement


In [3]:
# fine-tune a pre-trained transformer for intent classification task
# using simple transformers library https://medium.com/swlh/simple-transformers-multi-class-text-classification-with-bert-roberta-xlnet-xlm-and-8b585000ce3a
from simpletransformers.classification import ClassificationModel


# Build the classifier from the pre-trained 'roberta-base' checkpoint,
# with one output label per entry in `classes`
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=len(classes),
)

I1228 01:34:27.127121 140553850861376 file_utils.py:35] PyTorch version 1.0.1.post2 available.
I1228 01:34:27.951721 140553850861376 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /home/zola/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.9dad9043216064080cf9dd3711c53c0f11fe2b09313eaa66931057b4bdcaf068
I1228 01:34:27.954186 140553850861376 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1

In [4]:
# Train the model
# Fine-tunes `model` on train_set (columns 'text' and 'labels'). In the recorded
# run this was a single epoch of 893 iterations, with the configuration and
# weights written under outputs/.
model.train_model(train_set)


from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` against `labels`.

    Passed to ``model.eval_model`` as an extra metric. With exactly one label
    per example, micro-averaged F1 coincides with plain accuracy, which is why
    the recorded 'f1' and 'acc' values are identical.
    """
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1
    
# Evaluate on the held-out test_set; the keyword names of the extra metrics
# ('f1', 'acc') show up as keys of `result` next to 'mcc' and 'eval_loss'
result, model_outputs, wrong_predictions = model.eval_model(
    test_set,
    f1=f1_multiclass,
    acc=accuracy_score,
)

Features loaded from cache at cache_dir/cached_train_roberta_128_14_7142
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=893, style=ProgressStyle(description_…

Running loss: 1.784051Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 2.131428Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 1.818022Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Running loss: 0.926366Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Running loss: 0.980025

I1228 01:38:34.068494 140553850861376 configuration_utils.py:87] Configuration saved in outputs/epoch-1/config.json


Running loss: 0.842740

I1228 01:38:34.393338 140553850861376 modeling_utils.py:258] Model weights saved in outputs/epoch-1/pytorch_model.bin
I1228 01:38:34.470797 140553850861376 configuration_utils.py:87] Configuration saved in outputs/config.json





I1228 01:38:35.601099 140553850861376 modeling_utils.py:258] Model weights saved in outputs/pytorch_model.bin


Training of roberta model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_roberta_128_14_793


HBox(children=(IntProgress(value=0), HTML(value='')))




In [6]:
# show the metrics dictionary returned by model.eval_model
print(result)
# reference values recorded for the roberta run:
# roberta {'mcc': 0.7508701664513905, 'f1': 0.8095838587641866, 'acc': 0.8095838587641866, 'eval_loss': 0.6986473923921586}

{'mcc': 0.7508701664513905, 'f1': 0.8095838587641866, 'acc': 0.8095838587641866, 'eval_loss': 0.6986473923921586}


In [11]:
# Spot-check the fine-tuned model on a few hand-written sentences and print
# the predicted dialogue-act name for each one, in input order
sample_texts = [
    'Some arbitary sentence',
    'where is he from?',
    'would you agree?',
    'is this true?',
    'yeap',
    'not',
]
predictions, raw_outputs = model.predict(sample_texts)
for label_id in predictions:
    class_name = classes[label_id]
    print(class_name)

Converting to features started.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Statement
whQuestion
ynQuestion
ynQuestion
Accept
nAnswer


In [None]:
# save trained model