Code based in part on the CNN described by Hashemi et al.,

```
Homa B Hashemi et al. "Query intent detection using convolutional neural
networks". In: International Conference on Web Search and Data Mining,
Workshop on Query Understanding. 2016.
```

and on other TextCNN implementations by Yoon Kim (https://github.com/yoonkim/CNN_sentence) and Shawn Ng (https://github.com/Shawn1993/cnn-text-classification-pytorch)


In [0]:
# Ensure spaCy is at v2.0.11 (especially important for the Italian vectors!). An environment restart is likely required afterwards.
!pip install --upgrade spacy==2.0.11

In [0]:
!python -m spacy validate

In [0]:
# Run this and next cell for IT vectors (might require restarting the env after running this cell)
!pip3 install https://github.com/MartinoMensio/it_vectors_wiki_spacy/releases/download/v1.0/it_vectors_wiki_lg-1.0.0.tar.gz

In [0]:
import spacy
import torch

import numpy as np
import pandas as pd
import torch.nn as nn

from keras.preprocessing.sequence import pad_sequences
from spacy.attrs import ID
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from time import time
from torch.nn.functional import relu, max_pool1d, log_softmax, cross_entropy  #, max_pool2d
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader, TensorDataset
from typing import Tuple, List, Dict
from tqdm import tqdm, trange
# from tqdm import tqdm_notebook as tqdm

In [0]:
# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [0]:
class CNN(nn.Module):
    """Convolutional sentence classifier in the TextCNN style.

    Token ids are embedded, convolved with one bank of filters per entry in
    ``filter_sizes`` (each filter spans the full embedding width), max-pooled
    over time, concatenated, passed through dropout and projected to
    ``num_classes`` logits.

    Args:
        embeddings: Optional 2-D float tensor of pretrained vectors. When
            given, the embedding layer is initialised from it (and stays
            trainable) and the embedding width is taken from this tensor,
            so ``num_embeddings`` and ``embedding_dim`` are ignored.
        num_embeddings: Vocabulary size used when ``embeddings`` is None; one
            extra row is allocated on top of this value.
        embedding_dim: Embedding width used when ``embeddings`` is None.
        num_classes: Number of output classes (size of the logit vector).
        num_filters: Number of convolution filters per filter size.
        filter_sizes: Heights (in tokens) of the convolution windows.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(
            self,
            embeddings=None,
            num_embeddings: int = 52,  # number of features
            embedding_dim: int = 300,  # dimensionality of embeddings
            num_classes: int = 7,  # number of output classes
            num_filters: int = 3,
            filter_sizes: Tuple[int, ...] = (2, 3, 4),
            dropout: float = 0.5):
        super().__init__()

        if embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        else:
            self.embedding = nn.Embedding(num_embeddings + 1, embedding_dim)

        # Always size the convolutions from the actual embedding layer so that
        # pretrained vectors of any width work, not only the default of 300.
        embedding_dim = self.embedding.embedding_dim

        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Return unnormalised class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len). ``seq_len`` must be at
                least ``max(filter_sizes)``, otherwise the convolution raises.

        Returns:
            Float tensor of shape (batch, num_classes).
        """
        x = self.embedding(x)  # (batch, seq_len, embedding_dim)

        # Add the single input channel expected by Conv2d.
        x = x.unsqueeze(1)  # (batch, 1, seq_len, embedding_dim)

        # Each kernel covers the whole embedding width, so the last dimension
        # collapses to 1 and can be dropped.
        x = [relu(conv(x)).squeeze(3) for conv in self.convs]

        # Max-over-time pooling: keep the strongest activation per filter.
        x = [max_pool1d(i, i.size(2)) for i in x]
        x = torch.cat(x, 1).squeeze(2)  # (batch, len(filter_sizes) * num_filters)

        x = self.dropout(x)

        return self.fc(x)  # logit

In [0]:
from tqdm import tqdm_notebook as tqdm
def train(model: CNN, train_data: DataLoader, validation_data: DataLoader,
          num_epochs=100, lr=2e-5):
    """Train ``model`` with Adam and report validation accuracy every epoch.

    Args:
        model: Network to optimise; it is moved to the global ``device``.
        train_data: Loader yielding batches whose first element is the input
            id tensor and whose second element is the label tensor.
        validation_data: Loader passed to ``evaluate`` after each epoch.
        num_epochs: Number of full passes over ``train_data``.
        lr: Learning rate handed to Adam. The default matches the value that
            was previously hard-coded, so callers that omit it train exactly
            as before.

    Returns:
        None. Progress is printed once per epoch; the reported loss is the
        running mean over all optimisation steps taken so far.
    """
    model.to(device)

    optimizer = Adam(model.parameters(), lr=lr)

    # Tracking variables (accumulated across epochs, not reset per epoch).
    tr_loss = 0.0
    tr_steps = 0

    for i in range(num_epochs):
        model.train()
        epoch_iterator = tqdm(train_data, desc='Iteration')
        for batch in epoch_iterator:
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch[0], batch[1]

            # The optimizer owns every model parameter, so clearing its
            # gradients is sufficient before each backward pass.
            optimizer.zero_grad()
            logits = model(input_ids)

            loss = cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            tr_steps += 1

        # Validation
        validation_accuracy = evaluate(model, validation_data)
        # Guard against an empty training loader (no steps taken yet).
        mean_loss = tr_loss / tr_steps if tr_steps else float('nan')
        print('epoch {} (steps: {}) --- loss: {:.5f} --- '
              'acc: {:.5f}'.format(
            i, tr_steps, mean_loss, validation_accuracy))
        
        # if validation_accuracy < last_validation_accuracy or min_delta > validation_accuracy - last_validation_accuracy:
        #   if i % 10 != 0:
        #     print('epoch {} (steps: {}) --- loss: {:.5f} --- '
        #           'acc: {:.5f}'.format(
        #         i, tr_steps, tr_loss / tr_steps, validation_accuracy))
        #   return

In [0]:
def score(y_true, y_pred, scoring='average'):
    """Compare gold and predicted labels under the requested metric.

    Args:
        y_true: Gold label ids (array-like).
        y_pred: Predicted label ids (array-like, same length as ``y_true``).
        scoring: ``'f1-micro'`` or ``'f1-macro'`` select the corresponding
            scikit-learn F1 score; any other value (including the default
            ``'average'``) yields plain accuracy.

    Returns:
        The metric value as a float.
    """
    if scoring == 'f1-micro':
        return f1_score(y_true, y_pred, average='micro')

    if scoring == 'f1-macro':
        # Only the classes that were actually predicted are passed as
        # `labels`, so never-predicted classes do not enter the macro average.
        return f1_score(y_true, y_pred, labels=np.unique(y_pred), average='macro')

    # Coerce to arrays so that plain Python lists are compared element-wise
    # instead of collapsing to a single bool.
    return (np.asarray(y_true) == np.asarray(y_pred)).mean()

In [0]:
def evaluate(model, validation_data, scoring='average', predict=False):
    """Run ``model`` over ``validation_data`` and score its predictions.

    The metric is computed once over the predictions of the whole data set
    rather than averaged batch by batch, so the result does not depend on the
    loader's batch size (a short final batch is no longer over-weighted, and
    F1 is no longer computed on tiny per-batch samples).

    Args:
        model: Network to evaluate; it is switched to eval mode.
        validation_data: Iterable of batches whose first element is the input
            id tensor and whose second element is the label tensor.
        scoring: Metric name forwarded to ``score``.
        predict: When true, also return the gold and predicted label lists.

    Returns:
        The metric value, or ``(metric, y_true, y_pred)`` when ``predict`` is
        true. If ``validation_data`` yields no batches the metric is 0.0 and
        both lists are empty.
    """
    label_chunks = []
    pred_chunks = []

    model.eval()
    with torch.no_grad():
        for batch in validation_data:
            batch = tuple(t.to(device) for t in batch)
            logits = model(batch[0])

            # The predicted class is the index of the largest logit per row.
            pred_chunks.append(np.argmax(logits.cpu().numpy(), axis=1))
            label_chunks.append(batch[1].cpu().numpy())

    if label_chunks:
        all_true = np.concatenate(label_chunks)
        all_pred = np.concatenate(pred_chunks)
        result = score(y_true=all_true, y_pred=all_pred, scoring=scoring)
        y_true, y_pred = list(all_true), list(all_pred)
    else:
        # Nothing to evaluate: avoid dividing by zero / scoring empty arrays.
        result = 0.0
        y_true, y_pred = [], []

    if predict:
        return result, y_true, y_pred

    return result

In [0]:
# Length that every token-id sequence is padded or truncated to (passed as
# `maxlen` to pad_sequences further down).
MAX_SEQUENCE_LENGTH = 53

# Uncomment one TRAIN_DATA_PATH and one TEST_DATA_PATH, along with the appropriate SPACY_MODEL
# NOTE(review): every assignment below is commented out as shipped, so the
# code that reads these names will presumably fail with NameError until one
# option of each kind is enabled.

# TRAIN_DATA_PATH = 'snips_train.tsv'
# TRAIN_DATA_PATH = 'snips_small_train.tsv'
# TEST_DATA_PATH = 'snips_validate.tsv'
# TRAIN_DATA_PATH = 'atis_train.tsv'
# TRAIN_DATA_PATH = 'atis_small_train.tsv'
# TEST_DATA_PATH = 'atis_dev.tsv'
# TRAIN_DATA_PATH = 'aw_slu_train.tsv'
# TRAIN_DATA_PATH = 'aw_slu_small_train.tsv'
# TEST_DATA_PATH = 'aw_slu_test.tsv'

# SPACY_MODEL = 'en_vectors_web_lg'
# SPACY_MODEL = 'it_vectors_wiki_lg'

# Load the spaCy pipeline that supplies the pretrained word vectors.
if 'it_vectors_wiki_lg' == SPACY_MODEL:
    # The Italian vectors are a standalone pip package, so they are imported
    # as a module rather than resolved through spacy.load().
    try:
        import it_vectors_wiki_lg
        nlp = it_vectors_wiki_lg.load()
    except Exception as err:
        # Keep the original failure as the cause so it can still be diagnosed.
        raise Exception('Italian GloVe vectors not available in this environment') from err
else:
    try:
        nlp = spacy.load(SPACY_MODEL)
    except OSError:
        # spaCy signals a model that is not installed with an OSError/IOError:
        # download it once and retry. Any other failure is left to propagate.
        spacy.cli.download(SPACY_MODEL)
        nlp = spacy.load(SPACY_MODEL)

# Read the training set: a header-less TSV with a sentence column and an
# intent column; rows with any missing value are dropped.
df = pd.read_csv(TRAIN_DATA_PATH, delimiter='\t', header=None,
                    names=['sentence', 'intent']).dropna(how='any')

# Sorted list of distinct intents; a label's position in this list is its
# integer class id.
unique_labels = sorted(list(set(df.intent)))
print('labels:', unique_labels)

# Lower-case every sentence and encode every intent as its class id.
train_x = [sentence.lower() for sentence in df.sentence]
train_y = [unique_labels.index(intent) for intent in df.intent]

print('length train_x, train_y:', len(train_x), len(train_y))

# Vector table of the loaded spaCy model; used below to initialise the CNN's
# embedding layer.
embedding_matrix = nlp.vocab.vectors.data

# Replace each sentence string, in place, with a tensor of per-token ID
# attribute values.
# NOTE(review): this presumably relies on the ID attribute being a valid row
# index into `embedding_matrix` for the pinned spaCy version -- confirm.
for idx, sentence in enumerate(train_x):
    toks = nlp(sentence)
    x = toks.to_array([ID])
    # Assigning to .dtype reinterprets the existing buffer as 'long' instead
    # of converting the values.
    x.dtype = 'long'
    train_x[idx] = torch.tensor(x)

# Pad / truncate every sequence at its end to MAX_SEQUENCE_LENGTH entries.
train_x = pad_sequences(train_x, maxlen=MAX_SEQUENCE_LENGTH, dtype='long',
                        padding='post', truncating='post')

# Hold out 10% of the shuffled data for validation. No random_state is given,
# so the split is presumably different on every run.
train_x, validate_x, train_y, validate_y = train_test_split(
    train_x, train_y, shuffle=True, test_size=0.1)

train_x = torch.tensor(train_x)
train_y = torch.tensor(train_y)
validate_x = torch.tensor(validate_x)
validate_y = torch.tensor(validate_y)

# Training batches: shuffled, 50 examples each, loaded by 2 worker processes.
train_dataset = TensorDataset(train_x, train_y)
train_dataloader = DataLoader(train_dataset, shuffle=True, num_workers=2, batch_size=50)

# The validation loader uses DataLoader's defaults (no batch_size or shuffle
# argument is passed).
validate_dataset = TensorDataset(validate_x, validate_y)
validate_dataloader = DataLoader(validate_dataset)

# Build the classifier on top of the pretrained vectors, with one output per
# distinct training intent, and cast its parameters to float.
cnn = CNN(embeddings=torch.tensor(embedding_matrix), num_classes=len(unique_labels))
cnn = cnn.float()

# Time the whole training run and print the elapsed seconds.
t1 = time()
train(cnn, train_dataloader, validate_dataloader, num_epochs=100)
t2 = time()

print(f'\n{t2 - t1}\n')

In [0]:
# Read the test set in the same header-less two-column TSV layout as the
# training set, dropping rows with any missing value.
test_df = pd.read_csv(TEST_DATA_PATH, delimiter='\t', header=None,
                    names=['sentence', 'intent']).dropna(how='any')
test_x = [sentence.lower() for sentence in test_df.sentence]
# Encode intents with the label list built from the training data;
# list.index raises ValueError for an intent that is not in `unique_labels`.
test_y = [unique_labels.index(intent) for intent in test_df.intent]

# Same token-id encoding as applied to the training sentences: per-token ID
# attribute values, reinterpreted in place as 'long'.
for idx, sentence in enumerate(test_x):
    toks = nlp(sentence)
    x = toks.to_array([ID])
    x.dtype = 'long'
    test_x[idx] = torch.tensor(x)

# Pad / truncate every sequence at its end to MAX_SEQUENCE_LENGTH entries.
test_x = pad_sequences(test_x, maxlen=MAX_SEQUENCE_LENGTH, dtype='long',
                        padding='post', truncating='post')

test_x = torch.tensor(test_x)
test_y = torch.tensor(test_y)

# No batch_size is passed, so DataLoader's default applies; batches are
# shuffled and loaded by 2 worker processes.
test_dataset = TensorDataset(test_x, test_y)
test_dataloader = DataLoader(test_dataset, shuffle=True, num_workers=2)

# Each call makes its own pass over the test loader; `gold` and `pred` end up
# holding the lists returned by the second (macro) call.
micro_f1, gold, pred = evaluate(cnn, test_dataloader, scoring='f1-micro', predict=True)
macro_f1, gold, pred = evaluate(cnn, test_dataloader, scoring='f1-macro', predict=True)

In [0]:
# Recompute F1 over the complete gold/pred lists returned by evaluate().
# Macro F1 is restricted to the classes that occur in `pred`.
# NOTE(review): neither result is assigned; in a notebook cell presumably only
# the value of the last expression is displayed -- confirm this is intended.
f1_score(gold, pred, labels=np.unique(pred), average='macro')
f1_score(gold, pred, average='micro')