In [0]:
# Ensure spaCy is pinned to v2.0.11 (especially important for the Italian vectors!)
!pip install --upgrade spacy==2.0.11

In [0]:
import spacy

In [0]:
# Run this and next cell for EN vectors
!python -m spacy download en_core_web_lg

In [0]:
# EN
import spacy.cli

# Fetch the large English model from within the kernel, then load it as the
# active pipeline bound to `nlp`.
en_model_name = 'en_core_web_lg'
spacy.cli.download(en_model_name)
nlp = spacy.load(en_model_name)

In [0]:
# Run this and next cell for IT vectors (might require restarting the env after running this cell)
!pip3 install https://github.com/MartinoMensio/it_vectors_wiki_spacy/releases/download/v1.0/it_vectors_wiki_lg-1.0.0.tar.gz

In [0]:
# IT
# Load the Italian wiki-vector package (pip-installed in the previous cell) as
# the active pipeline.
# NOTE: this rebinds `nlp`; if the EN cell was also run, the Italian pipeline
# replaces the English one for every later cell.
import it_vectors_wiki_lg
nlp = it_vectors_wiki_lg.load()

In [0]:
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm import tqdm_notebook

In [0]:
# Update this with the correct training set:
#   full training set:  './aw_slu_train.tsv'
#   small training set: './aw_slu_small_train.tsv'
# Only one file is read. Previously the full set was loaded and then
# immediately overwritten by the small one, so the small set is kept as the
# effective default.
TRAIN_PATH = './aw_slu_small_train.tsv'

# Tab-separated file without a header row: first column is the sentence,
# second column is its intent label. Rows missing either value are dropped.
df = pd.read_csv(TRAIN_PATH, delimiter='\t', header=None, names=['sentence', 'intent']).dropna(how='any')

In [0]:
# Distinct intent labels in sorted order; a label's position in this list is
# used as its integer class id when the targets are encoded.
unique_labels = sorted(set(df['intent']))
print(unique_labels)

In [0]:
# Lower-case every training sentence and encode each intent as its index in
# `unique_labels`.
x_train = [text.lower() for text in df['sentence']]
y_train = list(map(unique_labels.index, df['intent']))
print(f'len x_train, y_train: {len(x_train)}, {len(y_train)}')

In [0]:
X_train = list()
for sentence in tqdm_notebook(x_train):
  toks = nlp(sentence)
  X_train.append([tok.vector for tok in toks])

X_train = pad_sequences(X_train, maxlen=53, dtype='int32', padding='post', truncating='post')

nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))
print(f'X_train shape: {X_train.shape}')

In [0]:
df_test = pd.read_csv('./aw_slu_test.tsv', delimiter='\t', header=None, names=['sentence', 'intent']).dropna(how='any')
x_test = [sent.lower() for sent in df_test.sentence]
y_test = [unique_labels.index(intent) for intent in df_test.intent]
print(f'len x_train, y_train: {len(x_train)}, {len(y_train)}')

X_test = list()
for sentence in tqdm_notebook(x_test):
  toks = nlp(sentence)
  X_test.append([tok.vector for tok in toks])

X_test = pad_sequences(X_test, maxlen=53, dtype='int32', padding='post', truncating='post')

nsamples, nx, ny = X_test.shape
X_test = X_test.reshape((nsamples,nx*ny))
print(f'X_test shape: {X_test.shape}')

Use grid search to find the best estimators

In [0]:
# Tune SVM: 5-fold grid search over C for a linear-kernel SVC, scored by
# micro-averaged F1 and parallelised across all available cores.
svm_tuned_parameters = [{'kernel': ['linear'], 'C': [1.0, 10.0, 100.0]}]

svm = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_tuned_parameters,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=1000,
)
svm.fit(X_train, y_train)

In [0]:
# Show the configuration of the SVC that the grid search selected.
print(svm.best_estimator_)

In [0]:
# Score the selected SVC on the held-out test features.
svm_best = svm.best_estimator_
pred = svm_best.predict(X_test)

svm_micro_f1 = f1_score(y_test, pred, average='micro')
svm_macro_f1 = f1_score(y_test, pred, average='macro')
print('SVM micro-f1', svm_micro_f1)
print('SVM macro-f1', svm_macro_f1)

In [0]:
# Tune KNN: 5-fold grid search over the neighbour count, scored by
# micro-averaged F1 and parallelised across all available cores.
knn_tuned_parameters = [{'n_neighbors': [3, 4, 5]}]

knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_tuned_parameters,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=1000,
)
knn.fit(X_train, y_train)

In [0]:
# Show the configuration of the KNeighborsClassifier that the grid search selected.
print(knn.best_estimator_)

In [0]:
# Score the selected KNN classifier on the held-out test features.
knn_best = knn.best_estimator_
pred = knn_best.predict(X_test)

knn_micro_f1 = f1_score(y_test, pred, average='micro')
knn_macro_f1 = f1_score(y_test, pred, average='macro')
print('knn micro-f1:', knn_micro_f1)
print('knn macro-f1:', knn_macro_f1)

In [0]:
# Tune LR: 5-fold grid search over the inverse regularisation strength C of a
# logistic regression, scored by micro-averaged F1 across all available cores.
lr_tuned_parameters = [{'C': [1.0, 10.0, 100.0]}]

lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=lr_tuned_parameters,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=1000,
)
lr.fit(X_train, y_train)

In [0]:
# Show the configuration of the LogisticRegression that the grid search selected.
print(lr.best_estimator_)

In [0]:
# Score the selected logistic regression on the held-out test features.
lr_best = lr.best_estimator_
pred = lr_best.predict(X_test)

lr_micro_f1 = f1_score(y_test, pred, average='micro')
lr_macro_f1 = f1_score(y_test, pred, average='macro')
print('LR micro-f1', lr_micro_f1)
print('LR macro-f1', lr_macro_f1)