# Logistic Regression Classifier

Intent classification system based on the Logistic Regression model for restaurant information search.






## Required libraries

In [None]:
# Keras
from tensorflow.keras.preprocessing.text import Tokenizer

# NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# SKlearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Joblib
from joblib import dump, load

# NumPy
import numpy as np

# Standard
import pickle

## Paths

The datasets and dictionaries used in this notebook can be downloaded from this [link](https://drive.google.com/drive/folders/178Sv30P-OvoJc_QOOAYkmUHgcQsPg-rb?usp=sharing). After downloading, change the `dataset_path` variable to the folder where the downloaded files are saved.

**Notes:** 

* The downloaded data can be obtained using the *TextAnalysis_DSTC2.ipynb* notebook.
* Only UC3M users can access the download link.

In [None]:
# Folder prefix for the pickled datasets and dictionaries (*.obj) read by this notebook
dataset_path = "/content/drive/My Drive/_TFM/Notebooks/Datasets/"
# Folder prefix where the tuned logistic model is dumped to / loaded from
logit_path = "/content/drive/My Drive/_TFM/Notebooks/Models/logit/"

# Download some tools for the NLTK library
# (WordNet data for the lemmatizer, Punkt for word tokenization, the perceptron tagger for POS tags)
nltk.download("wordnet")
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Data reading

The DSTC 2 dataset provides dialogues of a human talking to an information system, labelled with slots and dialogue acts.

In our case, the labels will be the users' intents, which are combinations of acts and slots.

The data only has information about users' utterances.

**Load dataset**

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# .obj files that come from a trusted source.
def _load_pickle(file_name):
  """Return the object pickled in the file `dataset_path + file_name`.

  The file is opened in a `with` block so the handle is closed even when
  unpickling raises.
  """
  with open(dataset_path + file_name, "rb") as file_obj:
    return pickle.load(file_obj)

# Train split (utterances and their intents)
x_train = _load_pickle("x_train.obj")
y_train = _load_pickle("y_train.obj")

# Validation split
x_valid = _load_pickle("x_valid.obj")
y_valid = _load_pickle("y_valid.obj")

# Test split
x_test = _load_pickle("x_test.obj")
y_test = _load_pickle("y_test.obj")

**Check data**

In [None]:
# Sanity check: for every split print its expected vs. actual size and the
# first three (utterance, intents) pairs.
print("Data = (X: users' utterances, y: intents)")

split_reports = [
    ("\n  > Train data:", "Samples: 8148 = %s", x_train, y_train),
    ("\n  > Validation data:", "Samples: 5656 = %s", x_valid, y_valid),
    ("\n  > Test data:", "Samples: 5769 = %s", x_test, y_test),
]

for header, size_template, utterances, intents in split_reports:
  print(header)
  print(size_template % len(utterances))
  print(list(zip(utterances, intents))[:3])

Data = (X: users' utterances, y: intents)

  > Train data:
Samples: 8148 = 8148
[('cheap restaurant', ['inform_pricerange']), ('any', ['inform_this']), ('south', ['inform_area'])]

  > Validation data:
Samples: 5656 = 5656
[('sil', ['unknown']), ('north part of town serving gastropub food', ['inform_food', 'inform_area']), ('north part of town serving gastropub food', ['inform_food', 'inform_area'])]

  > Test data:
Samples: 5769 = 5769
[('uh yes im looking for a cheap restaurant in the west part of town', ['affirm', 'inform_pricerange', 'inform_area']), ('west', ['inform_area']), ('uh yes a cheap restaurant', ['affirm', 'inform_pricerange'])]


## Data preprocessing


Each text sample will go through the following pipeline:

*raw text* > **decontraction** > **normalization** > **lemmatization** > **lowercasing** > **tokenization** > *tokens*

And then, we will build the vocabulary including the UNK token.

**Settings**

In [None]:
# Merge all samples
# (train, validation and test utterances in that order; used below to fit the tokenizer)
x_data = np.concatenate([x_train, x_valid, x_test])

In [None]:
# Decontract function
# Lookup of contracted words, keyed by the lowercased word; the value is its replacement.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(dataset_path + "contracted_words.obj", "rb") as contracted_words_obj:
  contracted_words = pickle.load(contracted_words_obj)

def decontract(text):
  """Expand the contracted words of `text` using `contracted_words`.

  Every whitespace-delimited token whose lowercased form is a key of
  `contracted_words` is replaced by the mapped value. Replacement is done
  token by token, so a short contraction never rewrites the inside of a
  longer word (with `str.replace`, "im" would also alter "time").
  Whitespace between tokens is left untouched.

  Args:
    text: raw utterance (str).

  Returns:
    The utterance with its contracted tokens expanded.
  """
  import re  # local import: keeps this cell independent of the import cell

  def expand(match):
    token = match.group(0)
    key = token.lower()
    return contracted_words[key] if key in contracted_words else token

  # \S+ matches exactly the tokens that str.split() would produce
  return re.sub(r"\S+", expand, text)

# Normalize function
# Lookup of words to normalize, keyed by the lowercased word; the value is its replacement.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(dataset_path + "normalized_words.obj", "rb") as normalized_words_obj:
  normalized_words = pickle.load(normalized_words_obj)

def normalize(text):
  """Replace the words of `text` that have an entry in `normalized_words`.

  Every whitespace-delimited token whose lowercased form is a key of
  `normalized_words` is replaced by the mapped value. Replacement is done
  token by token, so a short key never rewrites the inside of a longer
  word (which `str.replace` on the whole text would do). Whitespace
  between tokens is left untouched.

  Args:
    text: utterance (str).

  Returns:
    The utterance with its tokens normalized.
  """
  import re  # local import: keeps this cell independent of the import cell

  def substitute(match):
    token = match.group(0)
    key = token.lower()
    return normalized_words[key] if key in normalized_words else token

  # \S+ matches exactly the tokens that str.split() would produce
  return re.sub(r"\S+", substitute, text)

In [None]:
# Lemmatize function

# Set the lemmatizer
# (single shared instance; passed explicitly to lemmatize() in the preprocessing cells)
lemmatizer = WordNetLemmatizer()

# Map a word to the WordNet POS constant expected by the lemmatizer
def get_pos_tag(word):
  """Return the WordNet POS constant for `word`.

  The word is tagged on its own with `nltk.pos_tag`, and the first letter of
  the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
  Any other tag falls back to noun.
  """
  initial = nltk.pos_tag([word])[0][1][0].upper()
  if initial == "J":
    return wordnet.ADJ
  if initial == "V":
    return wordnet.VERB
  if initial == "R":
    return wordnet.ADV
  # "N" and every unmapped tag resolve to noun
  return wordnet.NOUN

# Lemmatize every token of a text using its POS tag
def lemmatize(text, lemmatizer):
  """Return `text` with each token replaced by its lemma.

  The text is split with `nltk.word_tokenize`, each token is lemmatized with
  the POS returned by `get_pos_tag`, and the lemmas are joined with single
  spaces.

  Args:
    text: utterance to lemmatize (str).
    lemmatizer: object exposing `lemmatize(word, pos)`.
  """
  lemmas = []
  for token in nltk.word_tokenize(text):
    lemmas.append(lemmatizer.lemmatize(token, get_pos_tag(token)))
  return ' '.join(lemmas)

**Preprocessing**

In [None]:
# Run every sample through decontract -> normalize -> lemmatize, then let the
# Keras tokenizer lowercase/tokenize the result and build the word index.
# NOTE(review): x_data also contains the test split, so the vocabulary (and
# presumably the statistics behind the later TF-IDF matrices) is derived from
# test data too — confirm this is intended.
x_data_clean = []
for sample in x_data:
  x_data_clean.append(lemmatize(normalize(decontract(sample)), lemmatizer))

tokenizer = Tokenizer(lower=True, oov_token="unk")
tokenizer.fit_on_texts(x_data_clean)

# Vocabulary: word -> index mapping; the size adds one to the number of entries
x_vocab = tokenizer.word_index
x_vocab_size = 1 + len(x_vocab)

In [None]:
# Display the vocabulary size (number of word-index entries plus one)
x_vocab_size

502

## Featurization

### Tokens (TF-IDF)


In [None]:
# Train + validation utterances are featurized together; the grid search later
# separates them again by row index.
x_train_eval = np.concatenate([x_train, x_valid])
x_train_eval_clean = [
    lemmatize(normalize(decontract(utterance)), lemmatizer)
    for utterance in x_train_eval
]
x_train_eval_tfidf = tokenizer.texts_to_matrix(x_train_eval_clean, mode="tfidf")

# Test utterances go through the same preprocessing and the same fitted tokenizer
x_test_clean = [
    lemmatize(normalize(decontract(utterance)), lemmatizer)
    for utterance in x_test
]
x_test_tfidf = tokenizer.texts_to_matrix(x_test_clean, mode="tfidf")

In [None]:
# Shape of the TF-IDF matrix built from the train + validation utterances
np.shape(x_train_eval_tfidf)

(13804, 502)

In [None]:
# Shape of the TF-IDF matrix built from the test utterances
np.shape(x_test_tfidf)

(5769, 502)

### Labels (Multi-Label Binarizer)

On the other hand, the labels will be encoded as *n* binary elements in an array, where *n* is the total number of labels.

The binary vector (associated to each sample) indicates the presence of labels.

In [None]:
# Load the labels
# (used as the fixed class list of the binarizer and as report target names)
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(dataset_path + "labels.obj", "rb") as labels_obj:
  labels = pickle.load(labels_obj)

In [None]:
# Build the multi-label binarizer with a fixed class order taken from `labels`
mlb = MultiLabelBinarizer(classes=labels)

# y_train: train + validation label sets, in the same order as x_train_eval.
# Plain list concatenation is used because the per-sample label lists have
# different lengths, which np.concatenate cannot turn into a regular array.
y_train_eval = list(y_train) + list(y_valid)
y_train_eval_mlb = mlb.fit_transform(y_train_eval)

# y_test: only transform — the binarizer must not be re-fitted on test labels
y_test_mlb = mlb.transform(y_test)

## Logistic model

In a multi-label scenario, the Logistic Regression model must be wrapped in a `OneVsRestClassifier` to apply the One-vs-the-Rest (OvR) multi-label strategy.

### Hyper-parameter tuning



In [None]:
# Define the model
# One L1-penalized logistic regression per label (One-vs-Rest). The liblinear
# solver shuffles the data internally, so the seed is fixed to make the tuning
# and evaluation results reproducible across runs.
LOGIT_SEED = 42
logit_model = OneVsRestClassifier(
    LogisticRegression(penalty="l1", solver="liblinear", random_state=LOGIT_SEED),
    n_jobs=1)

# List the tunable parameter names (prefixed with `estimator__` for the wrapped model)
logit_model.get_params().keys()

dict_keys(['estimator__C', 'estimator__class_weight', 'estimator__dual', 'estimator__fit_intercept', 'estimator__intercept_scaling', 'estimator__l1_ratio', 'estimator__max_iter', 'estimator__multi_class', 'estimator__n_jobs', 'estimator__penalty', 'estimator__random_state', 'estimator__solver', 'estimator__tol', 'estimator__verbose', 'estimator__warm_start', 'estimator', 'n_jobs'])

In [None]:
# Row indices of x_train_eval: the first len(x_train) rows are the train
# samples, the remaining rows are the validation samples.
n_train_samples = len(x_train)
index_data = list(range(len(x_train_eval)))
index_train = index_data[:n_train_samples]
index_valid = index_data[n_train_samples:]

In [None]:
# Hyper-parameters to tune: candidate C values and iteration limits
C = [1e-03, 0.1, 1.0, 2.0, 2.5]
max_iter = [100, 120, 140]

# Grid keyed by the `estimator__`-prefixed names of the wrapped model's parameters
param_grid = {
    "estimator__C": C,
    "estimator__max_iter": max_iter,
}

In [None]:
# Grid search over param_grid with a single predefined split: each candidate
# is trained on the train rows and scored on the validation rows.
holdout_split = [(index_train, index_valid)]

logit_tune_model = GridSearchCV(
    estimator=logit_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=holdout_split,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Select the model with the best hyper-parameters
# (runs the grid search on the TF-IDF features and binarized labels of the
# combined train + validation data; the result is assigned so no repr is displayed)
logit_tune_model = logit_tune_model.fit(x_train_eval_tfidf, y_train_eval_mlb)

Fitting 1 folds for each of 15 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   13.3s finished
  str(classes[c]))


In [None]:
# Report the best validation score found by the grid search and the parameters that achieved it
print("--- Hyper-paremeters selected ---\n")
print("Best accuracy: %f using %s" % (logit_tune_model.best_score_, logit_tune_model.best_params_))

--- Hyper-paremeters selected ---

Best accuracy: 0.963048 using {'estimator__C': 2.0, 'estimator__max_iter': 100}


**Save the model**

In [None]:
# Persist the whole fitted grid-search object (not only the best estimator) under logit_path
dump(logit_tune_model, logit_path + "logit_model.joblib")

['/content/drive/My Drive/_TFM/Notebooks/Models/logit/logit_model.joblib']

**Load the model**

In [None]:
# Reload the saved grid-search object, replacing the in-memory one
# NOTE(review): joblib files are pickle-based — only load files from a trusted source
logit_tune_model = load(logit_path + "logit_model.joblib") 

### Model Evaluation

In [None]:
# Prediction
# (binary label vectors predicted for the TF-IDF features of the test utterances)
y_test_mlb_pred = logit_tune_model.predict(x_test_tfidf)

**Accuracy**

In [None]:
# Accuracy of the predicted binary label vectors against the true ones.
# NOTE(review): for multi-label indicator input scikit-learn reports subset
# accuracy (a sample counts only if its whole label vector matches) — confirm.
metrics.accuracy_score(y_test_mlb, y_test_mlb_pred)

0.9580516553995493

**Precision, Recall & F1-Score**

In [None]:
# Per-label precision / recall / F1 on the test set, with rows named after `labels`.
# zero_division=0 makes undefined ratios show as 0 (presumably without a warning — confirm).
print(metrics.classification_report(y_test_mlb, y_test_mlb_pred, target_names=labels, zero_division=0))

                    precision    recall  f1-score   support

               ack       0.50      0.43      0.46         7
            affirm       0.98      0.99      0.99       326
               bye       1.00      1.00      1.00       619
      confirm_area       0.67      1.00      0.80         2
      confirm_food       0.76      0.76      0.76        25
confirm_pricerange       0.78      1.00      0.88         7
         deny_food       0.00      0.00      0.00         2
         deny_name       0.00      0.00      0.00         0
             hello       0.93      1.00      0.97        28
       inform_area       1.00      0.99      0.99       653
       inform_food       0.98      0.98      0.98      1439
       inform_name       0.75      0.20      0.32        15
 inform_pricerange       1.00      0.99      1.00       526
       inform_this       0.98      1.00      0.99       537
            negate       0.96      0.93      0.94        69
            repeat       1.00      0.53

### Predictions

We inspect some predictions.

In [None]:
# Convert the predicted binary vectors back into label names, one collection per test sample
y_test_pred = mlb.inverse_transform(y_test_mlb_pred)

In [None]:
# Pick one test sample and show its utterance next to the true and predicted
# labels, both as label names and as binary vectors.
index_sample = 0

sample_utterance = x_test[index_sample]
sample_true_labels = y_test[index_sample]
sample_pred_labels = list(y_test_pred[index_sample])
sample_true_vector = y_test_mlb[index_sample]
sample_pred_vector = y_test_mlb_pred[index_sample]

print("User's utterance: %s" % sample_utterance)

print("\nTrue label: %s" % sample_true_labels)
print("Predicted label: %s" % sample_pred_labels)

print("\nTrue binary label: %s" % sample_true_vector)
print("Predicted binary label: %s" % sample_pred_vector)

User's utterance: uh yes im looking for a cheap restaurant in the west part of town

True label: ['affirm', 'inform_pricerange', 'inform_area']
Predicted label: ['affirm', 'inform_area', 'inform_pricerange']

True binary label: [0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Predicted binary label: [0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
