# Text Classification Engine for Sensor Fusion

First, let us load and preprocess text transcriptions.

**Note**: Running this Python script requires the `nltk` resources to be set up beforehand. Please download the following dependencies by uncommenting the line `nltk.download()`.

1. `perluniprops`
2. `punkt`

In [18]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.externals import joblib
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize 
from sklearn.utils import shuffle
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import *
import matplotlib.pyplot as plt
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.regularizers import l2
# from tensorflow.keras.utils.vis_utils import plot_model
# from yellowbrick.classifier import ClassificationReport
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import load_model
import os.path
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.keras import BalancedBatchGenerator
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.utils import class_weight
import tensorflow as tf

Import the dataset and shuffle it.

In [19]:
# Read the "<sentence>, <operation>" pairs and randomise the row order.
# No random_state is passed to shuffle(), so the order differs between runs.
filepath = 'data/fyp_dataset.txt'
df = shuffle(
    pd.read_csv(
        filepath,
        sep=', ',
        engine='python',
        names=['sentence', 'operation'],
    )
)

# Raw utterances and their operation labels, taken after shuffling.
sentences = df['sentence'].values
y = df['operation'].values
print(df)

                                   sentence  operation
500       I want the details of this bottle          2
601             Can you describe this chair          2
104                    Point out the bottle          1
400        What are the features of the pen          2
290                Can you help find my pen          1
..                                      ...        ...
416  What are the properties of the monitor          2
219             Find where the keyboard are          1
162              Show me where the mouse is          1
285              I need to find my keyboard          1
621                        look at the book          3

[709 rows x 2 columns]


In [26]:
# English stop words removed from every utterance before classification.
# NOTE: despite the name this is a list; entries are all lowercase and are
# compared by exact match. "this" and "that" are not listed, so they survive
# filtering.
stopwords_set = [
    # personal, possessive and reflexive pronouns
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # relatives and demonstratives
    "which", "who", "whom", "these", "those",
    # forms of "be", "have" and "do"
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions and directions
    "of", "at", "by", "for", "with", "against", "into", "through", "during",
    "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and question words
    "again", "further", "then", "once", "here", "there", "when", "why", "how",
    # quantifiers and degree words
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # leftover fragments and miscellany
    "s", "t", "don", "should", "now",
]

Before removing the stopwords, we need to tokenize the sentences. Afterwards, we remove the stopwords.

In [27]:
def filter_stop_words(sentences):
    """Tokenize each sentence and drop every token found in ``stopwords_set``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        tokens produced by ``word_tokenize`` that are not stop words, in their
        original order.

    Note:
        Matching is exact and case-sensitive, so a capitalised token such as
        ``"I"`` is kept even though ``"i"`` is a stop word.
    """
    # Build the lookup once: hashing beats scanning the list for every token.
    stop_words = frozenset(stopwords_set)
    return [
        [token for token in word_tokenize(sentence) if token not in stop_words]
        for sentence in sentences
    ]
        
def detokenize(filtered_sentences):
    """Join each token list back into a single string.

    Args:
        filtered_sentences: Iterable of token lists, e.g. the output of
            ``filter_stop_words``.

    Returns:
        A list of strings, one per token list, produced by
        ``TreebankWordDetokenizer.detokenize``.
    """
    # One detokenizer serves every sentence; no need to rebuild it per item.
    detokenizer = TreebankWordDetokenizer()
    return [detokenizer.detokenize(tokens) for tokens in filtered_sentences]

Let us detokenize the output.

In [28]:
# Tokenize every utterance and strip the stop words ...
filtered_sentences = filter_stop_words(sentences)
# ... then join the surviving tokens back into plain strings.
detokenized_sentences = detokenize(filtered_sentences)

['locate phones']

Now let us assign the detokenized sentences back to the `pandas` dataframe. 

In [50]:
# Attach the cleaned text as a new column. A plain list is assigned by position,
# so it lines up with the shuffled rows that `sentences` was read from.
df['filtered_sentence'] = detokenized_sentences
# Preview the first rows (only displayed when run as a notebook cell).
df.head()

Unnamed: 0,sentence,operation,filtered_sentence
435,Give an account of this hand,2,Give account this hand
590,Give me the specification of the chair,2,Give specification chair
112,Detect cup,1,Detect cup
405,What are the features of the monitor,2,What features monitor
79,What are the locations of cups,1,What locations cups


## Building the model.

Here, we will try out several models and compare their accuracy metrics in order to arrive at a final model. We will also use a grid-search methodology to obtain the best hyperparameters for the chosen model.

However, before this step, we need to understand the distribution of the dataset. For this, we will use `matplotlib` to plot the dataset with respect to the labels.

In [51]:
# Display names passed as target_names to the classification reports.
tags = ['1', '2', '3']

# Features: stop-word-filtered sentences. Targets: operation labels.
x = df['filtered_sentence'].values
y = df['operation']

# Hold out a quarter of the rows for evaluation; random_state pins the split
# for a given row order.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    x,
    y,
    random_state=1000,
    test_size=0.25,
)

In [52]:
# Baseline text classifier: token counts -> TF-IDF weights -> linear model
# trained with SGD on the hinge loss.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # tol=None switches off the convergence check, so training always runs
    # for max_iter epochs.
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print('Accuracy %s' % accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred, target_names=tags))

Accuracy 0.9438202247191011
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        69
           2       0.95      1.00      0.98        82
           3       1.00      0.63      0.77        27

    accuracy                           0.94       178
   macro avg       0.96      0.88      0.90       178
weighted avg       0.95      0.94      0.94       178



In [1]:
from pickle import load, dump

In [2]:
# Persist the fitted pipeline; the context manager closes the file even if
# pickling fails part-way through.
with open("model.pkl", "wb") as model_file:
    dump(sgd, model_file)

NameError: name 'sgd' is not defined

In [3]:
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
with open("model.pkl", "rb") as model_file:
    model = load(model_file)

In [4]:
# Re-run the evaluation with the pipeline restored from disk.
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print('Accuracy %s' % accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred, target_names=tags))

NameError: name 'X_test' is not defined

In [7]:
# (plural, singular, object_id) for every object name the parser recognises.
_OBJECT_TABLE = (
    ('laptops', 'laptop', 6),
    ('phones', 'phone', 4),
    ('books', 'book', 0),
    ('bottles', 'bottle', 3),
    ('pens', 'pen', 7),
    ('cups', 'cup', 9),
    ('keyboards', 'keyboard', 8),
    ('mouses', 'mouse', 5),
    ('monitors', 'monitor', 2),
)


def _command_template(object_id, multiple):
    """Return a fresh command dict for one object name."""
    return {
        'operation': None,
        'object_id': object_id,
        'multiple': multiple,
        'pointing': False,
    }


# Object name -> command template. Plural forms come first, then singular
# forms; a plural and its singular share the same object_id and differ only
# in 'multiple'. Every value is its own dict object.
name_dictionary = {
    plural: _command_template(object_id, True)
    for plural, _, object_id in _OBJECT_TABLE
}
name_dictionary.update(
    (singular, _command_template(object_id, False))
    for _, singular, object_id in _OBJECT_TABLE
)

# Human-readable operation names; position i holds the name for label i + 1.
labels = ['Locate', 'Describe', 'Invalid']

In [15]:
# Sample voice command used to exercise the classifier and the object parser.
text = "locate the phones"

In [16]:
# The pipeline expects an iterable of documents, so wrap the single command.
pred = model.predict([text])
# Operation labels start at 1 while `labels` is 0-indexed, hence the -1.
print(labels[pred[0]-1])

Locate


In [37]:
# Tokenize here so this cell does not rely on a variable that is only created
# further down the file.
tokens = filter_stop_words([text])

# Walk the tokens of the single command left to right.
_pointing = False
for token in tokens[0]:
    # All lookup keys are lowercase, so match case-insensitively.
    word = token.lower()
    if word in ("this", "that"):
        # A demonstrative flags every object named after it as pointed at.
        _pointing = True
    elif word in name_dictionary:
        # Copy the template so the shared lookup table is never mutated;
        # otherwise a 'pointing' flag would leak into later lookups.
        command = dict(name_dictionary[word])
        command["pointing"] = _pointing
        print(command)

{'operation': None, 'object_id': 4, 'multiple': True, 'pointing': False}


In [33]:
# Run the single command through the same stop-word filter used on the
# dataset; the result is a one-element list of token lists.
tokens = filter_stop_words([text])
# Re-join the surviving tokens into a cleaned command string.
filtered_commands = detokenize(tokens)