# Multimodal Conversational AI

## User Intent Detection

In [1]:
import json
import os

import numpy as np
import torch
import transformers
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Directory holding the fine-tuned intent classifier and its label list.
model_finetuned = './twiz-intent-model'

# all_intents.json holds the written-out intent names. The list length sets the
# size of the classification head below, and a name's position in the list is
# used as its class id when predictions are decoded (all_intents[idx]).
with open(os.path.join(model_finetuned, 'all_intents.json'), 'r', encoding='utf-8') as all_intents_json:
    all_intents = json.load(all_intents_json)

# NOTE(review): the tokenizer is loaded from the base checkpoint, not from
# model_finetuned. It must match the tokenizer used during fine-tuning - confirm.
tokenizer_name = 'roberta-base' # try 'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) # loads a tokenizer

# Load the fine-tuned sequence-classification weights, one output label per intent.
model = AutoModelForSequenceClassification.from_pretrained(model_finetuned,
                                                           num_labels=len(all_intents))


In [2]:
# Show the loaded intent names; a name's position in this list is the class id
# it is looked up by when predictions are decoded (all_intents[idx]).
all_intents

['GetCuriositiesIntent',
 'GreetingIntent',
 'AMAZON.SelectIntent',
 'ShowStepsIntent',
 'IdentifyRestrictionsIntent',
 'ProvideUserNameIntent',
 'MoreOptionsIntent',
 'AMAZON.RepeatIntent',
 'AMAZON.HelpIntent',
 'QuestionIntent',
 'MoreDetailIntent',
 'AdjustServingsIntent',
 'GoToStepIntent',
 'SetTimerIntent',
 'OutOfScopeIntent',
 'AMAZON.FallbackIntent',
 'PreviousStepIntent',
 'TerminateCurrentTaskIntent',
 'ChitChatIntent',
 'CompleteTaskIntent',
 'NoneOfTheseIntent',
 'ShoppingIntent',
 'AMAZON.PauseIntent',
 'AMAZON.CancelIntent',
 'StartStepsIntent',
 'InappropriateIntent',
 'AMAZON.NoIntent',
 'SuggestionsIntent',
 'ResumeTaskIntent',
 'IngredientsConfirmationIntent',
 'NextStepIntent',
 'IdentifyProcessIntent',
 'NoRestrictionsIntent',
 'AMAZON.YesIntent',
 'SubstitutionIntent',
 'AMAZON.StopIntent']

In [3]:
utterance = "Can you find me a chicken recipe?"

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'IdentifyProcessIntent'

In [4]:
utterance = "Show me the suggestions"

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'SuggestionsIntent'

In [5]:
utterance = "Show me the second recipe."

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'AMAZON.SelectIntent'

In [6]:
utterance = "Go back."

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'PreviousStepIntent'

In [7]:
utterance = "What are the recipe ingredients?"

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'IngredientsConfirmationIntent'

In [8]:
utterance = "This looks great. Let's start the recipe."

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'StartStepsIntent'

In [9]:
utterance = "Next step."

# Call the tokenizer directly: encode_plus is the deprecated spelling of the same operation.
input_encoding = tokenizer(utterance, return_tensors='pt', add_special_tokens=True, max_length=512, truncation=True)

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**input_encoding)

# The highest-scoring logit gives the class id, which indexes the intent name list.
logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'NextStepIntent'

## Multimodal Conversations

In [12]:
import json

# Load the recipe collection. The encoding is made explicit so the read does not
# depend on the platform's default text encoding.
with open("recipes_data.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# The demo uses the recipe stored under key '0': its first image, display name
# and serving count feed the result cards.
first_recipe = data['0']
imgA = first_recipe['images'][0]['url']
titleA = first_recipe['displayName']
propA = "Serves " + str(first_recipe['servings'])


In [13]:
from IPython.display import Video, Image, HTML, display

def displayResults(titleA, imgA, propA, titleB, imgB, propB, titleC, imgC, propC):
    """Render three results side by side as HTML in the notebook output.

    Each result is drawn as a thumbnail image followed by a block holding its
    title and a property line.

    Args:
        titleA, titleB, titleC: Titles shown next to each thumbnail.
        imgA, imgB, imgC: Image URLs used as the thumbnails' ``src``.
        propA, propB, propC: Property text shown under each title.

    Returns:
        None. The markup is passed to ``display(HTML(...))``.
    """
    # Local import keeps this cell runnable without relying on another cell.
    from html import escape

    cards = []
    for title, img, prop in ((titleA, imgA, propA),
                             (titleB, imgB, propB),
                             (titleC, imgC, propC)):
        # Values are escaped so characters such as & or < in the data cannot
        # break the markup. Every card uses the same two-div structure, so the
        # opening and closing tags stay balanced.
        cards.append(f"""
        <div class="images" style="display:inline-block;">
            <img src="{escape(str(img), quote=True)}" class="img-responsive" width="80">
        </div>
        <div class="images" style="display:inline-block;">
                      {escape(str(title))} <br>
                      {escape(str(prop))} <br>
        </div>""")
    cards_html = "".join(cards)

    display(HTML(f"""
    <div class="row" style="margin-left:100px">
       <div class="col-xs-6">{cards_html}
       </div>
    </div>
    """))

def displayStep(text, img):
    """Render one step as HTML in the notebook output: a thumbnail, then the text.

    Args:
        text: Step text shown after the image.
        img: Image URL used as the thumbnail's ``src``.

    Returns:
        None. The markup is passed to ``display(HTML(...))``.
    """
    # Local import keeps this cell runnable without relying on another cell.
    from html import escape

    # Escape both values so characters such as & or < in the data cannot break
    # the markup.
    safe_img = escape(str(img), quote=True)
    safe_text = escape(str(text))
    display(HTML(f"""
    <div class="row" style="margin-left:100px">
        <img src="{safe_img}" class="img-responsive" width="80">
        {safe_text}<br>
    </div>
        """))


In [14]:
# Scripted five-turn demo dialogue. Each user reply is read into `val`, but
# nothing in this cell inspects it: the turns follow the same fixed path
# whatever is typed.

# Turn 1: greeting and first prompt.
print(" BOT: Hello, I am a TaskBot and I can help you with cooking tasks. Which recipe would you like to do?", end="\n\n")
val = input("USER:")

# Turn 2: result list. The same recipe fills all three slots.
print("", " BOT: Great! These are the results I found:", "", sep="\n")
displayResults(*(titleA, imgA, propA) * 3)
print("      Which recipe would you like to do? Or, would you like to search for something different?", end="\n\n")
val = input("USER:")

# Turn 3: show the first instruction step.
print("", " BOT: That looks delicious! Let's start!", "", sep="\n")
displayStep(data['0']['instructions'][0]['stepText'], imgA)

# Turn 4: wait for the user, then show the second instruction step.
print(" BOT: Say next when you're done.", end="\n\n")
val = input("USER:")
print()
displayStep(data['0']['instructions'][1]['stepText'], imgA)

# Turn 5: final prompt.
print(" BOT: Say next when you're done.", end="\n\n")
val = input("USER:")


 BOT: Hello, I am a TaskBot and I can help you with cooking tasks. Which recipe would you like to do?



USER: cf



 BOT: Great! These are the results I found:



      Which recipe would you like to do? Or, would you like to search for something different?



USER: gdfgd



 BOT: That looks delicious! Let's start!



 BOT: Say next when you're done.



USER: trete





 BOT: Say next when you're done.



USER: gfd
