## Import Necessary Libraries and Load Environment Variables

In [2]:
# Import libraries
import os
from dotenv import load_dotenv
from pathlib import Path
from swarmauri.utils.base64_to_img_url import base64_to_img_url
from swarmauri.llms.concrete.FalAIVisionModel import FalAIVisionModel
from swarmauri.llms.concrete.OpenAIAudioTTS import OpenAIAudioTTS
from swarmauri.llms.concrete.OpenAIModel import OpenAIModel as LLM
from swarmauri.conversations.concrete.Conversation import Conversation
from swarmauri.messages.concrete.HumanMessage import HumanMessage
from swarmauri.llms.concrete.DeepInfraImgGenModel import DeepInfraImgGenModel

# Populate the process environment (via python-dotenv) before reading any keys.
load_dotenv()

# One credential per backing service; os.getenv yields None for any that is unset.
DEEPINFRA_API_KEY, IMGBB_API_KEY, FAL_KEY, OPENAI_API_KEY = (
    os.getenv(key_name)
    for key_name in ("DEEPINFRA_API_KEY", "IMGBB_API_KEY", "FAL_KEY", "OPENAI_API_KEY")
)


# Model clients shared by every cell below.
llm = LLM(api_key=OPENAI_API_KEY)
llm_img_gen = DeepInfraImgGenModel(api_key=DEEPINFRA_API_KEY)
# The vision client is optional: without FAL_KEY the name is bound to None.
if FAL_KEY:
    falai_vision_model = FalAIVisionModel(api_key=FAL_KEY)
else:
    falai_vision_model = None
tts = OpenAIAudioTTS(api_key=OPENAI_API_KEY)

# Folder (relative to the working directory) that receives generated audio;
# created on first run, reused afterwards.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)


## Multimodal Chatbot Function

In [3]:
def multimodal_chatbot(input_text, mode="text", system_context="Default system context", model_name="tts-1"):
    """
    Route one user request to the model that matches ``mode``.

    Parameters:
    - input_text (str): User's input. In 'vision' mode it is passed on as the
      image URL; in 'audio' mode it is the text to be spoken.
    - mode (str): Mode of interaction ('text', 'image_gen', 'audio', 'vision').
    - system_context (str): Additional context. Only used in 'text' mode, where
      it is prepended to ``input_text`` inside the same human message.
    - model_name (str): Model for text-to-speech. Only used in 'audio' mode.

    Returns:
    - 'text': content of the conversation's last message after the LLM call.
    - 'image_gen': the string "Generated Image: <url>".
    - 'audio': the string "Audio saved: <path>".
    - 'vision': whatever ``falai_vision_model.process_image`` returns, or an
      explanatory string when no vision model was configured.
    - anything else: the string "Invalid mode selected."
    """
    if mode == "text":
        conversation = Conversation()
        conversation.add_message(HumanMessage(content=system_context + "\n" + input_text))
        llm.predict(conversation=conversation)
        return conversation.get_last().content

    elif mode == "image_gen":
        # Two-step pipeline: the LLM's reply to the user's text becomes the
        # prompt handed to the image generator.
        conversation = Conversation()
        conversation.add_message(HumanMessage(content=input_text))
        llm.predict(conversation=conversation)
        detailed_description = conversation.get_last().content
        image_base64 = llm_img_gen.generate_image_base64(detailed_description)
        image_url = base64_to_img_url(image_base64, IMGBB_API_KEY)
        return f"Generated Image: {image_url}"

    elif mode == "audio":
        # NOTE: this reconfigures the shared module-level `tts` object, and the
        # target path is fixed, so every request writes to the same file name.
        tts.name = model_name
        tts.voice = "alloy"
        output_path = output_dir / "output.mp3"
        tts.predict(text=input_text, audio_path=str(output_path))
        return f"Audio saved: {str(output_path)}"

    elif mode == "vision":
        # The setup cell binds falai_vision_model to None when FAL_KEY is
        # missing; report that instead of failing with AttributeError.
        if falai_vision_model is None:
            return "Vision mode is unavailable: FAL_KEY is not set."
        result = falai_vision_model.process_image(image_url=input_text, prompt="Describe this image.")
        return result
    else:
        return "Invalid mode selected."


## Build Gradio Interface

In [7]:
import gradio as gr

In [8]:
# Define Gradio interface
def gradio_interface(user_input, mode, system_context, model_name):
    """Click handler: relay the four UI values to ``multimodal_chatbot`` and return its result."""
    options = {
        "mode": mode,
        "system_context": system_context,
        "model_name": model_name,
    }
    return multimodal_chatbot(user_input, **options)

# Assemble the UI. Components are created in the order they should appear,
# so the statement order below is the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal Chatbot")
    
    # First row: the free-form input next to the mode selector.
    with gr.Row():
        # A single textbox serves every mode; in 'vision' mode the value is used as an image URL.
        user_input = gr.Textbox(label="Input", placeholder="Type your text or image URL here...")
        # The choices mirror the mode strings that multimodal_chatbot branches on.
        mode = gr.Dropdown(choices=["text", "image_gen", "audio", "vision"], label="Mode", value="text")
    
    # Optional settings; the initial values match multimodal_chatbot's defaults.
    system_context = gr.Textbox(label="System Context", value="Default system context", placeholder="Add any context...")
    model_name = gr.Textbox(label="TTS Model Name (for audio)", value="tts-1")
    
    # Result area. NOTE(review): interactive=True leaves the output box editable
    # by the user - confirm that is intended.
    output = gr.Textbox(label="Output", lines=10, interactive=True)
    
    submit_btn = gr.Button("Submit")
    # The `inputs` list order must match gradio_interface's parameter order.
    submit_btn.click(gradio_interface, inputs=[user_input, mode, system_context, model_name], outputs=output)

# Start the app; the captured output below shows it serving on a local URL.
demo.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




## Summary

- Gradio simplifies building user-friendly UIs for your multimodal chatbot.
- Users can interact with the chatbot in different modes (text, image generation, audio, vision).
- This interface is easily extendable and deployable locally or online.

# Notebook Metadata

In [9]:
import platform
import sys
from datetime import datetime

# Display author information
author_name = "Huzaifa Irshad"
github_username = "irshadhuzaifa"

print(f"Author: {author_name}")
print(f"GitHub Username: {github_username}")

# Last modified datetime (file's metadata).
# The path is relative, so this only succeeds when the kernel's working
# directory contains the notebook file under exactly this name.
notebook_file = "Notebook_03_Deployment.ipynb"
try:
    last_modified_time = os.path.getmtime(notebook_file)
    last_modified_datetime = datetime.fromtimestamp(last_modified_time)
    print(f"Last Modified: {last_modified_datetime}")
except Exception as e:
    # Best-effort: a missing or unreadable file must not stop the rest of the report.
    print(f"Could not retrieve last modified datetime: {e}")

# Display platform, Python version, and Swarmauri version
print(f"Platform: {platform.system()} {platform.release()}")
print(f"Python Version: {sys.version}")

import importlib.metadata

import swarmauri

# `version` must hold only the version number: the label is added once, by the
# print below. (The previous fallback embedded the label in the value, which
# printed "Swarmauri Version: Swarmauri Version: 0.5.1".)
try:
    version = swarmauri.__version__
except AttributeError:
    try:
        # Ask the installed distribution's metadata before guessing.
        version = importlib.metadata.version("swarmauri")
    except importlib.metadata.PackageNotFoundError:
        # Last resort: the hard-coded value carried over from the original cell.
        version = "0.5.1"

print(f"Swarmauri Version: {version}")

Author: Huzaifa Irshad
GitHub Username: irshadhuzaifa
Last Modified: 2024-11-07 13:27:24.813063
Platform: Windows 11
Python Version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Swarmauri Version: Swarmauri Version: 0.5.1
