In [2]:
!pip install google-generativeai
!pip install faiss-cpu
!pip install transformers torchaudio librosa
!pip install git+https://github.com/salesforce/LAVIS.git

Collecting git+https://github.com/salesforce/LAVIS.git
  Cloning https://github.com/salesforce/LAVIS.git to /tmp/pip-req-build-ef3xpavo
  Running command git clone --filter=blob:none --quiet https://github.com/salesforce/LAVIS.git /tmp/pip-req-build-ef3xpavo
  Resolved https://github.com/salesforce/LAVIS.git to commit 506965b9c4a18c1e565bd32acaccabe0198433f7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting contexttimer (from salesforce-lavis==1.0.1)
  Using cached contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting decord (from salesforce-lavis==1.0.1)
  Using cached decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting diffusers<=0.16.0 (from salesforce-lavis==1.0.1)
  Using cached diffusers-0.16.0-py3-none-any.whl.metadata (19 kB)
Collecting fairscale==0.4.4 (from salesforce-lavis==

In [3]:
!pip install git+https://github.com/salesforce/LAVIS.git


Collecting git+https://github.com/salesforce/LAVIS.git
  Cloning https://github.com/salesforce/LAVIS.git to /tmp/pip-req-build-in_h78i9
  Running command git clone --filter=blob:none --quiet https://github.com/salesforce/LAVIS.git /tmp/pip-req-build-in_h78i9
  Resolved https://github.com/salesforce/LAVIS.git to commit 506965b9c4a18c1e565bd32acaccabe0198433f7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting contexttimer (from salesforce-lavis==1.0.1)
  Using cached contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting decord (from salesforce-lavis==1.0.1)
  Using cached decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting diffusers<=0.16.0 (from salesforce-lavis==1.0.1)
  Using cached diffusers-0.16.0-py3-none-any.whl.metadata (19 kB)
Collecting fairscale==0.4.4 (from salesforce-lavis==

In [4]:

!pip install google-generativeai faiss-cpu transformers torchaudio librosa
!pip install git+https://github.com/salesforce/LAVIS.git
!pip install timm fairscale omegaconf


Collecting git+https://github.com/salesforce/LAVIS.git
  Cloning https://github.com/salesforce/LAVIS.git to /tmp/pip-req-build-8liw3gg8
  Running command git clone --filter=blob:none --quiet https://github.com/salesforce/LAVIS.git /tmp/pip-req-build-8liw3gg8
  Resolved https://github.com/salesforce/LAVIS.git to commit 506965b9c4a18c1e565bd32acaccabe0198433f7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting contexttimer (from salesforce-lavis==1.0.1)
  Using cached contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting decord (from salesforce-lavis==1.0.1)
  Using cached decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting diffusers<=0.16.0 (from salesforce-lavis==1.0.1)
  Using cached diffusers-0.16.0-py3-none-any.whl.metadata (19 kB)
Collecting fairscale==0.4.4 (from salesforce-lavis==

In [6]:


import os
import torch
import numpy as np
from PIL import Image
import torchaudio
from transformers import CLIPProcessor, CLIPModel, Wav2Vec2Processor, Wav2Vec2Model
from lavis.models import load_model_and_preprocess
import google.generativeai as genai
from google.colab import files
import faiss

# ================================
# 🔐 CONFIGURE GEMINI
# ================================
GEMINI_API_KEY = ""  # Replace this
genai.configure(api_key=GEMINI_API_KEY)
gemini = genai.GenerativeModel("gemini-1.5-pro-latest")

# ================================
# 🧠 LOAD ENCODERS
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP for text & image
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wav2Vec2 for audio
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# BLIP for image captioning
blip_model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="base_coco", is_eval=True, device=device
)

# ================================
# 🔧 ENCODING FUNCTIONS
# ================================
def encode_text(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
    return features[0].cpu().numpy()

def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return features[0].cpu().numpy()

def encode_audio(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
    input_values = audio_processor(waveform.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        features = audio_model(input_values).last_hidden_state.mean(dim=1)
    return features[0].cpu().numpy()

def get_image_caption(image_path):
    raw_image = Image.open(image_path).convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    caption = blip_model.generate({"image": image})[0]
    return caption

# ================================
# 📚 KNOWLEDGE BASE + FAISS
# ================================
knowledge_base = [
    "A spectrogram shows frequency over time and intensity.",
    "Bar charts are ideal for comparing quantities across categories.",
    "Waveforms represent audio amplitude over time.",
    "Histograms are great for visualizing distributions of numeric data."
]

kb_embeddings = np.array([encode_text(doc) for doc in knowledge_base]).astype("float32")
dimension = kb_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(kb_embeddings)

def retrieve_relevant_docs(query_embedding, top_k=2):
    query_embedding = np.expand_dims(query_embedding.astype("float32"), axis=0)
    distances, indices = index.search(query_embedding, top_k)
    return [knowledge_base[i] for i in indices[0]]

# ================================
# 🎯 GEMINI RAG CALL
# ================================
def generate_with_gemini_1_5(image_path, user_query, retrieved_docs, image_caption=None):
    vision_data = Image.open(image_path).convert("RGB") if image_path else None
    prompt = f"""
You are a helpful assistant. Answer the user query based on retrieved context and input modality.

User Question:
{user_query}

Retrieved Context:
{retrieved_docs}

Image Description (if any):
{image_caption if image_caption else "No image uploaded."}
"""

    parts = [prompt]
    if vision_data:
        parts.append(vision_data)

    response = gemini.generate_content(parts)
    return response.text

# ================================
# 🚀 MAIN MULTIMODAL RAG
# ================================
print("What input modalities will you use?")
print("1 - Text\n2 - Image\n3 - Audio\n(You can combine, e.g., 1 2 3)")
selected = input("Enter choice(s): ").split()

use_text = '1' in selected
use_image = '2' in selected
use_audio = '3' in selected

vectors = []
image_path, audio_path, text_query = None, None, "What is shown or heard here?"
caption = ""

# 📁 Upload image
if use_image:
    print("📁 Upload an image:")
    uploaded = files.upload()
    image_path = list(uploaded.keys())[0]
    caption = get_image_caption(image_path)
    vectors.append(encode_image(image_path))

# 🎧 Upload audio
if use_audio:
    print("🎧 Upload audio file (.wav):")
    uploaded = files.upload()
    audio_path = list(uploaded.keys())[0]
    vectors.append(encode_audio(audio_path))

# ✍️ Text input
if use_text:
    text_query = input("📝 Enter your text query: ")
    vectors.append(encode_text(text_query))

if not vectors:
    print("❌ No inputs provided. Please select at least one modality.")
else:
    # 🔍 Retrieve context and answer
    query_vec = np.mean(np.stack(vectors), axis=0)
    retrieved = retrieve_relevant_docs(query_vec)

    print("\n📚 Retrieved Docs:")
    for doc in retrieved:
        print("•", doc)

    final_answer = generate_with_gemini_1_5(
        image_path=image_path if use_image else None,
        user_query=text_query,
        retrieved_docs=" ".join(retrieved),
        image_caption=caption if use_image else None
    )

    print("\n🤖 Gemini 1.5 Answer:\n", final_answer)


ModuleNotFoundError: No module named 'lavis'

In [16]:
import os
import torch
import numpy as np
from PIL import Image
import torchaudio
from transformers import (
    CLIPProcessor, CLIPModel,
    Wav2Vec2Processor, Wav2Vec2Model,
    BlipProcessor, BlipForConditionalGeneration
)
import google.generativeai as genai
from google.colab import files
import faiss

# ================================
# 🔐 GEMINI API KEY
# ================================
GEMINI_API_KEY = "AIzaSyAJtk_kLPx7s5YWcrfbMBVuaKxqmswTSU0"  # ⬅️ Replace with your actual API key
genai.configure(api_key=GEMINI_API_KEY)
gemini = genai.GenerativeModel("gemini-1.5-pro-latest")

# ================================
# 🧠 LOAD MODELS
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP for text & image
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wav2Vec2 for audio
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ================================
# 🔧 ENCODING FUNCTIONS
# ================================
def encode_text(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
    return features[0].cpu().numpy()

def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return features[0].cpu().numpy()

def encode_audio(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
    input_values = audio_processor(waveform.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        features = audio_model(input_values).last_hidden_state.mean(dim=1)
    return features[0].cpu().numpy()

def get_image_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs, max_new_tokens=30)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)
    return caption

# ================================
# 📚 KNOWLEDGE BASE + FAISS
# ================================
knowledge_base = [
    "A spectrogram shows frequency over time and intensity.",
    "Bar charts are ideal for comparing quantities across categories.",
    "Waveforms represent audio amplitude over time.",
    "Histograms are great for visualizing distributions of numeric data."
]

kb_embeddings = np.array([encode_text(doc) for doc in knowledge_base]).astype("float32")
dimension = kb_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(kb_embeddings)

def retrieve_relevant_docs(query_embedding, top_k=2):
    query_embedding = np.expand_dims(query_embedding.astype("float32"), axis=0)
    distances, indices = index.search(query_embedding, top_k)
    return [knowledge_base[i] for i in indices[0]]

# ================================
# 🎯 GEMINI 1.5 GENERATION
# ================================
def generate_with_gemini_1_5(image_path, user_query, retrieved_docs, image_caption=None):
    vision_data = Image.open(image_path).convert("RGB") if image_path else None


    prompt = f"""
You are a helpful assistant. Answer the user query based on retrieved context and input modality.

User Question:
{user_query}

Retrieved Context:
{retrieved_docs}

Image Description (if any):
{image_caption if image_caption else "No image uploaded."}
"""

    parts = [prompt]
    if vision_data:
        parts.append(vision_data)

    response = gemini.generate_content(parts)
    return response.text

# ================================
# 🚀 MULTIMODAL RAG FLOW
# ================================
print("What input modalities will you use?")
print("1 - Text\n2 - Image\n3 - Audio\n(You can combine, e.g., 1 2 3)")
selected = input("Enter choice(s): ").split()

use_text = '1' in selected
use_image = '2' in selected
use_audio = '3' in selected

vectors = []
image_path, audio_path, text_query = None, None, "What is shown or heard here?"
caption = ""

# 📁 Upload image
if use_image:
    print("📁 Upload an image file:")
    uploaded = files.upload()
    image_path = list(uploaded.keys())[0]
    caption = get_image_caption(image_path)
    vectors.append(encode_image(image_path))

# 🎧 Upload audio
if use_audio:
    print("🎧 Upload audio file (.wav):")
    uploaded = files.upload()
    audio_path = list(uploaded.keys())[0]
    vectors.append(encode_audio(audio_path))

# ✍️ Text input
if use_text:
    text_query = input("📝 Enter your text query: ")
    vectors.append(encode_text(text_query))

from IPython.display import display
import ipywidgets as widgets

if not vectors:
    print("❌ No inputs provided. Please select at least one modality.")
else:
    # Display text box for user query
    question_box = widgets.Text(
        placeholder='Ask a question about the uploaded file...',
        description='Your Q:',
        layout=widgets.Layout(width='100%'),
        style={'description_width': 'initial'}
    )
    display(question_box)

    output_box = widgets.Output()
    display(output_box)

    def on_submit(change):
        with output_box:
            output_box.clear_output()
            user_question = question_box.value.strip()
            if user_question:
                # You can still use retrieval if desired (optional)
                # query_vec = encode_text(user_question)
                # retrieved = retrieve_relevant_docs(query_vec)

                answer = generate_with_gemini_1_5(
                    image_path=image_path if use_image else None,
                    user_query=user_question,
                    retrieved_docs="",  # Retrieval disabled here
                    image_caption=caption if use_image else None
                )
                print("🤖 Gemini 1.5 Answer:\n", answer)
            else:
                print("⚠️ Please ask a question.")

    question_box.on_submit(on_submit)





Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


What input modalities will you use?
1 - Text
2 - Image
3 - Audio
(You can combine, e.g., 1 2 3)
Enter choice(s): 1


KeyboardInterrupt: Interrupted by user

In [20]:
import os
import torch
import numpy as np
from PIL import Image
import torchaudio
from transformers import (
    CLIPProcessor, CLIPModel,
    Wav2Vec2Processor, Wav2Vec2Model,
    BlipProcessor, BlipForConditionalGeneration
)
import google.generativeai as genai
from google.colab import files
import faiss

# ================================
# 🔐 GEMINI API KEY
# ================================
GEMINI_API_KEY = "AIzaSyAJtk_kLPx7s5YWcrfbMBVuaKxqmswTSU0"  # ⬅️ Replace with your actual API key
genai.configure(api_key=GEMINI_API_KEY)
gemini = genai.GenerativeModel("gemini-1.5-pro-latest")

# ================================
# 🧠 LOAD MODELS
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP for text & image
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wav2Vec2 for audio
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ================================
# 🔧 ENCODING FUNCTIONS
# ================================
def encode_text(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
    return features[0].cpu().numpy()

def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return features[0].cpu().numpy()

def encode_audio(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
    input_values = audio_processor(waveform.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        features = audio_model(input_values).last_hidden_state.mean(dim=1)
    return features[0].cpu().numpy()

def get_image_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs, max_new_tokens=30)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)
    return caption

# ================================
# 🎯 GEMINI 1.5 GENERATION
# ================================
def generate_with_gemini_1_5(user_query, context=None, image_path=None, image_caption=None):
    vision_data = Image.open(image_path).convert("RGB") if image_path else None

    prompt = f"""
You are a helpful assistant. Answer the user's question based on the context provided.

User Question:
{user_query}

Context:
{context if context else "No additional context."}

Image Description:
{image_caption if image_caption else "No image uploaded."}
"""

    parts = [prompt]
    if vision_data:
        parts.append(vision_data)

    response = gemini.generate_content(parts)
    return response.text

# ================================
# 🚀 INTERACTIVE LOOP
# ================================
while True:
    print("\nWhat input modality will you use?")
    print("1 - Text\n2 - Image\n3 - Audio\n4 - Exit")
    selected = input("Enter choice: ").strip()

    if selected == '4':
        print("👋 Exiting...")
        break

    text_context = ""
    image_path = None
    audio_path = None
    image_caption = ""

    if selected == '1':
        print("✍️ Enter a paragraph of text:")
        text_context = input()

    elif selected == '2':
        print("📁 Upload an image file:")
        uploaded = files.upload()
        image_path = list(uploaded.keys())[0]
        image_caption = get_image_caption(image_path)

    elif selected == '3':
        print("🎧 Upload an audio file (.wav):")
        uploaded = files.upload()
        audio_path = list(uploaded.keys())[0]

    else:
        print("❌ Invalid option. Please try again.")
        continue

    # Ask questions iteratively
    while True:
        user_question = input("\n❓ Ask a question (or type 'exit' to go back): ").strip()
        if user_question.lower() == 'exit':
            break

        answer = generate_with_gemini_1_5(
            user_query=user_question,
            context=text_context if selected == '1' else None,
            image_path=image_path if selected == '2' else None,
            image_caption=image_caption if selected == '2' else None
        )
        print("🤖 Gemini 1.5 Answer:\n", answer)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 1
✍️ Enter a paragraph of text:
Coronavirus (COVID-19) is a highly infectious disease caused by the novel coronavirus SARS-CoV-2. It first emerged in Wuhan, China, in late 2019 and rapidly spread across the globe, leading to a worldwide pandemic. The virus primarily spreads through respiratory droplets and aerosols, and can cause symptoms ranging from mild respiratory issues to severe pneumonia and death. Preventive measures such as vaccination, mask-wearing, social distancing, and good hand hygiene have been key in controlling its spread. Ongoing research focuses on understanding variants and improving treatment strategies.

❓ Ask a question (or type 'exit' to go back): where corona virus emerged first?
🤖 Gemini 1.5 Answer:
 Wuhan, China.


❓ Ask a question (or type 'exit' to go back): exit

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 2
📁 Upload an image fil

Saving ying.jfif to ying (5).jfif

❓ Ask a question (or type 'exit' to go back): what is the origin of this symbol?
🤖 Gemini 1.5 Answer:
 That's the yin and yang symbol, originating from ancient Chinese philosophy and cosmology.  It represents the interconnectedness and interdependence of opposite forces in the universe, such as light and dark, male and female, and action and inaction.  It's a key concept in Taoism.


❓ Ask a question (or type 'exit' to go back): exit

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 3
🎧 Upload an audio file (.wav):


Saving sample-6s.mp3 to sample-6s.mp3

❓ Ask a question (or type 'exit' to go back): what is the name of this music?
🤖 Gemini 1.5 Answer:
 I do not have enough information to answer what music is playing.  I have no context or audio to work with.


❓ Ask a question (or type 'exit' to go back): exit

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 3
🎧 Upload an audio file (.wav):


Saving health 011509 wisdom teeth_0.mp3 to health 011509 wisdom teeth_0.mp3

❓ Ask a question (or type 'exit' to go back): what is wisdom teeth?
🤖 Gemini 1.5 Answer:
 Wisdom teeth are the third molars, typically the last teeth to erupt in the mouth, usually between the ages of 17 and 25.  They are located in the very back of the mouth, both top and bottom.


❓ Ask a question (or type 'exit' to go back): so you have answered this question based on the audio i provided or not?
🤖 Gemini 1.5 Answer:
 I have no access to audio or any files you might have provided outside of this current text interaction. So, the answer I've given is not based on any audio you've previously shared.


❓ Ask a question (or type 'exit' to go back): exit

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: exit
❌ Invalid option. Please try again.

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 4
👋 Exiting...


In [22]:
import os
import torch
import numpy as np
from PIL import Image
import torchaudio
from transformers import (
    CLIPProcessor, CLIPModel,
    Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer,
    BlipProcessor, BlipForConditionalGeneration
)
import google.generativeai as genai
from google.colab import files
import faiss

# ================================
# 🔐 GEMINI API KEY
# ================================
GEMINI_API_KEY = "AIzaSyAJtk_kLPx7s5YWcrfbMBVuaKxqmswTSU0"  # ⬅️ Replace with your actual API key
genai.configure(api_key=GEMINI_API_KEY)
gemini = genai.GenerativeModel("gemini-1.5-pro-latest")

# ================================
# 🧠 LOAD MODELS
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP for text & image
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wav2Vec2 for audio transcription
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ================================
# 🔧 ENCODING FUNCTIONS
# ================================
def encode_text(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
    return features[0].cpu().numpy()

def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return features[0].cpu().numpy()

def encode_audio(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
    input_values = audio_processor(waveform.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        logits = audio_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = audio_tokenizer.batch_decode(predicted_ids)[0].lower()
    return transcription

def get_image_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs, max_new_tokens=30)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)
    return caption

# ================================
# 🎯 GEMINI 1.5 GENERATION
# ================================
def generate_with_gemini_1_5(user_query, context=None, image_path=None, image_caption=None):
    vision_data = Image.open(image_path).convert("RGB") if image_path else None

    prompt = f"""
You are a helpful assistant. Answer the user's question based on the context provided.

User Question:
{user_query}

Context:
{context if context else "No additional context."}

Image Description:
{image_caption if image_caption else "No image uploaded."}
"""

    parts = [prompt]
    if vision_data:
        parts.append(vision_data)

    response = gemini.generate_content(parts)
    return response.text

# ================================
# 🚀 INTERACTIVE LOOP
# ================================
while True:
    print("\nWhat input modality will you use?")
    print("1 - Text\n2 - Image\n3 - Audio\n4 - Exit")
    selected = input("Enter choice: ").strip()

    if selected == '4':
        print("👋 Exiting...")
        break

    text_context = ""
    image_path = None
    audio_path = None
    image_caption = ""

    if selected == '1':
        print("✍️ Enter a paragraph of text:")
        text_context = input()

    elif selected == '2':
        print("📁 Upload an image file:")
        uploaded = files.upload()
        image_path = list(uploaded.keys())[0]
        image_caption = get_image_caption(image_path)

    elif selected == '3':
        print("🎧 Upload an audio file (.wav):")
        uploaded = files.upload()
        audio_path = list(uploaded.keys())[0]
        text_context = encode_audio(audio_path)  # Transcribe audio to text
        print("📝 Transcription of audio:")
        print(text_context)

    else:
        print("❌ Invalid option. Please try again.")
        continue

    # Ask questions iteratively
    while True:
        user_question = input("\n❓ Ask a question (or type 'exit' to go back): ").strip()
        if user_question.lower() == 'exit':
            break

        answer = generate_with_gemini_1_5(
            user_query=user_question,
            context=text_context if selected in ['1', '3'] else None,
            image_path=image_path if selected == '2' else None,
            image_caption=image_caption if selected == '2' else None
        )
        print("🤖 Gemini 1.5 Answer:\n", answer)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 3
🎧 Upload an audio file (.wav):


Saving health 011509 wisdom teeth_0.mp3 to health 011509 wisdom teeth_0 (2).mp3
📝 Transcription of audio:
this is the veoa special english health report wisdom teeth are normally the last teeth to appear in the mouth it usually happens when people are older and wiser that is when there in their late teanage years or early twenties wisdom teeth are molers or chewing teeth at the back of the mouth the third set of molers if you have them are your wisdom teeth they can grow into place normally and never cause a problem but often there is not enough room for them in the mouth they might crowd the other teeth sometimes they even push through the gums sideways and impacted wisdom teeth is one that fails to completely rise through the gums the term is erupt wisdom teeth that only partly erupt can leave space for bacteria to enter around the tooth infection is a risk in these cases experts say people should have their mouths examined between the ages of sixteen and twentyt for placement of the

In [24]:
import os
import torch
import numpy as np
from PIL import Image
import torchaudio
from transformers import (
    CLIPProcessor, CLIPModel,
    Wav2Vec2Processor, Wav2Vec2ForCTC,
    BlipProcessor, BlipForConditionalGeneration
)
import google.generativeai as genai
from google.colab import files
import faiss

# Enable MP3 support
torchaudio.set_audio_backend("sox_io")

# ================================
# 🔐 GEMINI API KEY
# ================================
GEMINI_API_KEY = "AIzaSyAJtk_kLPx7s5YWcrfbMBVuaKxqmswTSU0"  # ⮆️ Replace with your actual API key
genai.configure(api_key=GEMINI_API_KEY)
gemini = genai.GenerativeModel("gemini-1.5-pro-latest")

# ================================
# 🧠 LOAD MODELS
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP for text & image
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wav2Vec2 for audio transcription
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ================================
# 🔧 ENCODING FUNCTIONS
# ================================
def encode_text(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
    return features[0].cpu().numpy()

def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return features[0].cpu().numpy()

def encode_audio(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
    input_values = audio_processor(waveform.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        logits = audio_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = audio_processor.batch_decode(predicted_ids)[0].lower()
    return transcription

def get_image_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs, max_new_tokens=30)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)
    return caption

# ================================
# 🎯 GEMINI 1.5 GENERATION
# ================================
def generate_with_gemini_1_5(user_query, context=None, image_path=None, image_caption=None):
    vision_data = Image.open(image_path).convert("RGB") if image_path else None

    prompt = f"""
You are a helpful assistant. Answer the user's question based on the context provided.

User Question:
{user_query}

Context:
{context if context else "No additional context."}

Image Description:
{image_caption if image_caption else "No image uploaded."}
"""

    parts = [prompt]
    if vision_data:
        parts.append(vision_data)

    response = gemini.generate_content(parts)
    return response.text

# ================================
# 🚀 INTERACTIVE LOOP
# ================================
while True:
    print("\nWhat input modality will you use?")
    print("1 - Text\n2 - Image\n3 - Audio\n4 - Exit")
    selected = input("Enter choice: ").strip()

    if selected == '4':
        print("👋 Exiting...")
        break

    text_context = ""
    image_path = None
    audio_path = None
    image_caption = ""

    if selected == '1':
        print("✍️ Enter a paragraph of text:")
        text_context = input()

    elif selected == '2':
        print("📁 Upload an image file:")
        uploaded = files.upload()
        image_path = list(uploaded.keys())[0]
        image_caption = get_image_caption(image_path)

    elif selected == '3':
        print("🎷 Upload an audio file (.mp3 or .wav):")
        uploaded = files.upload()
        audio_path = list(uploaded.keys())[0]
        text_context = encode_audio(audio_path)
        print("📝 Transcription of audio:")
        print(text_context)

    else:
        print("❌ Invalid option. Please try again.")
        continue

    # Ask questions iteratively
    while True:
        user_question = input("\n❓ Ask a question (or type 'exit' to go back): ").strip()
        if user_question.lower() == 'exit':
            break

        answer = generate_with_gemini_1_5(
            user_query=user_question,
            context=text_context if selected in ['1', '3'] else None,
            image_path=image_path if selected == '2' else None,
            image_caption=image_caption if selected == '2' else None
        )
        print("🧠 Gemini 1.5 Answer:\n", answer)


  torchaudio.set_audio_backend("sox_io")
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 2
📁 Upload an image file:


Saving ying.jfif to ying (6).jfif

❓ Ask a question (or type 'exit' to go back): what does yin mean
🧠 Gemini 1.5 Answer:
 Yin, represented by the dark swirl in the symbol, is one half of the Taoist concept of duality.  It is associated with the feminine, passive, dark, cold, and yielding aspects of existence.  It complements and is interconnected with yang, the light swirl.  Together, yin and yang represent the interplay of opposing forces that make up all aspects of the universe and life.


❓ Ask a question (or type 'exit' to go back): exit

What input modality will you use?
1 - Text
2 - Image
3 - Audio
4 - Exit
Enter choice: 4
👋 Exiting...
