<a href="https://colab.research.google.com/github/tanmay1240/LLM/blob/main/Brain_Eyes_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install --upgrade transformers accelerate torch



In [5]:
from transformers import pipeline
from PIL import Image
import re

# =====================================================
# 1. Load Vision Model (BLIP)
# =====================================================
# Image-captioning pipeline backed by the large BLIP checkpoint; it is called
# later with the PIL image to obtain the textual description.
captioner = pipeline(task="image-to-text", model="Salesforce/blip-image-captioning-large")

# =====================================================
# 1.5 Load VQA Model (Emotion Probe)
# =====================================================
# Visual-question-answering pipeline (BLIP VQA base); used further down to ask
# the image which emotion the person is showing.
vqa = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")


# =====================================================
# 2. Load LLM
# =====================================================
# Text-generation pipeline used to turn the caption into a structured answer.
# Decoding is greedy (do_sample=False) so the output is deterministic.
# `temperature` is deliberately NOT passed: it only affects sampling, so with
# do_sample=False it is ignored and merely triggers a generation-config warning.
# return_full_text=False makes the pipeline return only the newly generated
# text, without echoing the prompt.
llm = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_new_tokens=120,
    do_sample=False,
    return_full_text=False,
)

# =====================================================
# 3. Load Image and Generate Caption
# =====================================================
# Ask the user for the image location. Surrounding whitespace and quote
# characters (common when a path is pasted) are removed so that an otherwise
# valid path is not rejected.
image_path = input("Enter the image path of the image you want to analyze: ").strip().strip("'\"")

try:
    image = Image.open(image_path)
except (OSError, ValueError) as error:
    # Only failures to open/identify the file are handled here; a bare
    # `except:` would also swallow KeyboardInterrupt and programming errors.
    print("❌ Error: Could not open the image. Please check the path.")
    print(f"   Details: {error}")
    # SystemExit stops the script/cell with a non-zero status without relying
    # on the interactive-only `exit()` helper.
    raise SystemExit(1)

# Run the captioning pipeline on the loaded image. The result is indexed as a
# list of dicts; the description is read from the first entry's
# "generated_text" field.
caption_result = captioner(image)
caption = caption_result[0]["generated_text"]

# Heading emoji restored from its mis-encoded form so the console output is readable.
print("\n🔍 Image Caption:")
print(caption)
print("-" * 50)

# -------------------------------
# VQA Emotion Hint (Moved here after image is loaded)
# -------------------------------
# Ask the VQA model directly about the emotion shown in the image.
vqa_result = vqa(
    image=image,
    question="What emotion is the person showing?"
)

# Use the first returned answer, normalised (trimmed and lower-cased) so that
# later comparisons against known emotion words are not defeated by casing or
# stray whitespace.
vqa_emotion = vqa_result[0]["answer"].strip().lower()

# Heading emoji restored from its mis-encoded form so the console output is readable.
print("\n🎯 VQA Emotion Hint:")
print(vqa_emotion)


# =====================================================
# 4. Build Prompt for LLM
# =====================================================
# Instruction prompt for the LLM: fixes the output format (Human / Emotion /
# Reasoning), lists the rules, and appends the image caption as the only
# description the model may rely on. The arrow in the mapping rule is restored
# from its mis-encoded form so the model does not receive garbage characters.
prompt = f"""
You MUST return output in exactly this format:
Human: Yes/No
Emotion: one of [Happy, Sad, Angry, Neutral, Fear, Surprise, Not Applicable]
Reasoning: one sentence explanation

Rules:
- Exactly ONE output block.
- No repetition.
- No additional examples.
- Do NOT invent any story or context.
- Do NOT add information not present in the description.
- Do NOT infer emotion from pets, animals, environment, or objects.
- Only use explicit emotional cues from the description.
- Concerned, worried, nervous → map to Fear.
- If no clear emotion matches, use Neutral.
- If no human is present, Emotion = Not Applicable.

Description:
{caption}
"""

# =====================================================
# 5. Run LLM
# =====================================================
# Generate the structured answer; the text is read from the first result's
# "generated_text" field.
llm_result = llm(prompt)
raw_output = llm_result[0]["generated_text"]

# =====================================================
# 6. Extract First Valid Answer Block
# =====================================================
# One block = a "Human:" line, an "Emotion:" line and a "Reasoning:" part that
# runs until the next "Human:" line or the end of the text. DOTALL lets the
# lazy wildcards span line breaks.
pattern = r"Human:\s.*?\nEmotion:\s.*?\nReasoning:.*?(?=\nHuman:|\Z)"
match = re.search(pattern, raw_output, flags=re.DOTALL)

# Keep the first complete block when there is one; otherwise fall back to the
# whole generation. Either way the text is trimmed.
answer = (match.group(0) if match else raw_output).strip()

# =====================================================
# 7. Parse Fields
# =====================================================
# Pull the three fields out of the extracted answer block.
human_match = re.search(r"Human:\s*(Yes|No)", answer, re.IGNORECASE)
# The emotion label is one or more words on the SAME line as "Emotion:".
# Only spaces/tabs are allowed between words, so the capture cannot run across
# the line break into the following "Reasoning" label.
emotion_match = re.search(
    r"Emotion:[ \t]*([A-Za-z]+(?:[ \t]+[A-Za-z]+)*)", answer, re.IGNORECASE
)
# Reasoning is everything after its label (DOTALL lets it span several lines).
reasoning_match = re.search(r"Reasoning:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)

# Missing fields fall back to "Unknown" / an empty reasoning string.
human = human_match.group(1).capitalize() if human_match else "Unknown"
# title() keeps multi-word labels such as "Not Applicable" correctly cased,
# whereas capitalize() would lower-case the second word.
emotion = emotion_match.group(1).strip().title() if emotion_match else "Unknown"
reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

# =====================================================
# 8. Correct Human Detection Logic (Priority Based)
# =====================================================
caption_lower = caption.lower()

# Words that indicate a person in the caption. Irregular plurals are listed
# explicitly; regular plurals are handled by the optional trailing "s" in the
# pattern below.
human_keywords = [
    "person", "people", "man", "men", "woman", "women",
    "boy", "girl", "child", "children", "couple",
    "human", "face"
]

# Words that indicate a scene or object without people.
non_human_keywords = [
    "car", "vehicle", "building", "street", "road",
    "house", "tree", "nature", "landscape",
    "object", "sky", "cloud", "mountain"
]

# Match whole words only. Plain substring tests misfire on unrelated words
# that merely contain a keyword, e.g. "man" in "many"/"german", "car" in
# "scarf", "face" in "surface".
human_pattern = r"\b(?:" + "|".join(re.escape(word) for word in human_keywords) + r")s?\b"
non_human_pattern = r"\b(?:" + "|".join(re.escape(word) for word in non_human_keywords) + r")s?\b"

# Priority:
# 1. If human keywords → Human = Yes
# 2. Else if non-human keywords → Human = No
# 3. Else → Unknown
# This keyword decision replaces whatever the LLM reported for "Human".
if re.search(human_pattern, caption_lower):
    human = "Yes"
elif re.search(non_human_pattern, caption_lower):
    human = "No"
else:
    human = "Unknown"

# =====================================================
# Hard Override for Non-Human Images
# =====================================================
# When the caption was classified as containing no human, emotion and
# reasoning are forced to fixed values regardless of what the LLM produced.
if human == "No":
    emotion, reasoning = (
        "Not Applicable",
        "No humans are present in the image, so emotion classification is not applicable.",
    )

# =====================================================
# 8.5 Emotion Refinement Layer (Caption-Only + Priority)
# =====================================================
# Caption cue words for each emotion label.
emotion_map = {
    "Angry": [
        "angry", "anger", "furious", "annoyed",
        "shouting", "yelling", "screaming",
        "scowl", "scowling", "glare", "glaring",
        "hostile", "aggressive", "rage"
    ],
    "Fear": [
        "fear", "afraid", "scared", "worried",
        "nervous", "anxious", "concerned",
        "terrified", "panic"
    ],
    "Sad": [
        "cry", "cries", "crying", "sad", "sadness",
        "upset", "teary", "tear", "tears", "weeping"
    ],
    "Happy": [
        "smile", "smiling", "laugh", "laughing",
        "cheerful", "joyful", "happy", "happiness"
    ]
}

# Priority order: Angry > Fear > Sad > Happy
emotion_priority = ["Angry", "Fear", "Sad", "Happy"]

# Find the highest-priority emotion with a cue word in the caption.
# Cues are matched as whole words (with an optional plural/3rd-person "s");
# plain substring tests misfire on words that merely contain a cue, e.g.
# "rage" in "garage", "anger" in "danger", "happy" in "unhappy".
detected_emotion = None
for emotion_label in emotion_priority:
    keywords = emotion_map[emotion_label]
    keyword_pattern = r"\b(?:" + "|".join(re.escape(word) for word in keywords) + r")s?\b"
    if re.search(keyword_pattern, caption_lower):
        detected_emotion = emotion_label
        break

# =====================================================
# Emotion Fusion Logic (Caption + VQA)
# =====================================================

# Maps the (lower-cased) VQA answer onto the emotion labels allowed by the
# prompt. Noun and adjective variants are included because the VQA model
# answers in free form (e.g. "happiness" rather than "happy"); without them a
# usable hint would be discarded and the result would fall through to Neutral.
vqa_emotion_labels = {
    "happy": "Happy", "happiness": "Happy", "joy": "Happy",
    "sad": "Sad", "sadness": "Sad",
    "angry": "Angry", "anger": "Angry",
    "fear": "Fear", "scared": "Fear", "afraid": "Fear",
    "surprise": "Surprise", "surprised": "Surprise",
    "neutral": "Neutral",
}

# Kept for backward compatibility: the list of accepted raw VQA answers.
valid_vqa_emotions = list(vqa_emotion_labels)

# None when the VQA answer is not a recognised emotion word.
vqa_label = vqa_emotion_labels.get(vqa_emotion.strip())

if human == "Yes":
    if detected_emotion:
        # 1st choice: explicit emotional cue words found in the caption.
        emotion = detected_emotion
        reasoning = (
            f"The description contains emotional cues such as {emotion.lower()} related expressions, "
            f"so the emotion is classified as {emotion}."
        )

    elif vqa_label:
        # 2nd choice: the VQA model's answer, normalised to a canonical label.
        emotion = vqa_label

        reasoning = (
            f"The visual question answering model suggests the emotion '{emotion}', "
            f"which is used when no explicit emotional cues are found in the caption."
        )

    else:
        # Neither source gave a usable cue.
        emotion = "Neutral"
        reasoning = (
            "The description mentions a human, but neither the caption nor the visual question answering model "
            "provides clear emotional cues. Therefore, the emotion is classified as Neutral."
        )
else:
    # No confirmed human ("No" or "Unknown"): emotion does not apply.
    # The existing reasoning text is left as it is.
    emotion = "Not Applicable"


# =====================================================
# 9. Prevent Story Hallucination
# =====================================================
# Words that suggest the reasoning drifted into an invented back-story.
forbidden_words = ["home", "work", "school", "office", "after", "day"]

# If any of them appears (substring match, case-insensitive), replace the
# reasoning with a neutral statement. The replacement only claims that a human
# is present when the human check actually said "Yes"; otherwise it would
# contradict the reported "Human" field.
if any(word in reasoning.lower() for word in forbidden_words):
    if human == "Yes":
        reasoning = (
            "The description contains a human, but no contextual background or life situation "
            "is provided. The reasoning must rely only on the visual description."
        )
    else:
        reasoning = (
            "No contextual background or life situation is provided. "
            "The reasoning must rely only on the visual description."
        )

# =====================================================
# 10. Final Output
# =====================================================
# Assemble and print the final report from the three resolved fields.
# The heading emoji is restored from its mis-encoded form so the console
# output is readable.
final_output = f"""
🧠 Multimodal Reasoning Output
-----------------------------
Human: {human}
Emotion: {emotion}
Reasoning: {reasoning}
"""

print(final_output)


Device set to use cpu
Device set to use cpu
Device set to use cpu


Enter the image path of the image you want to analyze: /content/smile.avif

🔍 Image Caption:
smiling man with beard and green shirt looking at camera
--------------------------------------------------

🎯 VQA Emotion Hint:
happiness

🧠 Multimodal Reasoning Output
-----------------------------
Human: Yes
Emotion: Happy
Reasoning: The description contains emotional cues such as happy related expressions, so the emotion is classified as Happy.

