In [1]:
# First cell - Setup paths
import sys
import os

# Setup paths as before
notebook_dir = os.path.dirname('__file__')
lerobot_root = os.path.abspath(os.path.join(notebook_dir, '../..'))
sys.path.append(lerobot_root)
os.chdir(lerobot_root)

In [2]:
import cv2
import numpy as np
from PIL import Image
import torch
from tqdm.notebook import tqdm
from typing import List, Optional, Dict, Union, Tuple
import matplotlib.pyplot as plt
from pathlib import Path
import modal
import gc
import json
from collections import deque
import time
import re
from google import genai
from google.genai import types
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.caferacer.scripts.image_utils import reorder_tensor_dimensions, tensor_to_pil, display_images
from lerobot.caferacer.scripts.aug_utils import flip_frame, apply_color, get_mask, precompute_masks

objc[15640]: Class AVFFrameReceiver is implemented in both /Users/shreyas/opt/anaconda3/envs/caferacer/lib/python3.10/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x31cf0c798) and /Users/shreyas/opt/anaconda3/envs/caferacer/lib/libavdevice.59.7.100.dylib (0x325278778). One of the two will be used. Which one is undefined.
objc[15640]: Class AVFAudioReceiver is implemented in both /Users/shreyas/opt/anaconda3/envs/caferacer/lib/python3.10/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x31cf0c7e8) and /Users/shreyas/opt/anaconda3/envs/caferacer/lib/libavdevice.59.7.100.dylib (0x3252787c8). One of the two will be used. Which one is undefined.


In [3]:
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)
train_repo = "shreyasgite/so100_base_left"
eval_repo = "shreyasgite/eval_test"
MODEL_ID = "gemini-2.0-flash"  # Use Gemini 2.0 Flash for 3D capabilities
PRO_MODEL_ID ='gemini-2.0-pro-exp-02-05'
#repo_id0 = "shreyasgite/so100_base_env"

In [4]:
gsam = modal.Function.lookup("grounded-sam","GroundedSam.run", environment_name='prod')
#gsam = modal.Cls.from_name("grounded-sam","GroundedSam.run", environment_name='prod')

/Users/shreyas/opt/anaconda3/envs/caferacer/lib/python3.10/asyncio/events.py:80: DeprecationError: 2025-01-27: `modal.Function.lookup` is deprecated and will be removed in a future release. It can be replaced with `modal.Function.from_name`.

See https://modal.com/docs/guide/modal-1-0-migration for more information.
  self._context.run(self._callback, *self._args)
/Users/shreyas/opt/anaconda3/envs/caferacer/lib/python3.10/asyncio/events.py:80: DeprecationError: 2025-02-11: Looking up class methods using Function.from_name will be deprecated in a future version of Modal.
Use modal.Cls.from_name instead, e.g.

GroundedSam = modal.Cls.from_name("grounded-sam", "GroundedSam")
instance = GroundedSam(...)
instance.run.remote(...)

  self._context.run(self._callback, *self._args)


In [18]:
def analyze_scene(img, prompt=None) -> str:
    """Prompts Gemini 2.0 Flash for scene analysis (3D bounding boxes, orientation)."""
    if prompt is None:
        prompt = """Describe the scene from the top view. Focus on objects, positions, and spatial relationships. 
        Point to no more than 10 items in the image. Include following items in the analysis: Robot arm, container, and Lego bricks.
        The answer should follow the json format: [{"point": <point>, "label": <label1>, "description": <description>}, ...]. The points are in [y, x] format normalized to 0-1000. One element a line.
        """
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[img, prompt],
        config=types.GenerateContentConfig(temperature=0.1),
    )
    return response.text
    
def analyze_multi_view(img_0, img_1, prompt=None, context=None) -> str:
    if prompt is None:
        prompt = """Given the top view image and analysis, get additional details from the front view image. 
        Provide a combined analysis of the top and front views. Focus on heights, occlusions, and depth relationships.
        The answer should follow the json format: [{"point": <point>, "label": <label1>, "description": <description>}, ...]. The points are in [y, x] format normalized to 0-1000. One element a line.
        """
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[img_0, prompt, context, img_1],
        config=types.GenerateContentConfig(temperature=0.1),
    )
    return response.text

def get_summary(analysis_data: List[Dict], prompt_template: str) -> str:
    """Prompts Gemini (text model) for a summary of the analysis."""
    # Flatten the analysis data into a single string (for text-based Gemini)
    combined_analysis = "\n".join([str(episode) for episode in analysis_data])
    try:
        response = client.models.generate_content(
            model=PRO_MODEL_ID, # Use this or something similar for text generation
            contents=[prompt_template.format(combined_analysis)],
            config = types.GenerateContentConfig(temperature=0.4, top_p=0.8)
        )
    except ServerError:
        response = client.models.generate_content(
            model=MODEL_ID, # Use this or something similar for text generation
            contents=[prompt_template.format(combined_analysis)],
            config = types.GenerateContentConfig(temperature=0.4, top_p=0.8)
        )

    return response.text

def parse_json(response_text):
    # Parsing out the markdown fencing
    # Find JSON content within triple backticks if present
    try:
        # Remove markdown code block formatting, if present
        response_text = response_text.strip().replace("```json", "").replace("```", "").strip()
        data = json.loads(response_text)
        return data
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Raw response: {response_text}")
        return []


In [7]:
# --- Initial Training Data Analysis ---

def analyze_training_data(repo_id: str, episodes: List[int] = None) -> List[Dict]:
    """Analyzes the first frames of each training episode."""
    dataset = LeRobotDataset(repo_id, episodes=episodes) #Load initial dataset
    num_episodes = dataset.num_episodes
    all_episodes_analysis = []

    for ep_idx in tqdm(range(num_episodes), desc="Analyzing Training Episodes"):
        from_idx = dataset.episode_data_index["from"][ep_idx].item()
        to_idx = dataset.episode_data_index["to"][ep_idx].item()
        
        top_frame = dataset[from_idx]["observation.images.phone"]
        front_frame = dataset[from_idx]["observation.images.laptop"]

        top_image = tensor_to_pil(top_frame)
        front_image = tensor_to_pil(front_frame)

        top_analysis_raw = analyze_scene(top_image)
        analysis = analyze_multi_view(top_image, front_image, context=top_analysis_raw)
        
        analysis = parse_json(analysis)

        all_episodes_analysis.append({"episode": ep_idx, "analysis": analysis})
        #all_episodes_analysis.append(
        #    {"episode": ep_idx, "top": top_analysis, "front": front_analysis}
        #)

    return all_episodes_analysis

In [8]:
def summarize_training_data(analysis_data: List[Dict]) -> str:
    """Generates a summary of the training data analysis."""
    summary_prompt_template = """
    Analyze the following dataset of robot pick-and-place episodes with lego bricks and containers.
    Summarize patterns, biases, and limitations in the training data.
        
    Focus on:
    1. Object distributions (positions, orientations, colors)
    2. Success patterns
    3. Potential biases (e.g., container always on left)
    4. Limitations in the dataset diversity

    {0}

    Provide the following:
    1.  **Training Statistics:**  Quantify key aspects, such as the percentage of episodes
        where the container is on the left vs. right side of the robot.
    2.  **Potential Biases:** Identify any biases in the data (e.g., only one container color).
    3.  **Suggestions for Data Augmentation:**  Suggest specific augmentations to address
        biases and improve generalization. Categorize suggestions like this:
        - flip_frame:  (If the data is biased towards one side)
        - change_color: object-container:color-yellow  (If container color variety is needed)
        - inpaint_distraction: List_of_distraction_objects:[object1, object2] (If distractions should be removed)
    """
    return get_summary(analysis_data, summary_prompt_template)

In [22]:
def evaluate_episode_success(
        #first_frame: Union[Image.Image, torch.Tensor],
        last_frame: Union[Image.Image, torch.Tensor]
    ) -> Dict:
        """
        Evaluate whether an episode was successful by comparing first and last frames.
        
        Args:
            first_frame: First frame of the episode
            last_frame: Last frame of the episode
            
        Returns:
            Dictionary with success evaluation and reasoning
        """
        prompt = """
        Analyze the final state of this robotics task (picking and placing Lego bricks).
        1.  Is the Lego brick inside the container? (Answer with YES or NO).
        2.  If NO, provide a concise reason for the failure, relating it to the scene.
        Output should be JSON in format: {"success": bool, "failure_reason": str}
        """
        
        # Convert tensors to PIL if needed
        #if isinstance(first_frame, torch.Tensor):
        #    first_frame = tensor_to_pil(first_frame)
        if isinstance(last_frame, torch.Tensor):
            last_frame = tensor_to_pil(last_frame)
            
        try:
            response = client.models.generate_content(
                model=MODEL_ID,
                contents=[last_frame, prompt],
                config=types.GenerateContentConfig(temperature=0.2)
            )
            
            
            return response.text
            
        
        except Exception as e:
            print(f"Error evaluating episode success: {e}")
            return {"error": str(e), "success": False, "confidence": 0}

In [19]:
def analyze_eval_data(repo_id: str, episodes: List[int] = None, GPU_POOR=True) -> List[Dict]:
    """Analyzes the first frames of each training episode."""
    dataset = LeRobotDataset(repo_id, episodes=episodes) #Load initial dataset
    num_episodes = dataset.num_episodes
    all_episodes_analysis = []

    for ep_idx in tqdm(range(num_episodes), desc="Analyzing Eval Episodes"):
        from_idx = dataset.episode_data_index["from"][ep_idx].item()
        to_idx = dataset.episode_data_index["to"][ep_idx].item()
        
        top_first = dataset[from_idx]["observation.images.phone"]
        front_first = dataset[from_idx]["observation.images.laptop"]
        top_image = tensor_to_pil(top_first)
        front_image = tensor_to_pil(front_first)

        top_analysis_raw = analyze_scene(top_image)
        scene_analysis_raw = analyze_multi_view(top_image, front_image, context=top_analysis_raw)
        scene_analysis = parse_json(scene_analysis_raw)
        
        # TODO: Add front view to the eval analysis
        top_last = dataset[to_idx-1]["observation.images.phone"]
        eval_analysis_raw = evaluate_episode_success(top_last)
        eval_analysis = parse_json(eval_analysis_raw)

        all_episodes_analysis.append(
            {"episode": ep_idx, "episode_eval": eval_analysis, "scene_analysis": scene_analysis}
        )
        
    return all_episodes_analysis

In [29]:
def get_augmentations(
    evaluation_results: List[Dict], training_summary: str
) -> str:
    """Summarizes evaluation results and suggests augmentations."""

    # Combine evaluation results into a string format
    combined_eval_results = ""
    for result in evaluation_results:
        combined_eval_results += f"Episode {result['episode']}: \n"
        combined_eval_results += f"Scene Description: {result['scene_analysis']} \n"
        combined_eval_results += f"Episode Evaluation: {result['episode_eval']} \n"

    prompt = """
    Based on the training data summary and failed evaluation episodes,
    suggest data augmentations to improve the robot's policy.
        
    Consider the following types of augmentations:
    1. flip_frame: If the training data shows position bias (e.g., container always on left)
    2. change_color: If the training data shows color bias (e.g., container always blue)
    3. inpaint_distraction: If distractions in scenes affect performance. Distractions are objects that are not the target object.
    Format the response as JSON with the following structure:
        {
            "recommended_augmentations": {
                    'flip_frame': True, 
                    'change_color': {
                        'object': 'blue container', 
                        'target_color': 'blue'
                    },
                    'inpaint_distraction': {
                        'distraction_objects': ['object1', 'object2']
                    }
            },
            "reasoning": "overall explanation of recommendations",
            "expected_improvements": "how these changes should help"
        }
    """
    prompt += f"Here is a summary of the training data used for the initial policy: {training_summary}"
    prompt += f"Here are the evaluation results: {combined_eval_results}"
    response = client.models.generate_content(
        model=PRO_MODEL_ID,
        contents=[prompt],
        config=types.GenerateContentConfig(temperature=0.2)
    )
    
    return response.text
    

In [12]:
train_results = analyze_training_data(train_repo, episodes=list(range(8)))

Analyzing Training Episodes:   0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
train_summary_1 = summarize_training_data(train_results)

In [33]:
print(train_summary_1)

Here's an analysis of the provided robot pick-and-place dataset, focusing on the requested aspects:

**1. Training Statistics:**

*   **Container Position:**  In all episodes (100%), the container is described as being on the *right* side of the robot arm or image.  There are no instances of the container being on the left.
*   **Lego Brick Position:** In all episodes (100%), the Lego bricks are consistently described as being on the *left* side of the robot arm or image.
*   **Lego Brick Colors:** The Lego bricks are primarily described as green, yellow, gray, and black.
*   **Container Color:** The container is consistently described as blue.
*   **Robot Arm Color:** The robot arm is consistently described as red, often with black components.

**2. Potential Biases:**

*   **Strong Positional Bias:** The most significant bias is the consistent placement of the container on the right and the Lego bricks on the left.  A robot trained solely on this data would likely fail to generalize 

In [23]:
eval_results = analyze_eval_data(eval_repo, episodes=list(range(1,10)))

Analyzing Eval Episodes:   0%|          | 0/9 [00:00<?, ?it/s]

In [35]:
for episode in eval_results:
    print(f"Episode {episode['episode']}: \n")
    print(f"Scene Description: {episode['scene_analysis']} \n")
    print(f"Episode Evaluation: {episode['episode_eval']} \n")


Episode 0: 

Scene Description: [{'point': [591, 500], 'label': 'Robot arm', 'description': 'A red robot arm with black accents, positioned in the center of the image.'}, {'point': [610, 757], 'label': 'container', 'description': 'A blue square container located on the right side of the image.'}, {'point': [727, 301], 'label': 'Lego bricks', 'description': 'A small stack of green and gray Lego bricks, situated on the left side of the image.'}, {'point': [464, 375], 'label': 'Lamp', 'description': 'A black lamp is located on the left side of the image.'}, {'point': [360, 537], 'label': 'Wire', 'description': 'A white wire is connected to the robot arm.'}, {'point': [491, 478], 'label': 'Servo', 'description': 'A black servo motor is attached to the robot arm.'}, {'point': [600, 662], 'label': 'Shadow', 'description': 'The shadow of the container.'}, {'point': [631, 537], 'label': 'Shadow', 'description': 'The shadow of the robot arm.'}, {'point': [760, 287], 'label': 'Shadow', 'descript

In [30]:
augmentation_suggestions = get_augmentations(eval_results, train_summary_1)

In [32]:
print(augmentation_suggestions)

```json
{
    "recommended_augmentations": {
        "flip_frame": true,
        "change_color": {
            "object": "blue container",
            "target_color": ["yellow", "green", "red", "gray"]
        },
        "inpaint_distraction": {
            "distraction_objects": ["wire", "shadow", "lamp"]
        }
    },
    "reasoning": "The training data exhibits strong positional and color biases.  The container is always blue and on the right, and the Lego bricks are usually on the left.  The evaluation episodes, while limited, show failures when the container is yellow or when the relative positions of the container and bricks are switched.  Inpainting distractions like the wire and shadows, and in some cases the lamp, will force the model to focus on the key objects (container and bricks).  The successful episodes (0, 7, and 8) all have the blue container on the *right*, reinforcing the positional bias.  The failed episodes often involve a yellow container (3, 4, 5) or a switch

In [9]:
DATA_AUG = {'flip_frame': True, 'change_color': {'object': 'blue container', 'target_color': 'blue'}}

In [18]:
#dataset0 = LeRobotDataset(test_repo, episodes=[0])

In [None]:
#dataset = create_dataset(test_repo, dataset0, gsam, DATA_AUG=DATA_AUG)
#dataset.push_to_hub()

In [10]:
#inpaint_flux = modal.Function.lookup('inpaint-flux', 'inpaint_flux', environment_name='prod')
#gsam = modal.Function.lookup("grounded-sam","GroundedSam.run", environment_name='prod')