In [None]:
!pip install pytesseract
!pip install ultralytics
!pip install google-cloud-vision



In [None]:
import cv2
import os
from google.cloud import vision
from google.cloud.vision_v1 import types
from ultralytics import YOLO
from google.colab.patches import cv2_imshow  # For displaying images in Colab

# -----------------------------
# NOTE: Secret credentials removed.
# Set your Google Vision credentials before running, e.g.:
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/path/to/your_credentials.json"
# -----------------------------

# Load the trained YOLO detector once at module level; process_video() reads
# this global `model` for every frame it analyses.
model = YOLO('/content/drive/MyDrive/Colab Notebooks/2024_CV_project/yolo_tag_aug_drink/weights/best.pt')  # Path to trained YOLO weights

# Directory where process_video() writes the selected crops.
# exist_ok=True makes re-running this cell safe when the folder already exists.
output_dir = '/content/drive/MyDrive/Colab Notebooks/2024_CV_project/yolo_tag_aug_drink/output_crops'
os.makedirs(output_dir, exist_ok=True)

# Google Vision OCR function
def google_vision_ocr(image):
    """Run Google Vision OCR on an image crop and return the detected text.

    Args:
        image: Image array accepted by ``cv2.imencode`` (the callers in this
            file pass crops cut out of video frames).

    Returns:
        The ``description`` of the first text annotation (the full detected
        text), or ``""`` when the API returns no annotations.

    Raises:
        ValueError: If the crop cannot be JPEG-encoded.
        RuntimeError: If the Vision API response carries an error message.
    """
    # Encode the crop in memory so nothing has to be written to disk first.
    # Do this before creating the client: a bad crop should fail fast without
    # touching the network.
    success, encoded_image = cv2.imencode('.jpg', image)
    if not success:
        raise ValueError("Failed to JPEG-encode image for Google Vision OCR")

    client = vision.ImageAnnotatorClient()
    # Use a distinct name instead of rebinding the `image` parameter to a
    # different type.
    vision_image = types.Image(content=encoded_image.tobytes())
    response = client.text_detection(image=vision_image)

    # Per-request failures are reported inside the response body; without this
    # check they would be indistinguishable from "no text found".
    if response.error.message:
        raise RuntimeError(f"Google Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        return texts[0].description  # First annotation holds the full detected text
    return ""

# Compute image sharpness using Laplacian variance
def calculate_sharpness(image):
    """Measure sharpness of an image crop using Laplacian variance.

    Args:
        image: Image array. A 3-D array is converted with
            ``cv2.COLOR_BGR2GRAY``; a 2-D array is treated as already
            single-channel and used as-is.

    Returns:
        Variance of the Laplacian of the grayscale image as a float (higher
        means more high-frequency detail). Returns ``0.0`` for ``None`` or an
        empty array, so a degenerate crop scores as maximally blurry instead
        of crashing inside OpenCV.
    """
    # cv2.cvtColor raises on empty input; treat "nothing to measure" as blur.
    if image is None or image.size == 0:
        return 0.0

    # Only colour images need conversion; cvtColor(BGR2GRAY) rejects 2-D input.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return float(cv2.Laplacian(gray, cv2.CV_64F).var())

# Crop a bounding box region from an image
def crop_box(image, box):
    """Crop ROI from image using (x1, y1, x2, y2) coordinates.

    Args:
        image: Array indexed as ``image[row, col, ...]``; its first two
            dimensions are taken as height and width.
        box: Four numbers ``(x1, y1, x2, y2)``. Each is truncated with
            ``int()`` and then clamped to the image bounds.

    Returns:
        The slice ``image[y1:y2, x1:x2]`` (a view, not a copy). It is empty
        when the clamped box has no area.
    """
    height, width = image.shape[:2]
    x1, y1, x2, y2 = map(int, box)

    # Clamp to the image: a negative index would otherwise wrap around to the
    # far edge in NumPy slicing and yield an empty or wrong region.
    x1 = min(max(x1, 0), width)
    x2 = min(max(x2, 0), width)
    y1 = min(max(y1, 0), height)
    y2 = min(max(y2, 0), height)

    return image[y1:y2, x1:x2]

# Select best crop based on YOLO confidence + sharpness
def process_detections(frame, detections, alpha=0.6, conf_threshold=0.88, sharpness_threshold=300):
    """
    Pick the best crop among detections by combining confidence and sharpness.

    Each detection that passes both filters is scored as
    ``alpha * conf + (1 - alpha) * normalised_sharpness``, where sharpness is
    divided by the largest sharpness among the surviving candidates. Without
    that normalisation the raw Laplacian variance (hundreds or more) would
    swamp the confidence term (at most 1) and make ``alpha`` meaningless.

    frame: image the boxes refer to; crops are taken from it with crop_box()
    detections: iterable of per-detection results; for each item,
        ``boxes.xyxy[0]`` and ``boxes.conf[0]`` are read
    alpha: weight for confidence vs sharpness (0.0~1.0)
    conf_threshold: minimum detection confidence
    sharpness_threshold: minimum sharpness to accept crop

    Returns the crop with the highest combined score, or None when no
    detection passes the filters. On ties the earliest detection wins.
    """
    # (confidence, sharpness, crop) for every detection that survives filtering
    candidates = []

    for detection in detections:
        box = detection.boxes.xyxy[0].tolist()
        conf = detection.boxes.conf[0].item()

        # Filter out low-confidence detections
        if conf < conf_threshold:
            continue

        cropped = crop_box(frame, box)

        # A degenerate box yields an empty crop, which OpenCV cannot convert
        # to grayscale; there is nothing to OCR in it anyway.
        if cropped.size == 0:
            continue

        # Filter out blurry crops
        sharpness = calculate_sharpness(cropped)
        if sharpness < sharpness_threshold:
            continue

        candidates.append((conf, sharpness, cropped))

    if not candidates:
        return None

    max_sharpness = max(sharpness for _, sharpness, _ in candidates)

    def combined_score(candidate):
        conf, sharpness, _ = candidate
        # Guard against division by zero when every candidate has zero variance
        # (only reachable with a non-positive sharpness_threshold).
        normalised = sharpness / max_sharpness if max_sharpness > 0 else 0.0
        return alpha * conf + (1 - alpha) * normalised

    # max() keeps the first of equal-scoring candidates, matching a strict ">" scan.
    return max(candidates, key=combined_score)[2]

# Run inference on a video and perform OCR on best crops
def process_video(video_path, alpha=0.5):
    """Find the best tag and nutrition crops in a video and OCR them.

    Frames are read in order and passed to the module-level YOLO ``model``.
    For each class the first frame that yields an acceptable crop (as decided
    by process_detections()) is used. Reading stops as soon as both crops
    have been found or the video ends.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        alpha: Confidence-vs-sharpness weight forwarded to
            process_detections().

    Returns:
        A dict of the form::

            {
                'tag':   {'product_info': str | None, 'save_path': str | None},
                'nutri': {'nutrition_info': str | None, 'save_path': str | None},
            }

        ``product_info`` is the tag OCR text capped at its first 300
        characters (None when no crop was found or OCR returned no text).
        ``nutrition_info`` is the unmodified OCR text. ``save_path`` is where
        the crop was written under the module-level ``output_dir``.
    """
    cap = cv2.VideoCapture(video_path)

    # Store best crops for each class
    best_tag_crop = None
    best_nutri_crop = None

    try:
        while cap.isOpened():
            # Both crops are only ever assigned once, so once both are set the
            # remaining frames cannot change the result; skip their inference.
            if best_tag_crop is not None and best_nutri_crop is not None:
                break

            ret, frame = cap.read()
            if not ret:
                break

            # Run YOLO inference
            frame_results = model(frame)

            for detection in frame_results:
                # The boolean mask on class id keeps only that class's boxes.
                # Process tag class (class 0)
                if best_tag_crop is None:
                    best_tag_crop = process_detections(
                        frame, detection[detection.boxes.cls == 0], alpha=alpha
                    )

                # Process nutri class (class 1)
                if best_nutri_crop is None:
                    best_nutri_crop = process_detections(
                        frame, detection[detection.boxes.cls == 1], alpha=alpha
                    )
    finally:
        # Release the capture even if inference raises part-way through.
        cap.release()

    has_tag = best_tag_crop is not None and best_tag_crop.size > 0
    has_nutri = best_nutri_crop is not None and best_nutri_crop.size > 0

    # Save best crops.
    # NOTE: the file names are fixed, so every run overwrites the previous output.
    best_tag_path = None
    best_nutri_path = None

    if has_tag:
        best_tag_path = os.path.join(output_dir, 'best_tag_hatban.jpg')
        cv2.imwrite(best_tag_path, best_tag_crop)

    if has_nutri:
        best_nutri_path = os.path.join(output_dir, 'best_nutri_hatban.jpg')
        cv2.imwrite(best_nutri_path, best_nutri_crop)

    # Run OCR on best crops
    tag_result = google_vision_ocr(best_tag_crop) if has_tag else None
    nutri_result = google_vision_ocr(best_nutri_crop) if has_nutri else None

    # Wrap results
    return {
        'tag': {
            # Slice rather than index: [300] would return a single character
            # and raise IndexError whenever the OCR text is shorter than that.
            'product_info': tag_result[:300] if tag_result else None,
            'save_path': best_tag_path
        },
        'nutri': {
            'nutrition_info': nutri_result,
            'save_path': best_nutri_path
        }
    }

# Video path (or set 0 for webcam)
# Alternative input, kept for quick switching between test clips:
# video_path = '/content/drive/MyDrive/Colab Notebooks/2024_CV_project/test data/20241203_225032178.mp4'  # Hatban sample
video_path = '/content/drive/MyDrive/Colab Notebooks/2024_CV_project/test data/20241203_224949750.mp4'  # Seoul Milk sample

# Run processing
final_results = process_video(video_path, alpha=0.5)  # alpha: confidence-vs-sharpness weight, as described in process_detections' docstring
print("Final results:", final_results)
