In [1]:
import cv2
import spacy
import nltk
from googletrans import Translator
from collections import Counter
import torch
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn


In [2]:
# Load the object detection model
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Load labels for COCO dataset
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'TV', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

# Function to detect objects in a frame
def detect_objects(frame):
    transform = T.Compose([T.ToTensor()])
    frame_tensor = transform(frame)
    with torch.no_grad():
        predictions = model([frame_tensor])[0]

    detected_objects = []
    for idx, score in enumerate(predictions['scores']):
        if score > 0.8:  # Threshold for detection confidence
            label = COCO_INSTANCE_CATEGORY_NAMES[predictions['labels'][idx]]
            detected_objects.append(label)

    return detected_objects

# Function to analyze video content
def analyze_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    object_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Analyze every 30th frame
        if frame_count % 30 == 0:
            objects_in_frame = detect_objects(frame)
            object_list.extend(objects_in_frame)
        
        frame_count += 1

    cap.release()
    cv2.destroyAllWindows()
    
    return object_list




In [6]:
import spacy

# Load SpaCy model for NLP tasks
nlp = spacy.load("en_core_web_sm")


In [7]:
from collections import Counter
import spacy

# Load SpaCy model for NLP tasks
nlp = spacy.load("en_core_web_sm")

def generate_tags(object_list):
    # Count frequency of detected objects
    object_counter = Counter(object_list)
    
    # Basic tag generation based on object detection
    tags = list(object_counter.keys())
    
    # (Optional) Use NLP to enhance tags - finding synonyms, related words, etc.
    enhanced_tags = set(tags)
    for tag in tags:
        doc = nlp(tag)
        for token in doc:
            # Synonyms in SpaCy aren't directly available. You might use WordNet or another source for synonyms.
            # For this example, we're simply adding the lemma (base form of the word) to the tags.
            enhanced_tags.add(token.lemma_)
    
    return list(enhanced_tags)


In [8]:
# Initialize the Google Translator
translator = Translator()

def translate_tags(tags, languages):
    translated_tags = {}
    
    for language in languages:
        translated_tags[language] = [translator.translate(tag, dest=language).text for tag in tags]
    
    return translated_tags


In [3]:
import cv2
import spacy
from googletrans import Translator
import torch
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from collections import Counter

# Load SpaCy model for NLP tasks
nlp = spacy.load("en_core_web_sm")

# Load the object detection model
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# COCO Instance Category Names
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'TV', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

def detect_objects(frame):
    transform = T.Compose([T.ToTensor()])
    frame_tensor = transform(frame)
    with torch.no_grad():
        predictions = model([frame_tensor])[0]
    
    detected_objects = []
    labels = predictions['labels']
    scores = predictions['scores']
    
    for idx, score in enumerate(scores):
        if score > 0.5:  # Confidence threshold
            if idx < len(labels):
                label_idx = labels[idx].item()
                if label_idx < len(COCO_INSTANCE_CATEGORY_NAMES):
                    label = COCO_INSTANCE_CATEGORY_NAMES[label_idx]
                    detected_objects.append(label)
    
    return detected_objects

def analyze_video(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return []
    
    frame_count = 0
    object_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        if frame_count % 30 == 0:  # Analyze every 30th frame
            objects_in_frame = detect_objects(frame)
            object_list.extend(objects_in_frame)
        
        # Display the frame (for debugging purposes)
        cv2.imshow('Video', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        
        frame_count += 1

    cap.release()
    cv2.destroyAllWindows()
    
    return object_list

def generate_tags(object_list):
    object_counter = Counter(object_list)
    tags = list(object_counter.keys())
    
    enhanced_tags = set(tags)
    for tag in tags:
        doc = nlp(tag)
        for token in doc:
            enhanced_tags.add(token.lemma_)  # Add lemma (base form) of each word
    
    return list(enhanced_tags)

translator = Translator()

def translate_tags(tags, languages):
    translated_tags = {}
    for language in languages:
        translated_tags[language] = [translator.translate(tag, dest=language).text for tag in tags]
    
    return translated_tags

def main(video_path, languages=['es', 'fr', 'de']):
    # Step 1: Analyze video content
    objects = analyze_video(video_path)
    print(f"Detected objects: {objects}")
    
    # Step 2: Generate tags
    tags = generate_tags(objects)
    print(f"Generated tags: {tags}")
    
    # Step 3: Translate tags into multiple languages
    translated_tags = translate_tags(tags, languages)
    for lang, tags in translated_tags.items():
        print(f"Translated tags in {lang}: {tags}")

# Test the system with a sample video
video_path = 'ssvid.net - When boys are given some work  Raj Grover  shorts_v144P.mp4'  # Replace with your video path
main(video_path)


Detected objects: ['person', 'person', 'person', 'person', 'sink', 'person', 'person', 'sink', 'person', 'person', 'person', 'person']
Generated tags: ['person', 'sink']
Translated tags in es: ['persona', 'hundir']
Translated tags in fr: ['personne', 'couler']
Translated tags in de: ['Person', 'Waschbecken']
