In [3]:
!pip install torch torchvision opencv-python segment-anything
!pip install scikit-image
! pip install \
'git+https://github.com/facebookresearch/segment-anything.git'
! pip install -q roboflow supervision
! wget -q \
'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'

Collecting git+https://github.com/facebookresearch/segment-anything.git
  Cloning https://github.com/facebookresearch/segment-anything.git to /tmp/pip-req-build-_nz7_9fk
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/segment-anything.git /tmp/pip-req-build-_nz7_9fk
  Resolved https://github.com/facebookresearch/segment-anything.git to commit 6fdee8f2727f4506cfbbe553e23b895e27956588
  Preparing metadata (setup.py) ... [?25ldone
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
jupyterlab 4.2.3 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you 

In [4]:
import torch
from segment_anything import sam_model_registry,SamPredictor

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
MODEL_TYPE = "vit_h"

sam = sam_model_registry[MODEL_TYPE](checkpoint="/kaggle/working/sam_vit_h_4b8939.pth")
sam.to(device=DEVICE)
predictor = SamPredictor(sam)



In [5]:
! pip install youtube-transcript-api
from youtube_transcript_api import YouTubeTranscriptApi

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2


In [16]:
import os
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor
from youtube_transcript_api import YouTubeTranscriptApi

def video_to_frames(video_path, output_dir, frame_rate=0.7):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            cv2.imwrite(os.path.join(output_dir, f'frame_{frame_count:05d}.jpg'), frame)
        frame_count += 1
    cap.release()
    return fps

def select_background_points(image, num_points=4):
    """
    Select background points from the edges of the image.
    The points will be selected from corners or edges assuming they are likely to be background.
    """
    h, w, _ = image.shape
    points = np.array([
        [0, 0],  # top-left corner
        [0, w - 1],  # top-right corner
        [h - 1, 0],  # bottom-left corner
        [h - 1, w - 1]  # bottom-right corner
    ])
    
    if num_points > 4:
        # Add midpoints of edges as background points if more points are required
        points = np.vstack([points, 
                            [0, w // 2], 
                            [h // 2, 0], 
                            [h - 1, w // 2], 
                            [h // 2, w - 1]])
    
    return points

def compare_histograms(frame1, frame2, threshold=0.4):
    """Compares histograms of two frames and returns True if they differ significantly."""
    hist1 = cv2.calcHist([frame1], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist2 = cv2.calcHist([frame2], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist1 = cv2.normalize(hist1, hist1).flatten()
    hist2 = cv2.normalize(hist2, hist2).flatten()
    diff = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return diff < threshold

def detect_scene_changes(frame_dir, fps, threshold=0.15, hist_threshold=0.3):
    frames = sorted(os.listdir(frame_dir))
    scene_changes = []
    prev_mask = None
    prev_frame = None

    for i, frame_name in enumerate(frames):
        frame = cv2.imread(os.path.join(frame_dir, frame_name))
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictor.set_image(frame_rgb)
                # Select background points
        background_points = select_background_points(frame_rgb)
        point_labels = np.zeros(background_points.shape[0], dtype=int)  # Label points as background (0)
        
        # Obtain masks for the frame focusing on background
        masks, _, _ = predictor.predict(point_coords=background_points, 
                                        point_labels=point_labels, 
                                        multimask_output=False)
        # Compare masks with the previous frame using logical XOR
        mask_diff = 0
        if prev_mask is not None:
            mask_diff = np.logical_xor(masks[0], prev_mask).mean()
        
        # Compare histograms for color change detection
        hist_diff = False
        if prev_frame is not None:
            hist_diff = compare_histograms(prev_frame, frame, threshold=hist_threshold)
        
        if mask_diff > threshold or hist_diff:  # Scene change detected
            timestamp = int(frame_name.split('_')[1].split('.')[0]) / fps
            scene_changes.append(timestamp)
        
        prev_mask = masks[0]
        prev_frame = frame
    
    return scene_changes

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return transcript
    except Exception as e:
        print(f"Error retrieving transcript: {e}")
        return []

def group_transcripts_by_scenes(transcripts, scene_changes):
    grouped_transcripts = []
    scene_index = 0
    current_group = []

    for transcript in transcripts:
        start_time = transcript['start']
        if scene_index < len(scene_changes) and start_time > scene_changes[scene_index]:
            grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
            current_group = []
            scene_index += 1
        current_group.append(transcript)
    
    if current_group:
        grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
    
    return grouped_transcripts

# Path to the video file and directory to save frames
video_path = "/kaggle/input/videoin/How to Ace Your Group Interview _ Mock Job Interview _ Indeed Career Tips.mp4"
output_dir = "/kaggle/working/freames2"

# Extract frames from the video
fps = video_to_frames(video_path, output_dir, frame_rate=0.7)

# Initialize the SAM predictor
#model = sam_model_registry["vit_h"](checkpoint="/content/sam_vit_h_4b8939.pth")
#predictor = SamPredictor(model)

# Detect scene changes
scene_changes = detect_scene_changes(output_dir, fps, threshold=0.15, hist_threshold=0.3)
print("Scene changes detected at timestamps (in seconds):", scene_changes)

# Get YouTube transcript
video_id = "eLxA6hPaStw"
transcripts = get_transcript(video_id)

# Group transcripts by scene changes
grouped_transcripts = group_transcripts_by_scenes(transcripts, scene_changes)
for i, text in enumerate(grouped_transcripts):
    print(f"Scene {i + 1}: {text}\n")


Scene changes detected at timestamps (in seconds): [12.762749999999999, 48.21483333333333, 69.48608333333333, 82.24883333333332, 102.10199999999999, 126.20941666666666, 143.22641666666667, 161.6615, 163.07958333333332, 181.51466666666664, 182.93275, 184.35083333333333, 201.36783333333332, 225.47525, 232.56566666666666, 236.81991666666664, 238.23799999999997, 242.49224999999998, 252.4188333333333, 258.09116666666665, 296.37941666666666]
Scene 1: I hope I'm able to stand
out in this interview. What if I say the wrong thing? Wow, they both look
really professional. Am I dressed correctly? [MUSIC PLAYING] If a job you've applied to
has a lot of other applicants,

Scene 2: chances are you might find
yourself starting the interview process in a group setting. But don't be alarmed. Group interviews are a
great way for employers to get an initial read on
you and your work ethic. And above all, they want
to see how you communicate and how you work with a team. While they might seem
daunting at 

In [17]:
import os
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor
from youtube_transcript_api import YouTubeTranscriptApi

def video_to_frames(video_path, output_dir, frame_rate=0.7):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            cv2.imwrite(os.path.join(output_dir, f'frame_{frame_count:05d}.jpg'), frame)
        frame_count += 1
    cap.release()
    return fps

def select_background_points(image, num_points=4):
    """
    Select background points from the edges of the image.
    The points will be selected from corners or edges assuming they are likely to be background.
    """
    h, w, _ = image.shape
    points = np.array([
        [0, 0],  # top-left corner
        [0, w - 1],  # top-right corner
        [h - 1, 0],  # bottom-left corner
        [h - 1, w - 1]  # bottom-right corner
    ])
    
    if num_points > 4:
        # Add midpoints of edges as background points if more points are required
        points = np.vstack([points, 
                            [0, w // 2], 
                            [h // 2, 0], 
                            [h - 1, w // 2], 
                            [h // 2, w - 1]])
    
    return points

def compare_histograms(frame1, frame2, threshold=0.4):
    """Compares histograms of two frames and returns True if they differ significantly."""
    hist1 = cv2.calcHist([frame1], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist2 = cv2.calcHist([frame2], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist1 = cv2.normalize(hist1, hist1).flatten()
    hist2 = cv2.normalize(hist2, hist2).flatten()
    diff = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return diff < threshold

def detect_scene_changes(frame_dir, fps, threshold=0.15, hist_threshold=0.3):
    frames = sorted(os.listdir(frame_dir))
    scene_changes = []
    prev_mask = None
    prev_frame = None

    for i, frame_name in enumerate(frames):
        frame = cv2.imread(os.path.join(frame_dir, frame_name))
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictor.set_image(frame_rgb)
                # Select background points
        background_points = select_background_points(frame_rgb)
        point_labels = np.zeros(background_points.shape[0], dtype=int)  # Label points as background (0)
        
        # Obtain masks for the frame focusing on background
        masks, _, _ = predictor.predict(point_coords=background_points, 
                                        point_labels=point_labels, 
                                        multimask_output=False)
        # Compare masks with the previous frame using logical XOR
        mask_diff = 0
        if prev_mask is not None:
            mask_diff = np.logical_xor(masks[0], prev_mask).mean()
        
        # Compare histograms for color change detection
        hist_diff = False
        if prev_frame is not None:
            hist_diff = compare_histograms(prev_frame, frame, threshold=hist_threshold)
        
        if mask_diff > threshold or hist_diff:  # Scene change detected
            timestamp = int(frame_name.split('_')[1].split('.')[0]) / fps
            scene_changes.append(timestamp)
        
        prev_mask = masks[0]
        prev_frame = frame
    
    return scene_changes

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return transcript
    except Exception as e:
        print(f"Error retrieving transcript: {e}")
        return []

def group_transcripts_by_scenes(transcripts, scene_changes):
    grouped_transcripts = []
    scene_index = 0
    current_group = []

    for transcript in transcripts:
        start_time = transcript['start']
        if scene_index < len(scene_changes) and start_time > scene_changes[scene_index]:
            grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
            current_group = []
            scene_index += 1
        current_group.append(transcript)
    
    if current_group:
        grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
    
    return grouped_transcripts

# Path to the video file and directory to save frames
video_path = "/kaggle/input/videoin/How to Ace Your Group Interview _ Mock Job Interview _ Indeed Career Tips.mp4"
output_dir = "/kaggle/working/freames3"

# Extract frames from the video
fps = video_to_frames(video_path, output_dir, frame_rate=0.7)

# Initialize the SAM predictor
#model = sam_model_registry["vit_h"](checkpoint="/content/sam_vit_h_4b8939.pth")
#predictor = SamPredictor(model)

# Detect scene changes
scene_changes = detect_scene_changes(output_dir, fps, threshold=0.2, hist_threshold=0.5)
print("Scene changes detected at timestamps (in seconds):", scene_changes)

# Get YouTube transcript
video_id = "eLxA6hPaStw"
transcripts = get_transcript(video_id)

# Group transcripts by scene changes
grouped_transcripts = group_transcripts_by_scenes(transcripts, scene_changes)
for i, text in enumerate(grouped_transcripts):
    print(f"Scene {i + 1}: {text}\n")


Scene changes detected at timestamps (in seconds): [8.5085, 12.762749999999999, 48.21483333333333, 69.48608333333333, 82.24883333333332, 102.10199999999999, 109.19241666666666, 126.20941666666666, 143.22641666666667, 163.07958333333332, 181.51466666666664, 182.93275, 184.35083333333333, 191.44125, 201.36783333333332, 225.47525, 226.89333333333332, 229.72949999999997, 232.56566666666666, 242.49224999999998, 249.58266666666665, 252.4188333333333, 258.09116666666665, 296.37941666666666]
Scene 1: I hope I'm able to stand
out in this interview. What if I say the wrong thing? Wow, they both look
really professional. Am I dressed correctly? [MUSIC PLAYING]

Scene 2: If a job you've applied to
has a lot of other applicants,

Scene 3: chances are you might find
yourself starting the interview process in a group setting. But don't be alarmed. Group interviews are a
great way for employers to get an initial read on
you and your work ethic. And above all, they want
to see how you communicate and h

In [None]:
#This code saves the transcript directly to specified location as txt file

import os
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor
from youtube_transcript_api import YouTubeTranscriptApi

def video_to_frames(video_path, output_dir, frame_rate=0.7):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            cv2.imwrite(os.path.join(output_dir, f'frame_{frame_count:05d}.jpg'), frame)
        frame_count += 1
    cap.release()
    return fps

def select_background_points(image, num_points=4):
    h, w, _ = image.shape
    points = np.array([
        [0, 0],  # top-left corner
        [0, w - 1],  # top-right corner
        [h - 1, 0],  # bottom-left corner
        [h - 1, w - 1]  # bottom-right corner
    ])
    
    if num_points > 4:
        points = np.vstack([points, 
                            [0, w // 2], 
                            [h // 2, 0], 
                            [h - 1, w // 2], 
                            [h // 2, w - 1]])
    
    return points

def compare_histograms(frame1, frame2, threshold=0.4):
    hist1 = cv2.calcHist([frame1], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist2 = cv2.calcHist([frame2], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist1 = cv2.normalize(hist1, hist1).flatten()
    hist2 = cv2.normalize(hist2, hist2).flatten()
    diff = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return diff < threshold

def detect_scene_changes(frame_dir, fps, threshold=0.15, hist_threshold=0.3):
    frames = sorted(os.listdir(frame_dir))
    scene_changes = []
    prev_mask = None
    prev_frame = None

    for i, frame_name in enumerate(frames):
        frame = cv2.imread(os.path.join(frame_dir, frame_name))
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictor.set_image(frame_rgb)
        
        background_points = select_background_points(frame_rgb)
        point_labels = np.zeros(background_points.shape[0], dtype=int)  # Label points as background (0)
        
        masks, _, _ = predictor.predict(point_coords=background_points, 
                                        point_labels=point_labels, 
                                        multimask_output=False)
        
        mask_diff = 0
        if prev_mask is not None:
            mask_diff = np.logical_xor(masks[0], prev_mask).mean()
        
        hist_diff = False
        if prev_frame is not None:
            hist_diff = compare_histograms(prev_frame, frame, threshold=hist_threshold)
        
        if mask_diff > threshold or hist_diff:  # Scene change detected
            timestamp = int(frame_name.split('_')[1].split('.')[0]) / fps
            scene_changes.append(timestamp)
        
        prev_mask = masks[0]
        prev_frame = frame
    
    return scene_changes

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return transcript
    except Exception as e:
        print(f"Error retrieving transcript: {e}")
        return []

def group_transcripts_by_scenes(transcripts, scene_changes):
    grouped_transcripts = []
    scene_index = 0
    current_group = []

    for transcript in transcripts:
        start_time = transcript['start']
        if scene_index < len(scene_changes) and start_time > scene_changes[scene_index]:
            grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
            current_group = []
            scene_index += 1
        current_group.append(transcript)
    
    if current_group:
        grouped_transcripts.append(' '.join([t['text'] for t in current_group]))
    
    return grouped_transcripts

def save_transcripts_to_file(grouped_transcripts, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, text in enumerate(grouped_transcripts):
            f.write(f"Scene {i + 1}:\n{text}\n\n")
    print(f"Grouped transcripts saved to {output_file}")

# Path to the video file and directory to save frames
video_path = "/kaggle/input/videoin/How to Ace Your Group Interview _ Mock Job Interview _ Indeed Career Tips.mp4"
output_dir = "/kaggle/working/frames2"

# Extract frames from the video
fps = video_to_frames(video_path, output_dir, frame_rate=0.7)

# Initialize the SAM predictor
#model = sam_model_registry["vit_h"](checkpoint="/content/sam_vit_h_4b8939.pth")
#predictor = SamPredictor(model)

# Detect scene changes
scene_changes = detect_scene_changes(output_dir, fps, threshold=0.15, hist_threshold=0.3)
print("Scene changes detected at timestamps (in seconds):", scene_changes)

# Get YouTube transcript
video_id = "eLxA6hPaStw"
transcripts = get_transcript(video_id)

# Group transcripts by scene changes
grouped_transcripts = group_transcripts_by_scenes(transcripts, scene_changes)

# Save the grouped transcripts to a file
output_file = "/kaggle/working/grouped_transcripts.txt"
save_transcripts_to_file(grouped_transcripts, output_file)
