In [4]:
import os
# Switch the working directory to the project folder; the relative file names
# used by the later cells ('Assignment Data.xlsx', 'temp_frames', ...) are
# resolved against it.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs unchanged on a machine that has this exact folder -- consider making the
# project directory configurable.
os.chdir("/Users/vanshbansal/Desktop/FuelGrowth")
%pwd

'/Users/vanshbansal/Desktop/FuelGrowth'

In [5]:
import pandas as pd
import os
import cv2
import numpy as np
from urllib.request import urlretrieve
from imagehash import phash
from PIL import Image

# Load Excel data
df = pd.read_excel('Assignment Data.xlsx')

# Work on a small sample of the sheet. The seed makes a re-run pick the same
# rows, and the size is capped at the number of available rows because
# DataFrame.sample raises when asked for more rows than exist.
SAMPLE_SIZE = 20
SAMPLE_SEED = 42
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
print(df)

# Expose the URL column under the shorter name used by the rest of the cell.
df = df.rename(columns={'Video URL': 'url'})

# Directory to store the temporarily downloaded video
TEMP_DIR = "temp_frames"
os.makedirs(TEMP_DIR, exist_ok=True)

# Function to download video and capture first few frames
def get_video_frames(video_url, frame_count=5):
    """Download a video and return its leading frames resized to 200x200.

    The video is saved as ``temp_video.mp4`` inside ``TEMP_DIR``, opened with
    OpenCV, and deleted again before the function returns.

    Args:
        video_url: URL handed to ``urlretrieve``.
        frame_count: Maximum number of frames to read from the start of the
            video.

    Returns:
        A list with at most ``frame_count`` resized frames. The list is shorter
        (possibly empty) when the capture stops delivering frames early.

    Raises:
        Any exception raised by the download or by OpenCV is propagated; the
        capture handle and the temporary file are cleaned up first.
    """
    temp_video_path = os.path.join(TEMP_DIR, "temp_video.mp4")
    frames = []
    try:
        urlretrieve(video_url, temp_video_path)
        cap = cv2.VideoCapture(temp_video_path)
        try:
            for _ in range(frame_count):
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize for consistent comparison
                frames.append(cv2.resize(frame, (200, 200)))
        finally:
            # Release before deleting so the file is no longer held open.
            cap.release()
    finally:
        # A failed download may or may not have created the file.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
    return frames

# Function to calculate hash for each frame
def calculate_frame_hashes(frames):
    """Compute one ``phash`` value per frame, preserving the input order.

    Each frame is converted from BGR to RGB channel order and wrapped in a PIL
    image before it is hashed.

    Args:
        frames: Iterable of frames accepted by ``cv2.cvtColor``.

    Returns:
        A list holding the ``phash`` result for every frame; empty when
        ``frames`` is empty.
    """
    return [
        phash(Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)))
        for bgr_frame in frames
    ]


# Compare videos and retain unique ones
unique_rows = []  # Rows whose video shares no frame hash with an earlier kept video
seen_hashes = set()  # Every frame hash of the videos kept so far

for _, row in df.iterrows():
    url = row['url']
    try:
        frames = get_video_frames(url)
        if not frames:
            # With no frames there is nothing to compare, and the check below
            # would be vacuously true and keep the video as "unique". Report it
            # through the same handler as every other processing failure.
            raise ValueError("no frames could be read from the video")
        frame_hashes = calculate_frame_hashes(frames)

        # If all hashes are new, mark the video as unique
        if not any(frame_hash in seen_hashes for frame_hash in frame_hashes):
            unique_rows.append(row)  # Store the entire row
            seen_hashes.update(frame_hashes)
    except Exception as e:
        # Best effort: a video that cannot be processed is reported and left
        # out of the result instead of aborting the whole run.
        print(f"Error processing video {url}: {e}")

# Create a new DataFrame with unique rows. Passing the columns keeps the header
# in the output file even when no row was retained.
unique_df = pd.DataFrame(unique_rows, columns=df.columns)

# Save cleaned data to Excel, retaining all columns
unique_df.to_excel('cleaned_video_urls2.xlsx', index=False)

# Cleanup
for file in os.listdir(TEMP_DIR):
    os.remove(os.path.join(TEMP_DIR, file))
os.rmdir(TEMP_DIR)

# Report once, after both the save and the cleanup have finished.
print("Duplicate video removal complete. Cleaned data saved to 'cleaned_video_urls2.xlsx'.")


     Performance                                          Video URL
56      0.935800  https://fgimagestorage.blob.core.windows.net/f...
13      0.333000  https://fgimagestorage.blob.core.windows.net/f...
176     0.447996  https://fgimagestorage.blob.core.windows.net/f...
33      0.218900  https://fgimagestorage.blob.core.windows.net/f...
74      0.558200  https://fgimagestorage.blob.core.windows.net/f...
30      0.677700  https://fgimagestorage.blob.core.windows.net/f...
112     1.506244  https://fgimagestorage.blob.core.windows.net/f...
122     0.855486  https://fgimagestorage.blob.core.windows.net/f...
126     0.718521  https://fgimagestorage.blob.core.windows.net/f...
149     1.222953  https://fgimagestorage.blob.core.windows.net/f...
36      0.625000  https://fgimagestorage.blob.core.windows.net/f...
254     1.503368  https://fgimagestorage.blob.core.windows.net/f...
107     0.786823  https://fgimagestorage.blob.core.windows.net/f...
18      0.429000  https://fgimagestorage.blob.co

## Method 2: position-wise comparison of frame hashes

In [8]:
import pandas as pd
import os
import cv2
import numpy as np
from urllib.request import urlretrieve
from imagehash import phash
from PIL import Image

# Read the spreadsheet and expose the URL column under the shorter name 'url'
df = pd.read_excel('cleaned_video_urls.xlsx').rename(columns={'Video URL': 'url'})

# Scratch directory for the temporarily downloaded video
TEMP_DIR = "temp_frames"
os.makedirs(TEMP_DIR, exist_ok=True)

# Function to download video and capture first few frames
def get_video_frames(video_url, frame_count=5):
    """Download a video and return its leading frames resized to 200x200.

    The video is saved as ``temp_video.mp4`` inside ``TEMP_DIR``, opened with
    OpenCV, and deleted again before the function returns.

    Args:
        video_url: URL handed to ``urlretrieve``.
        frame_count: Maximum number of frames to read from the start of the
            video.

    Returns:
        A list with at most ``frame_count`` resized frames. The list is shorter
        (possibly empty) when the capture stops delivering frames early.

    Raises:
        Any exception raised by the download or by OpenCV is propagated; the
        capture handle and the temporary file are cleaned up first.
    """
    temp_video_path = os.path.join(TEMP_DIR, "temp_video.mp4")
    frames = []
    try:
        urlretrieve(video_url, temp_video_path)
        cap = cv2.VideoCapture(temp_video_path)
        try:
            for _ in range(frame_count):
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize for consistent comparison
                frames.append(cv2.resize(frame, (200, 200)))
        finally:
            # Release before deleting so the file is no longer held open.
            cap.release()
    finally:
        # A failed download may or may not have created the file.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
    return frames

# Function to calculate hash for each frame
def calculate_frame_hashes(frames):
    """Compute one ``phash`` value per frame, preserving the input order.

    Each frame is converted from BGR to RGB channel order and wrapped in a PIL
    image before it is hashed.

    Args:
        frames: Iterable of frames accepted by ``cv2.cvtColor``.

    Returns:
        A list holding the ``phash`` result for every frame; empty when
        ``frames`` is empty.
    """
    return [
        phash(Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)))
        for bgr_frame in frames
    ]


# Compare videos and retain unique ones
unique_rows = []  # To store rows corresponding to unique videos
unique_frame_hashes = []  # One list of leading-frame hashes per retained video

for _, row in df.iterrows():
    url = row['url']
    try:
        frames = get_video_frames(url)
        if not frames:
            # An empty hash list makes the zip() below empty, so all() would be
            # vacuously True: this video would match any stored one, and if it
            # were stored itself every later video would match it. Report it
            # through the same handler as every other processing failure.
            raise ValueError("no frames could be read from the video")
        frame_hashes = calculate_frame_hashes(frames)

        # A video is a duplicate when each of its hashes equals the hash at the
        # same position of an already retained video. zip() stops at the
        # shorter list, so only positions present in both lists are compared.
        is_duplicate = any(
            all(curr_hash == existing_hash for curr_hash, existing_hash in zip(frame_hashes, stored_hashes))
            for stored_hashes in unique_frame_hashes
        )

        if not is_duplicate:
            unique_rows.append(row)  # Store the entire row
            unique_frame_hashes.append(frame_hashes)  # Save the current video's hashes
    except Exception as e:
        # Best effort: a video that cannot be processed is reported and left
        # out of the result instead of aborting the whole run.
        print(f"Error processing video {url}: {e}")

# Create a new DataFrame with unique rows. Passing the columns keeps the header
# in the output file even when no row was retained.
unique_df = pd.DataFrame(unique_rows, columns=df.columns)

# Save cleaned data to Excel, retaining all columns
unique_df.to_excel('cleaned_video_urls2.xlsx', index=False)

# Cleanup
for file in os.listdir(TEMP_DIR):
    os.remove(os.path.join(TEMP_DIR, file))
os.rmdir(TEMP_DIR)

# Report once, naming the file that was actually written above.
print("Duplicate video removal complete. Cleaned data saved to 'cleaned_video_urls2.xlsx'.")


Duplicate video removal complete. Cleaned data saved to 'cleaned_video_urls.xlsx'.
Duplicate video removal complete. Cleaned data saved to 'cleaned_video_urls.xlsx'.
