In [None]:
import json
import pandas as pd
from datetime import timedelta

In [None]:
# Load the JSON document that the rest of the notebook analyses.
# The encoding is pinned to UTF-8 (the standard JSON text encoding) instead of
# relying on open()'s locale-dependent default, which is not UTF-8 on every
# platform and can mis-decode non-ASCII characters.
with open("data/data.json", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

In [None]:
# Flatten the JSON records into a table, expose the nested payload fields
# under short column names, and order the rows by frame number.
df = (
    pd.json_normalize(json_data)
    .assign(
        frame=lambda flat: flat['payload.frame'],
        video_id=lambda flat: flat['payload.video_id'],
    )
    .sort_values(by='frame')
    .reset_index(drop=True)
)

# Function to find sequences with the same video_id
def find_sequences(df, min_length=2):
    """Split ``df`` into runs of consecutive rows sharing one ``video_id``.

    Rows are visited in the order ``df.iterrows()`` yields them. A run ends
    as soon as a row's ``video_id`` differs from the previous row's.

    Args:
        df: DataFrame with a ``video_id`` column.
        min_length: Runs with fewer rows than this are discarded.

    Returns:
        A list of runs; each run is a list of the row Series produced by
        ``iterrows()``.
    """
    kept_runs = []
    run = []

    for _, record in df.iterrows():
        # A differing video_id closes the run in progress before the row is
        # recorded as the start of the next one.
        if run and not (record['video_id'] == run[-1]['video_id']):
            if len(run) >= min_length:
                kept_runs.append(run)
            run = []
        run.append(record)

    # The run still open when the rows are exhausted is subject to the same
    # length threshold.
    if len(run) >= min_length:
        kept_runs.append(run)

    return kept_runs

# Keep only runs of at least this many consecutive rows
min_length = 3000
sequences = find_sequences(df, min_length=min_length)

# Report every retained run: one header line, then one line per row
for seq in sequences:
    run_video_id = seq[0]['video_id']
    print(f"Sequence with video_id {run_video_id}:")
    for item in seq:
        row_frame, row_id, row_score = item['frame'], item['id'], item['score']
        print(f"  Frame: {row_frame}, ID: {row_id}, Score: {row_score}")

In [None]:

# Flatten the JSON records and expose the nested payload fields as columns
df = pd.json_normalize(json_data)
df['frame'] = df['payload.frame']
df['video_id'] = df['payload.video_id']

# Order rows by frame number
df = df.sort_values(by='frame').reset_index(drop=True)

# A row opens a new run whenever its video_id differs from the previous
# row's; the first row has no predecessor (shift() fills it with a missing
# value), so it is flagged as a run start too.
df['prev_video_id'] = df['video_id'].shift(1)
df['new_sequence'] = df['video_id'] != df['prev_video_id']

# The running count of run starts labels every row with the run it belongs to
df['sequence_id'] = df['new_sequence'].cumsum()

# Number of rows in each run, broadcast back onto the rows of that run
df['sequence_length'] = df.groupby('sequence_id')['sequence_id'].transform('size')

# Keep only rows that belong to sufficiently long runs
min_length = 100
filtered_df = df[df['sequence_length'] >= min_length]

# Collect each surviving run as a list of row dicts, indexed by sequence_id.
# The groups are iterated explicitly instead of using groupby().apply():
# apply() returns an empty DataFrame (not a Series) when no run survives the
# filter, which made the reporting loop below fail on seq[0], and it emits a
# deprecation warning on recent pandas because the grouping column is passed
# to the applied function.
sequences = pd.Series(
    {
        seq_id: group.to_dict(orient='records')
        for seq_id, group in filtered_df.groupby('sequence_id')
    },
    dtype=object,
).rename_axis('sequence_id')

# Print the sequences (nothing is printed when no run met the threshold)
for seq_id, seq in sequences.items():
    print(f"Sequence ID {seq_id} with video_id {seq[0]['video_id']}:")
    for item in seq:
        print(f"  Frame: {item['frame']}, ID: {item['id']}, Score: {item['score']}")

In [None]:
# Flatten the JSON records into a table, expose the nested payload fields
# under short column names, and order the rows by frame number.
df = (
    pd.json_normalize(json_data)
    .assign(
        frame=lambda flat: flat['payload.frame'],
        video_id=lambda flat: flat['payload.video_id'],
    )
    .sort_values(by='frame')
    .reset_index(drop=True)
)

# Smooth filter function to ignore up to n frames between sequences
def smooth_sequences(df, max_skip=1, min_length=2):
    """Group rows into runs of one ``video_id``, bridging short interruptions.

    Rows are scanned in the order ``df.iterrows()`` yields them. The first row
    of a run fixes the run's ``video_id``. Up to ``max_skip`` consecutive rows
    with a different ``video_id`` are tolerated inside a run; those rows are
    left out of the returned run. Once an interruption grows beyond
    ``max_skip`` rows the run is closed and scanning restarts at the first
    row of that interruption, so a run of another ``video_id`` that follows
    directly is detected rather than absorbed and dropped.

    Every row is compared against the run's own ``video_id`` (not against the
    previously appended row), so the row that follows an interruption is
    recognised as a continuation of the run.

    Args:
        df: DataFrame with a ``video_id`` column, already in scan order.
        max_skip: Maximum number of consecutive non-matching rows bridged
            inside a run. ``0`` yields strict runs of identical ``video_id``.
        min_length: Minimum number of matching rows for a run to be kept.

    Returns:
        A list of runs. Each run is a list of row dicts (``row.to_dict()``)
        that all carry the run's ``video_id``. An empty ``df`` yields ``[]``.
    """
    rows = [row.to_dict() for _, row in df.iterrows()]
    sequences = []
    start = 0

    while start < len(rows):
        anchor_id = rows[start]['video_id']
        matched = [rows[start]]
        gap_start = None  # index of the first row of the interruption in progress
        pos = start + 1

        while pos < len(rows):
            if rows[pos]['video_id'] == anchor_id:
                matched.append(rows[pos])
                gap_start = None
            else:
                if gap_start is None:
                    gap_start = pos
                # Interruption is now longer than the allowed budget: stop here.
                if pos - gap_start + 1 > max_skip:
                    break
            pos += 1

        if len(matched) >= min_length:
            sequences.append(matched)

        # Resume at the first row that was not part of this run. gap_start is
        # always greater than start, so the scan always makes progress.
        start = pos if gap_start is None else gap_start

    return sequences

# Parameters handed to smooth_sequences: skip budget and minimum run length
max_skip, min_length = 10, 100
sequences = smooth_sequences(df, max_skip=max_skip, min_length=min_length)

# Calculate mean score for each sequence
def calculate_mean_score(sequence):
    """Return the arithmetic mean of the ``'score'`` entries in ``sequence``.

    ``sequence`` is a sized collection of mappings that each have a
    ``'score'`` key. An empty ``sequence`` raises ``ZeroDivisionError``.
    """
    total = 0
    for entry in sequence:
        total = total + entry['score']
    return total / len(sequence)

# Report each run: a header with its video_id and mean score, then its rows
for seq in sequences:
    mean_score = calculate_mean_score(seq)
    run_video_id = seq[0]['video_id']
    print(f"Sequence with video_id {run_video_id}, Mean Score: {mean_score:.6f}:")
    for item in seq:
        row_frame, row_id, row_score = item['frame'], item['id'], item['score']
        print(f"  Frame: {row_frame}, ID: {row_id}, Score: {row_score}")

In [None]:
# Number of runs currently held in `sequences` (shown as the cell output)
len(sequences)

In [None]:
# Flatten the JSON records, record each row's original position as
# query_video_frame, expose the nested payload fields under short names, and
# order the rows by query_video_frame (i.e. keep the original record order).
df = (
    pd.json_normalize(json_data)
    .assign(
        query_video_frame=lambda flat: list(range(len(flat))),
        frame=lambda flat: flat['payload.frame'],
        video_id=lambda flat: flat['payload.video_id'],
    )
    .sort_values(by='query_video_frame')
    .reset_index(drop=True)
)

# Smooth filter function to ignore up to n frames between sequences
def smooth_sequences(df, max_skip=1, min_length=2):
    """Group rows into runs of one ``video_id``, bridging short interruptions.

    Rows are scanned in the order ``df.iterrows()`` yields them. The first row
    of a run fixes the run's ``video_id``. Up to ``max_skip`` consecutive rows
    with a different ``video_id`` are tolerated inside a run; those rows are
    left out of the returned run. Once an interruption grows beyond
    ``max_skip`` rows the run is closed and scanning restarts at the first
    row of that interruption, so a run of another ``video_id`` that follows
    directly is detected rather than absorbed and dropped.

    Every row is compared against the run's own ``video_id`` (not against the
    previously appended row), so the row that follows an interruption is
    recognised as a continuation of the run.

    Args:
        df: DataFrame with a ``video_id`` column, already in scan order.
        max_skip: Maximum number of consecutive non-matching rows bridged
            inside a run. ``0`` yields strict runs of identical ``video_id``.
        min_length: Minimum number of matching rows for a run to be kept.

    Returns:
        A list of runs. Each run is a list of row dicts (``row.to_dict()``)
        that all carry the run's ``video_id``. An empty ``df`` yields ``[]``.
    """
    rows = [row.to_dict() for _, row in df.iterrows()]
    sequences = []
    start = 0

    while start < len(rows):
        anchor_id = rows[start]['video_id']
        matched = [rows[start]]
        gap_start = None  # index of the first row of the interruption in progress
        pos = start + 1

        while pos < len(rows):
            if rows[pos]['video_id'] == anchor_id:
                matched.append(rows[pos])
                gap_start = None
            else:
                if gap_start is None:
                    gap_start = pos
                # Interruption is now longer than the allowed budget: stop here.
                if pos - gap_start + 1 > max_skip:
                    break
            pos += 1

        if len(matched) >= min_length:
            sequences.append(matched)

        # Resume at the first row that was not part of this run. gap_start is
        # always greater than start, so the scan always makes progress.
        start = pos if gap_start is None else gap_start

    return sequences

# Parameters handed to smooth_sequences: skip budget and minimum run length
max_skip, min_length = 10, 100
sequences = smooth_sequences(df, max_skip=max_skip, min_length=min_length)

# Calculate mean score for each sequence
def calculate_mean_score(sequence):
    """Return the arithmetic mean of the ``'score'`` entries in ``sequence``.

    ``sequence`` is a sized collection of mappings that each have a
    ``'score'`` key. An empty ``sequence`` raises ``ZeroDivisionError``.
    """
    total = 0
    for entry in sequence:
        total = total + entry['score']
    return total / len(sequence)

# Frame rate used by frame_to_time to turn frame numbers into time offsets.
# NOTE(review): hard-coded; confirm it matches the rate at which the frames
# in the data were sampled.
frame_rate = 10  # frames per second

# Function to convert frame number to time
def frame_to_time(frame, frame_rate):
    """Convert a frame number into a ``timedelta`` offset.

    Args:
        frame: Frame number.
        frame_rate: Frames per second; the frame number is divided by it.

    Returns:
        ``timedelta`` of ``frame / frame_rate`` seconds.
    """
    elapsed_seconds = frame / frame_rate
    return timedelta(seconds=elapsed_seconds)

# Report each run with its mean score, the time span it covers (derived from
# the first and last row's frame), and one detail line per row
for seq in sequences:
    mean_score = calculate_mean_score(seq)
    first_row, last_row = seq[0], seq[-1]
    start_frame, end_frame = first_row['frame'], last_row['frame']
    start_time = frame_to_time(start_frame, frame_rate)
    end_time = frame_to_time(end_frame, frame_rate)

    print(f"Sequence with video_id {first_row['video_id']}, Mean Score: {mean_score:.6f}:")
    print(f"  Target Time: {start_time} - {end_time}")

    for item in seq:
        row_frame = item['frame']
        item_start_time = frame_to_time(row_frame, frame_rate)
        print(f"    Frame: {row_frame}, Initial frame: {item['query_video_frame']}, Score: {item['score']}, Time: {item_start_time}")

In [None]:
# Mean score of the first run in `sequences` (shown as the cell output);
# raises IndexError when `sequences` is an empty list
calculate_mean_score(sequences[0])

In [None]:
# Pick the first run as the example from which the match metadata is built
seq = sequences[0]

In [None]:
from pydantic import BaseModel, Field
class MatchingData(BaseModel):
    """Summary of one matched segment: its span in the query and in the matched video.

    Frame fields hold frame numbers and time fields hold ``timedelta`` offsets.
    In this notebook the query fields are filled from ``query_video_frame``,
    the match fields from ``frame``, and the times via ``frame_to_time``.
    """
    # Span of the segment within the query video
    query_start_frame: int
    query_end_frame: int
    query_start_time: timedelta
    query_end_time: timedelta

    # Identifier of the matched video and the span of the segment within it
    match_video_id: str
    match_start_frame: int
    match_end_frame: int
    match_start_time: timedelta
    match_end_time: timedelta

    # Score of the match; the notebook passes the mean score of the run
    similarity_score: float

In [None]:
# Aggregate score of the selected run
mean_score = calculate_mean_score(seq)

# The run's first and last rows delimit the segment on both sides
first_row, last_row = seq[0], seq[-1]

# Span in the query video (row position recorded as query_video_frame)
query_start_frame, query_end_frame = first_row['query_video_frame'], last_row['query_video_frame']
query_start_time = frame_to_time(query_start_frame, frame_rate)
query_end_time = frame_to_time(query_end_frame, frame_rate)

# Span in the matched video (payload frame number)
match_start_frame, match_end_frame = first_row['frame'], last_row['frame']
match_start_time = frame_to_time(match_start_frame, frame_rate)
match_end_time = frame_to_time(match_end_frame, frame_rate)



In [None]:
# Bundle the derived values into the validated MatchingData model
match_metadata = MatchingData(
    # Query-side span
    query_start_frame=query_start_frame,
    query_end_frame=query_end_frame,
    query_start_time=query_start_time,
    query_end_time=query_end_time,
    # Match-side identity and span
    match_video_id=seq[0]["video_id"],
    match_start_frame=match_start_frame,
    match_end_frame=match_end_frame,
    match_start_time=match_start_time,
    match_end_time=match_end_time,
    # Mean score of the run
    similarity_score=mean_score,
)

In [None]:
# Serialize the match metadata to a JSON string (shown as the cell output).
# NOTE(review): `.json()` is deprecated in pydantic v2 in favour of
# `.model_dump_json()` — confirm which pydantic version is installed.
match_metadata.json()