# Synchformer: Efficient Synchronization from Sparse Cues

<figure>
  <img src="https://github.com/v-iashin/Synchformer/raw/main/_repo_assets/main.png" width="700" />
</figure>

This notebook demonstrates a minimal working example of audio-visual synchronisation on a sample video with a sparse synchronisation signal.

[Project Page](https://www.robots.ox.ac.uk/~vgg/research/synchformer/) | [Code & Models](https://github.com/v-iashin/Synchformer)

If you are running on Google Colab, uncomment the lines in the following cell to clone the repository and install the dependencies.

In [1]:
# !git clone https://github.com/v-iashin/Synchformer.git
# !pip install pip==23  # run this first
# # NOTE: `av>=9.1.1` causes worse accuracy (see issue #11),
# # but installing `av<=10.0.0` versions on Google Colab is difficult (help is needed).
# !pip install omegaconf==2.0.6 av==10.0 einops timm==0.6.7
# %cd Synchformer

Cloning into 'Synchformer'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 250 (delta 8), reused 5 (delta 3), pack-reused 206 (from 1)[K
Receiving objects: 100% (250/250), 95.15 MiB | 9.92 MiB/s, done.
Resolving deltas: 100% (74/74), done.
Updating files: 100% (188/188), done.
Collecting pip==23
  Downloading pip-23.0-py3-none-any.whl.metadata (4.1 kB)
Downloading pip-23.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-23.0
Collecting omegaconf==2.0.6
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting av==10.0
  Downloading av-10.0.0-cp311-cp311-manylinux_2_

In [2]:
import subprocess
from pathlib import Path

import torch
import torchaudio
import torchvision
from omegaconf import OmegaConf

from dataset.dataset_utils import get_video_and_audio
from dataset.transforms import make_class_grid, quantize_offset
from utils.utils import check_if_file_exists_else_download, which_ffmpeg
from scripts.train_utils import get_model, get_transforms, prepare_inputs


def reencode_video(path, vfps=25, afps=16000, in_size=256):
    """Re-encode a video with ffmpeg and extract its audio track to a `.wav` file.

    The re-encoded video is written to
    `<cwd>/vis/<stem>_<vfps>fps_<in_size>side_<afps>hz.mp4`, and a mono 16-bit PCM
    `.wav` with the same stem is written next to it.

    Args:
        path: path to the input video (`str` or `Path`).
        vfps: target video frame rate.
        afps: target audio sampling rate.
        in_size: target size of the shorter video side.

    Returns:
        The path to the re-encoded `.mp4` as a `str`.

    Raises:
        AssertionError: if `which_ffmpeg()` returns an empty string.
    """
    ffmpeg_bin = which_ffmpeg()
    assert ffmpeg_bin != '', 'Is ffmpeg installed? Check if the conda environment is activated.'

    out_mp4 = Path.cwd() / 'vis' / f'{Path(path).stem}_{vfps}fps_{in_size}side_{afps}hz.mp4'
    out_mp4.parent.mkdir(exist_ok=True)
    # swap only the final suffix, so a '.mp4' elsewhere in the path is left untouched
    out_wav = out_mp4.with_suffix('.wav')

    # no info/error printing
    quiet_flags = ['-hide_banner', '-loglevel', 'panic']
    # 1) change fps, 2) resize: min(H,W)=in_size (vertical vids are supported), 3) make both sides even
    video_filter = (
        f"fps={vfps},"
        f"scale=iw*{in_size}/'min(iw,ih)':ih*{in_size}/'min(iw,ih)',"
        "crop='trunc(iw/2)'*2:'trunc(ih/2)'*2"
    )

    # The commands are argument lists (not a string that is split on whitespace),
    # so paths that contain spaces are passed to ffmpeg as a single argument.
    commands = (
        # re-encode the video and change the audio sampling rate
        [ffmpeg_bin, *quiet_flags, '-y', '-i', str(path),
         '-vf', video_filter, '-ar', str(afps), str(out_mp4)],
        # extract the audio track of the re-encoded video as mono 16-bit PCM
        [ffmpeg_bin, *quiet_flags, '-y', '-i', str(out_mp4),
         '-acodec', 'pcm_s16le', '-ac', '1', str(out_wav)],
    )
    for argv in commands:
        returncode = subprocess.call(argv)
        # ffmpeg output is silenced above, so report a failure here (without raising)
        if returncode != 0:
            print(f'WARNING: ffmpeg exited with code {returncode}: {" ".join(argv)}')
    return str(out_mp4)


def decode_single_video_prediction(off_logits, grid, item):
    """Print the ground-truth offset and the top predicted offset classes of one video.

    Args:
        off_logits: offset logits with a leading batch dimension of size 1.
        grid: offset class grid; indexed by a class index to get the offset that is printed.
        item: the item dict; `item['targets']['offset_label']` holds the ground-truth offset.

    Returns:
        The softmax probabilities over offset classes with the batch dimension removed.

    Raises:
        AssertionError: if the batch holds more than one item.
    """
    gt_offset = item['targets']['offset_label'].item()
    gt_class = quantize_offset(grid, gt_offset)[-1].item()
    print('Ground Truth offset (sec):', f'{gt_offset:.2f} ({gt_class})')
    print('Prediction Results:')

    batched_probs = torch.softmax(off_logits, dim=-1)
    # show at most five classes
    num_shown = min(batched_probs.shape[-1], 5)
    best_logits, best_classes = torch.topk(off_logits, num_shown)

    # only a batch with a single item is supported
    assert len(best_logits) == 1, 'batch is larger than 1'
    logits, off_probs = off_logits[0], batched_probs[0]

    for cls_idx in best_classes[0]:
        line = f'p={off_probs[cls_idx]:.4f} ({logits[cls_idx]:.4f}), "{grid[cls_idx]:.2f}" ({cls_idx})'
        print(line)
    return off_probs


def patch_config(cfg):
    """Adapt a loaded experiment config for inference; edits `cfg` in place and returns it.

    * Clears the checkpoint paths of the audio and visual feature extractors
      (the feature extractor checkpoints are already in the model checkpoint).
    * Rewrites the transformer target, because old checkpoints use a different module name.
    """
    model_params = cfg.model.params

    # the FE ckpts are already in the model ckpt
    for extractor_cfg in (model_params.afeat_extractor, model_params.vfeat_extractor):
        extractor_cfg.params.ckpt_path = None

    # old checkpoints have different names
    transformer_cfg = model_params.transformer
    transformer_cfg.target = transformer_cfg.target.replace('.modules.feature_selector.', '.sync_model.')
    return cfg


In [3]:
# target video frame rate; a video with a different rate is re-encoded before inference
vfps = 25
# target audio sampling rate; a video with a different rate is re-encoded before inference
afps = 16000
# target size of the shorter video side, i.e. min(H, W)
in_size = 256
# experiment name; used to build the paths to the config and the checkpoint in ./logs/sync_models/
exp_name = '24-01-04T16-39-21'

In [4]:
# use the GPU if one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load the model
cfg_path = f'./logs/sync_models/{exp_name}/cfg-{exp_name}.yaml'
ckpt_path = f'./logs/sync_models/{exp_name}/{exp_name}.pt'

# if the model does not exist try to download it from the server
check_if_file_exists_else_download(cfg_path)
check_if_file_exists_else_download(ckpt_path)

# load config
cfg = OmegaConf.load(cfg_path)

# patch config
cfg = patch_config(cfg)

# build the model from the patched config (the first returned value is not used here)
_, model = get_model(cfg, device)
# NOTE(review): `torch.load` unpickles the file, so load checkpoints only from a trusted source.
# The output of this cell shows a warning emitted for this call; presumably it is the
# `weights_only` FutureWarning of recent PyTorch versions -- TODO confirm and consider
# passing `weights_only=True` if the checkpoint permits it.
# The checkpoint tensors are mapped to the CPU before being loaded into the model.
ckpt = torch.load(ckpt_path, map_location=torch.device('cpu'))
model.load_state_dict(ckpt['model'])
# switch the model to evaluation mode
model.eval()
print('Model loaded.')

7.17kB [00:00, 3.75MB/s]                   
1.13GB [00:51, 22.2MB/s]                            
  ckpt = torch.load(ckpt_path, map_location=torch.device('cpu'))


Model loaded.


In [5]:
# list of items to process. Mind the order: (video_path, offset_sec, v_start_i_sec)
# list of items to process. Mind the order: (video_path, offset_sec, v_start_i_sec)
#   video_path: path to the video file
#   offset_sec: shift (in seconds) applied to the start of the audio track; printed as the ground truth
#   v_start_i_sec: start (in seconds) of the clip that is cropped out of the video
to_process = [
    ('./data/vggsound/h264_video_25fps_256side_16000hz_aac/3qesirWAGt4_20000_30000.mp4', 1.6, 0.0),
    ('./data/vggsound/h264_video_25fps_256side_16000hz_aac/ZYc410CE4Rg_0_10000.mp4', -2.0, 4.0),
]

In [6]:
# making the offset class grid similar to the one used in transforms.
# The grid depends only on the config, so it is built (and its bounds are found) once for all videos.
max_off_sec = cfg.data.max_off_sec
num_cls = cfg.model.params.transformer.params.off_head_cfg.params.out_features
grid = make_class_grid(-max_off_sec, max_off_sec, num_cls)
grid_min, grid_max = min(grid), max(grid)

for vid_path, offset_sec, v_start_i_sec in to_process:
    # (optional) checking if the provided video has the correct frame rates
    print(f'Using video: {vid_path}')
    v, _, info = torchvision.io.read_video(vid_path, pts_unit='sec')
    _, H, W, _ = v.shape
    if info['video_fps'] != vfps or info['audio_fps'] != afps or min(H, W) != in_size:
        print(f'Reencoding. vfps: {info["video_fps"]} -> {vfps};', end=' ')
        print(f'afps: {info["audio_fps"]} -> {afps};', end=' ')
        print(f'{(H, W)} -> min(H, W)={in_size}')
        # from here on, `vid_path` points to the re-encoded copy
        vid_path = reencode_video(vid_path, vfps, afps, in_size)
    else:
        print(
            f'No need to reencode: vfps: {info["video_fps"]}; afps: {info["audio_fps"]}; min(H, W)={in_size}')

    # load visual and audio streams
    # rgb: (Tv, 3, H, W) in [0, 255] (presumably uint8 -- TODO confirm), audio: (Ta,) in [-1, 1]
    rgb, audio, meta = get_video_and_audio(vid_path, get_meta=True)

    # making an item (dict) to apply transformations
    # NOTE: here is how it works:
    # For instance, if the model is trained on 5sec clips, the provided video is 9sec, and `v_start_i_sec=1.3`
    # the transform will crop out a 5sec-clip from 1.3 to 6.3 seconds and shift the start of the audio
    # track by `offset_sec` seconds. It means that if `offset_sec` > 0, the audio will
    # start by `offset_sec` earlier than the rgb track.
    # It is a good idea to use something in [-`max_off_sec`, `max_off_sec`] (-2, +2) seconds (see `grid`)
    item = dict(
        video=rgb, audio=audio, meta=meta, path=vid_path, split='test',
        targets={'v_start_i_sec': v_start_i_sec, 'offset_sec': offset_sec, },
    )

    # warn if the requested offset is not covered by the class grid
    if not (grid_min <= offset_sec <= grid_max):
        print(f'WARNING: offset_sec={offset_sec} is outside the trained grid: {grid}')

    # applying the test-time transform
    item = get_transforms(cfg, ['test'])['test'](item)

    # prepare inputs for inference
    batch = torch.utils.data.default_collate([item])
    aud, vid, targets = prepare_inputs(batch, device)

    # TODO:
    # sanity check: we will take the input to the `model` and reconstruct a video from it.
    # Use this check to make sure the input makes sense (audio should be ok but shifted as you specified)
    # reconstruct_video_from_input(aud, vid, batch['meta'], vid_path, v_start_i_sec, offset_sec,
    #                              vfps, afps)

    # forward pass (no gradients are needed for inference)
    with torch.no_grad():
        with torch.autocast('cuda', enabled=cfg.training.use_half_precision):
            _, logits = model(vid, aud)

    # simply prints the results of the prediction
    decode_single_video_prediction(logits, grid, item)
    print()

Using video: ./data/vggsound/h264_video_25fps_256side_16000hz_aac/3qesirWAGt4_20000_30000.mp4
No need to reencode: vfps: 25.0; afps: 16000; min(H, W)=256
Ground Truth offset (sec): 1.60 (18)
Prediction Results:
p=0.9482 (12.1250), "1.60" (18)
p=0.0307 (8.6953), "1.80" (19)
p=0.0158 (8.0312), "1.40" (17)
p=0.0034 (6.5039), "-0.40" (8)
p=0.0007 (4.9258), "2.00" (20)

Using video: ./data/vggsound/h264_video_25fps_256side_16000hz_aac/ZYc410CE4Rg_0_10000.mp4
No need to reencode: vfps: 25.0; afps: 16000; min(H, W)=256
Ground Truth offset (sec): -2.00 (0)
Prediction Results:
p=0.7129 (11.8047), "-2.00" (0)
p=0.1361 (10.1484), "-1.80" (1)
p=0.1210 (10.0312), "-1.60" (2)
p=0.0217 (8.3125), "-1.40" (3)
p=0.0056 (6.9531), "-1.20" (4)



In [7]:
# !pip freeze

absl-py==1.4.0
accelerate==1.2.1
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.19
albumentations==1.4.20
ale-py==0.10.1
altair==5.5.0
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.6.0
arviz==0.20.0
astropy==6.1.7
astropy-iers-data==0.2025.1.27.0.32.44
astunparse==1.6.3
atpublic==4.1.0
attrs==25.1.0
audioread==3.0.1
autograd==1.7.0
av==10.0.0
babel==2.16.0
backcall==0.2.0
beautifulsoup4==4.12.3
bigframes==1.34.0
bigquery-magics==0.5.0
bleach==6.2.0
blinker==1.9.0
blis==0.7.11
blosc2==3.0.0
bokeh==3.6.2
Bottleneck==1.4.2
bqplot==0.12.44
branca==0.8.1
CacheControl==0.14.2
cachetools==5.5.1
catalogue==2.0.10
certifi==2024.12.14
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
chex==0.1.88
clarabel==0.9.0
click==8.1.8
cloudpathlib==0.20.0
cloudpickle==3.1.1
cmake==3.31.4
cmdstanpy==1.2.5
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contou