In [2]:
import librosa
import sys
import numpy as np
import time
from third.smart_turn.inference import predict_endpoint

file_path = './audio_samples/output_full.wav'

# Sample rate the audio is converted to before it is handed to predict_endpoint.
TARGET_SAMPLE_RATE = 16000
# Number of leading samples kept for the prediction: one second at TARGET_SAMPLE_RATE.
MAX_SAMPLES = TARGET_SAMPLE_RATE

# sr=None asks librosa for the file's native sample rate; mono=True requests a single channel.
audio, sr = librosa.load(file_path, sr=None, mono=True)
if audio.size == 0:
    # Fail with a clear message instead of an opaque reduction error on the peak check below.
    raise ValueError(f"No audio samples found in {file_path}")
print(f"Loaded audio with sample rate: {sr} Hz, duration: {len(audio) / sr:.2f} seconds")
if sr != TARGET_SAMPLE_RATE:
    print(f"Resampling from {sr}Hz to {TARGET_SAMPLE_RATE}Hz")
    # `sr` deliberately keeps the file's original rate; only `audio` is resampled.
    audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)

if audio.dtype != np.float32:
    audio = audio.astype(np.float32)

# Scale down only when the signal leaves [-1, 1]; the peak is computed once and reused.
peak = np.max(np.abs(audio))
if peak > 1.0:
    audio = audio / peak

# NOTE(review): this keeps only the FIRST second of the clip. If predict_endpoint is
# meant to judge the end of an utterance, the trailing samples may be the ones
# that matter -- confirm this truncation is intentional.
audio = audio[:MAX_SAMPLES]

print("Running endpoint prediction...")
# perf_counter is monotonic and high-resolution, which suits timing one short call;
# time.time() can jump when the system clock is adjusted.
st = time.perf_counter()
result = predict_endpoint(audio)
print(f"{(time.perf_counter() - st)*1000:.1f} ms")

print("\nResults:")
print(f"Prediction: {'Complete' if result['prediction'] == 1 else 'Incomplete'}")
print(f"Probability of complete: {result['probability']:.4f}")

Loaded audio with sample rate: 24000 Hz, duration: 14.34 seconds
Resampling from 24000Hz to 16000Hz
Running endpoint prediction...
31.6 ms

Results:
Prediction: Incomplete
Probability of complete: 0.3235


In [None]:
from audiotools import AudioSignal

# Load the saved test clip and show its audiotools widget as the cell result.
saved_clip = AudioSignal('./test_audio_save.wav')
saved_clip.widget()

In [None]:
from audiotools import AudioSignal

# Show the audiotools widget for each numbered clip, test_1.wav through test_6.wav.
# (IPython.display.Audio is a plainer alternative for auditioning the same files.)
clip_paths = [f'./test_{clip_no}.wav' for clip_no in range(1, 7)]
for clip_path in clip_paths:
    AudioSignal(clip_path).widget()

In [None]:
from audiotools import AudioSignal
from IPython.display import Audio
import torch

# Number of trailing samples dropped from every clip before joining.
TAIL_TRIM_SAMPLES = 4800

clips = [AudioSignal(f'./test_{i}.wav') for i in range(1, 5)]

# Take the playback rate from the clips themselves. Relying on a global `sr` left
# behind by another cell (which loaded a different file) breaks on a fresh kernel
# and plays at the wrong speed whenever the two rates differ.
sample_rates = {clip.sample_rate for clip in clips}
assert len(sample_rates) == 1, f"clips have mixed sample rates: {sorted(sample_rates)}"
clip_rate = sample_rates.pop()

# squeeze(0) drops the leading dimension, then the tail is trimmed along the last
# axis. Presumably audio_data is (batch, channels, samples) -- confirm against audiotools.
signals = [clip.audio_data.squeeze(0)[:, :-TAIL_TRIM_SAMPLES] for clip in clips]
print(signals[0].shape)
combined = torch.cat(signals, dim=-1)

Audio(combined.cpu().numpy(), rate=clip_rate)

In [None]:
import numpy as np
from audiotools import AudioSignal
from IPython.display import Audio, display

# Number of samples removed from the end per loop step (200 ms if the clip is 24 kHz).
TRIM_STEP_SAMPLES = 4800

# Load the clip once and reuse it for both the reference widget and the experiments.
clip = AudioSignal('./test_3.wav')
clip.widget()

# Use the clip's own sample rate for the fade length and for playback instead of
# assuming 24000 Hz.
clip_rate = clip.sample_rate

fade_out_ms = 80
fade_samples = int(clip_rate * fade_out_ms / 1000)
# Linear ramp from full volume to silence, applied to the tail of each trimmed copy.
fade_curve = np.linspace(1, 0, fade_samples)

trimmed = clip.audio_data.squeeze(0)

for i in range(1, 3):
    # .numpy() on a CPU tensor shares memory with that tensor, so copy before the
    # in-place fade; otherwise the fade would be written back into `trimmed`.
    t = trimmed[:, :-TRIM_STEP_SAMPLES * i].cpu().numpy().copy()
    t[:, -fade_samples:] *= fade_curve
    display(Audio(t, rate=clip_rate))

In [None]:
# Text generation through the high-level transformers `pipeline` helper.
from transformers import pipeline

# A single-turn chat prompt in role/content message form.
messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(task="text-generation", model="openai/gpt-oss-20b")
pipe(messages)