In [1]:
import torch

# Environment sanity check for the GPU-accelerated steps later in the notebook.
print(torch.cuda.is_available())  # Should print True when a GPU is available
if torch.cuda.is_available():
    # These queries raise when CUDA is unavailable, so only run them on a GPU machine.
    print(torch.cuda.current_device())  # ID of the currently selected GPU
    print(torch.cuda.get_device_name(0))  # Name of the first GPU

True
0
NVIDIA GeForce RTX 3060 Laptop GPU


In [2]:
from pydub.utils import which

# Ask pydub's `which` helper for the FFmpeg executable and report the outcome.
ffmpeg_path = which("ffmpeg")
print(
    f"✅ FFmpeg ditemukan di: {ffmpeg_path}"
    if ffmpeg_path
    else "❌ FFmpeg tidak ditemukan, pastikan sudah terinstal dan ada di PATH."
)

✅ FFmpeg ditemukan di: C:\Program Files\ArcGIS\Pro\bin\Python\envs\arcgispro-py3\Library\bin\ffmpeg.exe


In [3]:
# Print the encoder name reported by pydub's get_encoder_name() helper.
from pydub.utils import get_encoder_name
print(get_encoder_name())

ffmpeg


In [4]:
# Smoke test: reaching the print below means the pyannote.audio import succeeded.
from pyannote.audio import Pipeline
print("pyannote.audio installed successfully!")

pyannote.audio installed successfully!


In [5]:
import os
import whisper
import torch
import warnings
import subprocess
from collections import defaultdict
from transformers import pipeline
from docx import Document
from tkinter import Tk, filedialog
from tqdm import tqdm
from pyannote.audio import Pipeline

# Silence only the "FP16 is not supported on CPU" UserWarning (seen when running on CPU).
warnings.filterwarnings("ignore", category=UserWarning, message="FP16 is not supported on CPU")

# Path to the FFmpeg executable. Set the FFMPEG_PATH environment variable to override
# the machine-specific default below without editing the notebook.
FFMPEG_PATH = os.environ.get(
    "FFMPEG_PATH",
    r"C:\Users\zavaa\OneDrive\Desktop\ffmpeg-2025-01-30-git-1911a6ec26-full_build\bin\ffmpeg.exe",
)
# File extensions (lower-case, without the dot) accepted by convert_audio_to_wav.
SUPPORTED_FORMATS = {"mp3", "m4a", "ogg", "flac", "aac", "wav"}

# Hugging Face access token used to download the pyannote pipeline.
# Prefer exporting HF_TOKEN in the environment so the secret never lands in the notebook;
# the placeholder below is only a reminder and is not a valid token.
HF_TOKEN = os.environ.get("HF_TOKEN", "Input HF token disini")

def check_ffmpeg(ffmpeg_path=None):
    """Report whether the FFmpeg executable can be launched.

    Args:
        ffmpeg_path: Executable to probe. When omitted, the module-level
            FFMPEG_PATH is used, which keeps existing ``check_ffmpeg()`` calls working.

    Returns:
        bool: True when ``<executable> -version`` exits with status 0, False when
        the executable cannot be started or exits with a non-zero status.
    """
    executable = FFMPEG_PATH if ffmpeg_path is None else ffmpeg_path
    try:
        subprocess.run(
            [executable, "-version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: missing or non-executable file; SubprocessError: non-zero exit status;
        # ValueError: malformed path. Unrelated programming errors are not swallowed.
        return False
    return True

# Abort early when the configured FFmpeg binary cannot be launched.
# `raise SystemExit` replaces `exit()`: that helper is meant for interactive sessions
# and behaves differently depending on the environment (plain Python vs. IPython).
if not check_ffmpeg():
    print("❌ FFmpeg tidak ditemukan!")
    raise SystemExit(1)

def select_audio_file():
    """Open a native file picker and return the audio file chosen by the user.

    Returns:
        The value returned by ``filedialog.askopenfilename``: the selected path,
        or a falsy value when the dialog is dismissed without a selection.
    """
    root = Tk()
    root.withdraw()  # only the dialog should be visible, not the empty root window
    try:
        return filedialog.askopenfilename(
            title="Pilih file audio",
            # A tuple of patterns is the portable form; a ';'-joined string is only
            # understood by the Windows dialog.
            filetypes=[("Audio Files", ("*.m4a", "*.mp3", "*.wav", "*.flac", "*.ogg", "*.aac"))],
        )
    finally:
        # Destroy the hidden root so repeated calls do not accumulate Tk instances.
        root.destroy()

def convert_audio_to_wav(input_file, output_file="temp_audio.wav"):
    """Convert an audio file to 16 kHz 16-bit PCM WAV using FFmpeg.

    Args:
        input_file: Path of the audio file to convert.
        output_file: Destination path of the WAV file; overwritten if it exists.

    Returns:
        ``input_file`` unchanged when it already has a ``.wav`` extension,
        ``output_file`` after a successful conversion, or None when the input is
        missing, its extension is not in SUPPORTED_FORMATS, or FFmpeg fails.
    """
    if not os.path.exists(input_file):
        print("❌ File tidak ditemukan:", input_file)
        return None

    file_extension = os.path.splitext(input_file)[-1].lower()[1:]
    if file_extension == "wav":
        # WAV input is passed through as-is (no resampling is performed).
        return input_file

    if file_extension not in SUPPORTED_FORMATS:
        print(f"❌ Format {file_extension} tidak didukung!")
        return None

    print(f"🔄 Mengonversi {file_extension} ke WAV dengan FFmpeg...")
    command = [FFMPEG_PATH, "-i", input_file, "-acodec", "pcm_s16le", "-ar", "16000", "-y", output_file]
    try:
        # stderr is captured (not discarded) so the reason for a failure can be reported.
        subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as error:
        print("❌ FFmpeg gagal mengonversi!")
        stderr_lines = (error.stderr or b"").decode("utf-8", errors="replace").strip().splitlines()
        if stderr_lines:
            # The last line of FFmpeg's stderr usually carries the actual error.
            print("   ↳", stderr_lines[-1])
        return None
    except OSError as error:
        # The FFmpeg binary itself could not be started (missing, not executable, ...).
        print("❌ FFmpeg gagal mengonversi!")
        print("   ↳", error)
        return None
    return output_file

def diarize_audio(file_path):
    """Run the pretrained pyannote speaker-diarization pipeline on an audio file.

    Args:
        file_path: Path of the audio file to diarize.

    Returns:
        The object returned by the pipeline call; transcribe_audio consumes it
        through ``itertracks(yield_label=True)``.

    Raises:
        RuntimeError: If the pretrained pipeline could not be loaded, e.g. because
            HF_TOKEN is invalid or the model's user conditions were not accepted.
    """
    print("🔄 Memproses diarization...")
    # Named `diarization_pipeline` so it does not shadow the `pipeline` factory
    # imported from transformers at module level.
    diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HF_TOKEN)
    if diarization_pipeline is None:
        # from_pretrained signals a failed download by returning None instead of raising;
        # without this check the next line fails with "'NoneType' object is not callable".
        raise RuntimeError(
            "❌ Pipeline pyannote gagal dimuat. Periksa HF_TOKEN dan pastikan Anda sudah "
            "menyetujui syarat penggunaan model di Hugging Face."
        )
    return diarization_pipeline(file_path)

def _match_speaker(start_time, end_time, turns):
    """Pick the diarization label for the time span [start_time, end_time].

    The first turn that fully contains the span wins. If no turn contains it, the
    turn with the largest positive overlap is chosen. Returns None when nothing overlaps.
    """
    best_label = None
    best_overlap = 0.0
    for turn_start, turn_end, label in turns:
        if turn_start <= start_time and turn_end >= end_time:
            return label
        overlap = min(turn_end, end_time) - max(turn_start, start_time)
        if overlap > best_overlap:
            best_label, best_overlap = label, overlap
    return best_label


def transcribe_audio(file_path, diarization):
    """Transcribe an audio file with Whisper and group the segments by speaker.

    Args:
        file_path: Path of the audio file; it is converted to WAV first when needed.
        diarization: Diarization result exposing ``itertracks(yield_label=True)``,
            yielding ``(turn, track, label)`` where ``turn`` has ``start``/``end``.

    Returns:
        ``""`` when the audio could not be converted; otherwise a ``defaultdict(list)``
        mapping ``"Speaker <label>"`` (or ``"Unknown"``) to a list of
        ``"[start - end] text"`` strings in transcription order.
    """
    converted_path = convert_audio_to_wav(file_path)
    if not converted_path:
        return ""

    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        model = whisper.load_model("medium").to(device)

        print("🔄 Memulai transkripsi...")
        # Half precision is only requested when running on CUDA.
        result = model.transcribe(converted_path, language=None, fp16=(device == "cuda"))
    finally:
        # Clean up even when loading/transcribing fails, and only ever delete the file
        # produced by the conversion -- never the caller's own input file.
        if converted_path != file_path and os.path.exists(converted_path):
            os.remove(converted_path)

    # Materialise the speaker turns once instead of re-iterating them per segment.
    turns = [(turn.start, turn.end, label) for turn, _, label in diarization.itertracks(yield_label=True)]

    speaker_texts = defaultdict(list)
    for segment in result["segments"]:
        start_time = segment["start"]
        end_time = segment["end"]
        text = segment["text"]

        label = _match_speaker(start_time, end_time, turns)
        speaker = "Unknown" if label is None else f"Speaker {label}"
        speaker_texts[speaker].append(f"[{start_time:.2f} - {end_time:.2f}] {text}")

    return speaker_texts

def save_text_to_file(transcript, output_file):
    """Write a speaker-grouped transcript to a UTF-8 text file.

    Each speaker becomes one block: the speaker name on its own line, followed by
    that speaker's lines and a blank separator line.

    Args:
        transcript: Mapping of speaker name to a list of text lines.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode="w", encoding="utf-8") as handle:
        handle.writelines(
            f"{speaker}\n" + "\n".join(lines) + "\n\n"
            for speaker, lines in transcript.items()
        )

def _chunk_text(text, limit=1024):
    """Split text into pieces of at most `limit` characters, cutting at a space when possible.

    Pieces that contain only whitespace are dropped. Text that fits within `limit`
    is returned as a single, unmodified piece.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + limit, len(text))
        if end < len(text):
            # Search after `start` so a chunk that begins with a space still makes progress.
            cut = text.rfind(" ", start + 1, end + 1)
            if cut > start:
                end = cut
        piece = text[start:end]
        if piece.strip():
            chunks.append(piece)
        start = end
    return chunks


def summarize_text(transcript):
    """Summarize a speaker-grouped transcript with facebook/bart-large-cnn.

    The transcript lines of all speakers are joined into one text, split into chunks
    of at most 1024 characters and summarized chunk by chunk.

    Args:
        transcript: Mapping of speaker name to a list of transcript lines, each
            optionally prefixed with a ``[start - end]`` timestamp.

    Returns:
        str: One ``- ``-prefixed line per summarized chunk, or ``""`` when the
        transcript contains no text (the model is not loaded in that case).
    """
    import re  # local import: only needed for stripping the timestamp prefix

    # Transcript lines look like "[12.34 - 15.60] text"; the timestamps carry no
    # meaning for the summary and would only add noise to the model input.
    timestamp_prefix = re.compile(r"^\[\d+(?:\.\d+)? - \d+(?:\.\d+)?\]\s*")
    full_text = " ".join(
        " ".join(timestamp_prefix.sub("", line) for line in texts)
        for texts in transcript.values()
    )
    chunks = _chunk_text(full_text)
    if not chunks:
        return ""

    print("🔄 Memuat model BART untuk rangkuman...")
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

    print("🔄 Merangkum teks...")
    summary_chunks = [
        summarizer(chunk, max_length=250, min_length=100, do_sample=False, truncation=True)[0]["summary_text"]
        for chunk in chunks
    ]

    return "\n".join(f"- {piece}" for piece in summary_chunks).strip()

def save_summary_to_docx(summary, output_file):
    """Save the summary as a Word document.

    The document consists of a level-1 heading "Summary" followed by a single
    paragraph holding the summary text.

    Args:
        summary: Summary text to store.
        output_file: Destination path of the .docx file.
    """
    report = Document()
    report.add_heading("Summary", level=1)
    report.add_paragraph(summary)
    report.save(output_file)

def main():
    """Run the full workflow: pick a file, diarize, transcribe, summarize and save.

    Outputs are written next to the selected audio file as
    ``<name>_transcript.txt`` and ``<name>_summary.docx``. The function returns
    early (after printing a message) when no file is selected or when the audio
    cannot be converted or transcribed.
    """
    audio_file = select_audio_file()
    if not audio_file:
        print("❌ Tidak ada file yang dipilih.")
        return

    print(f"✅ File dipilih: {audio_file}")

    # Convert once up front so diarization and transcription both read the same WAV.
    # Previously only transcription used the converted audio, while diarization was
    # handed the original file (which may be a compressed format).
    wav_file = convert_audio_to_wav(audio_file)
    if not wav_file:
        print("❌ Transkripsi gagal.")
        return

    try:
        diarization = diarize_audio(wav_file)
        transcript = transcribe_audio(wav_file, diarization)
    finally:
        # Remove the temporary WAV (never the user's own file), even on failure.
        if wav_file != audio_file and os.path.exists(wav_file):
            os.remove(wav_file)

    if not transcript:
        print("❌ Transkripsi gagal.")
        return

    output_base = os.path.splitext(audio_file)[0]

    transcript_file = output_base + "_transcript.txt"
    save_text_to_file(transcript, transcript_file)
    print(f"✅ Transkripsi selesai, disimpan di {transcript_file}")

    summary = summarize_text(transcript)
    summary_file = output_base + "_summary.docx"
    save_summary_to_docx(summary, summary_file)
    print(f"✅ Rangkuman disimpan di {summary_file}")

# Run the workflow only when this code is executed as the main module.
if __name__ == "__main__":
    main()


✅ File dipilih: D:/2024/MODEL AI/NOTULEN AI/TEST HASIL RAPAT/AUDIO_TEST_CLIP.WAV
🔄 Memproses diarization...


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\zavaa\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b\pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu118. Bad things might happen unless you revert torch to 1.x.


INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder


🔄 Memulai transkripsi...
✅ Transkripsi selesai, disimpan di D:/2024/MODEL AI/NOTULEN AI/TEST HASIL RAPAT/AUDIO_TEST_CLIP_transcript.txt
🔄 Memuat model BART untuk rangkuman...


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔄 Merangkum teks...


Your max_length is set to 250, but your input_length is only 205. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)


✅ Rangkuman disimpan di D:/2024/MODEL AI/NOTULEN AI/TEST HASIL RAPAT/AUDIO_TEST_CLIP_summary.docx
