In [None]:
# INSTALL DEPENDENCIES
# First line: audio / speech-recognition stack (Whisper + PyTorch).
# Second line: Hugging Face transformers + sentencepiece, used for the MarianMT translation model.
# NOTE(review): ffmpeg-python is installed but never imported in the visible cells;
# the ffmpeg executable is invoked through subprocess instead.
!pip install ffmpeg-python openai-whisper torch torchvision torchaudio
!pip install transformers sentencepiece

In [None]:
# IMPORT THƯ VIỆN
import torch
import os
import whisper
import subprocess
import re
# import nltk
# from nltk.tokenize import sent_tokenize
from transformers import MarianMTModel, MarianTokenizer
from datetime import timedelta

In [None]:
# LOAD THE WHISPER MODEL
# `model` is a module-level global: transcribe_segments and the direct
# transcription cells call model.transcribe(...) on it.
model = whisper.load_model("large")

In [None]:
# # DOWNLOAD TOKENIZER DATA
# nltk.download('punkt_tab')

# LOAD THE VAD MODEL
# torch.hub.load yields the model and a 5-element tuple of helpers; only the
# first three (get_speech_timestamps, save_audio, read_audio) are kept.
vad_model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', trust_repo=True)
get_speech_timestamps, save_audio, read_audio, _, _ = utils

In [None]:
# PREPROCESS AUDIO WITH FFMPEG
def preprocess_audio(input_path, output_path="processed.wav"):
    """Convert an audio file to mono, 16 kHz audio by invoking ffmpeg.

    Args:
        input_path: Source audio path, passed to ``ffmpeg -i``.
        output_path: Destination path; an existing file is overwritten (``-y``).

    Returns:
        ``output_path``, unchanged, once ffmpeg has finished successfully.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status.
        FileNotFoundError: the ``ffmpeg`` executable could not be launched.
    """
    command = ["ffmpeg", "-y", "-i", input_path, "-ac", "1", "-ar", "16000", output_path]
    # Keep stderr so a failure can be reported; stdout is still discarded.
    completed = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # Without this check a failed conversion would hand the caller a path
        # that is missing or, worse, a stale file left by an earlier run.
        detail = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed with exit code {completed.returncode} for {input_path!r}: {detail[-500:]}"
        )
    return output_path

# SPLIT INTO SPEECH SEGMENTS WITH VAD
def split_audio_by_vad(input_wav, model, output_dir="segments"):
    """Cut a WAV file into per-utterance files using voice-activity detection.

    The audio is read at 16 kHz, speech spans are detected with ``model``, and
    every span longer than 1000 samples is written to
    ``<output_dir>/segment_<n>.wav`` (``n`` counts kept spans from 0).

    Args:
        input_wav: Path of the WAV file to split.
        model: VAD model handed to ``get_speech_timestamps``.
        output_dir: Directory for the segment files; created if missing.

    Returns:
        ``(segment_paths, clean_segments)``: the written file paths and the
        matching span dicts (``'start'`` / ``'end'`` sample indices).
    """
    os.makedirs(output_dir, exist_ok=True)
    waveform = read_audio(input_wav, sampling_rate=16000)
    detected_spans = get_speech_timestamps(waveform, model, sampling_rate=16000)

    # Drop very short detections (1000 samples or fewer).
    kept_spans = []
    for span in detected_spans:
        if span['end'] - span['start'] > 1000:
            kept_spans.append(span)

    written_paths = [
        os.path.join(output_dir, f"segment_{position}.wav")
        for position in range(len(kept_spans))
    ]
    for target, span in zip(written_paths, kept_spans):
        save_audio(target, waveform[span['start']:span['end']], sampling_rate=16000)

    return written_paths, kept_spans

# SPEECH RECOGNITION (WHISPER)
def transcribe_segments(segment_paths, speech_timestamps):
    """Transcribe each segment file and shift its timings back onto the full recording.

    Args:
        segment_paths: Audio file paths, one per detected speech span.
        speech_timestamps: Span dicts aligned with ``segment_paths``; each
            ``'start'`` is a sample index and is divided by 16000 to get seconds.

    Returns:
        ``(text, data)`` where ``text`` is the per-file transcripts joined with
        newlines and ``data`` is ``{"segments": [...]}`` holding every segment
        dict returned by the global ``model``, with ``start`` / ``end`` offset
        by the span's start time (the dicts are updated in place).
    """
    transcripts = []
    shifted_segments = []

    for index, wav_path in enumerate(segment_paths):
        # Start of this span within the original recording, in seconds.
        shift = speech_timestamps[index]['start'] / 16000
        outcome = model.transcribe(wav_path, language="vi", word_timestamps=False)
        transcripts.append(outcome["text"])

        for piece in outcome["segments"]:
            piece["start"] = piece["start"] + shift
            piece["end"] = piece["end"] + shift
            shifted_segments.append(piece)

    full_text = "\n".join(transcripts)
    return full_text, {"segments": shifted_segments}

# TRANSLATE VI -> EN WITH MarianMT
# Module-level tokenizer/model pair loaded from the same checkpoint name;
# translate_sentences reads both as globals.
mt_model_name = "Helsinki-NLP/opus-mt-vi-en"
tokenizer = MarianTokenizer.from_pretrained(mt_model_name)
translator = MarianMTModel.from_pretrained(mt_model_name)

# SPLIT VIETNAMESE TEXT INTO SENTENCES
def split_sentences(text):
    """Split text into sentences at ``.``, ``!`` or ``?`` followed by whitespace and a capital letter.

    A boundary is only accepted when the next non-space character is an
    uppercase letter, so abbreviations followed by a lowercase word
    (for example ``"tr. đồng"``) stay in one sentence.

    Args:
        text: The text to split; may contain newlines.

    Returns:
        A list of stripped, non-empty sentences in their original order.
    """
    sentences = []
    cursor = 0
    for boundary in re.finditer(r'(?<=[.!?])\s+', text):
        # str.isupper() is used instead of a character range: a range such as
        # [À-Ỵ] also covers lowercase accented letters (à, đ, ...) and misses
        # uppercase Ỷ / Ỹ. An empty slice at end of text is not upper either.
        following = text[boundary.end():boundary.end() + 1]
        if following.isupper():
            sentences.append(text[cursor:boundary.start()])
            cursor = boundary.end()
    sentences.append(text[cursor:])
    return [s.strip() for s in sentences if s.strip()]

def translate_sentences(sentences):
    """Translate sentences one at a time with the global MarianMT tokenizer/model.

    Args:
        sentences: Iterable of source-language sentence strings.

    Returns:
        A list with one decoded translation per input sentence, in order.
    """
    def _translate_one(sentence):
        # Encode -> generate -> decode the first (only) generated sequence.
        encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        generated = translator.generate(**encoded)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_translate_one(sentence) for sentence in sentences]

# WRITE THE BILINGUAL FILE
def export_bilingual(sentences_vi, sentences_en, txt_path="bilingual_transcript.txt"):
    """Write paired Vietnamese / English sentences to a UTF-8 text file.

    Each pair is written as the Vietnamese line, the English line and a blank
    line. Pairing uses ``zip``, so surplus items in the longer list are ignored.

    Args:
        sentences_vi: Vietnamese sentences.
        sentences_en: English translations, aligned by position.
        txt_path: Output file path (overwritten).
    """
    paired_blocks = (
        f"{source}\n{target}\n\n"
        for source, target in zip(sentences_vi, sentences_en)
    )
    with open(txt_path, "w", encoding="utf-8") as handle:
        handle.writelines(paired_blocks)
    print(f"Đã lưu tại {txt_path}")

In [None]:
# SUBTITLE EXPORT
def format_timestamp(seconds: float):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp string, with zero-padded hours/minutes/seconds and
        milliseconds rounded to the nearest integer.
    """
    # Built from integer milliseconds rather than str(timedelta): that string
    # drops the fractional part for whole seconds ("0:00:02") and does not
    # zero-pad hours, so slicing it yields malformed timestamps.
    total_ms = int(round(max(0.0, float(seconds)) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def export_srt(transcription_results, output_path="output_subtitles.srt"):
    """Write transcription segments to a subtitle file in SRT layout.

    Args:
        transcription_results: Mapping with a ``"segments"`` list; each segment
            provides ``"start"``, ``"end"`` (passed to ``format_timestamp``)
            and ``"text"``.
        output_path: Output file path (overwritten, UTF-8).
    """
    with open(output_path, "w", encoding="utf-8") as handle:
        # Cue numbers are 1-based.
        for cue_number, cue in enumerate(transcription_results["segments"], start=1):
            begins = format_timestamp(cue["start"])
            ends = format_timestamp(cue["end"])
            caption = cue["text"].strip()
            handle.write(f"{cue_number}\n{begins} --> {ends}\n{caption}\n\n")
    print(f"Đã xuất phụ đề tại: {output_path}")

In [None]:
# WRITE PLAIN TEXT
def export_text_output(text, txt_path):
    """Save ``text`` to ``txt_path`` as UTF-8, replacing any existing file.

    Args:
        text: The string to write.
        txt_path: Destination file path.
    """
    destination = open(txt_path, "w", encoding="utf-8")
    try:
        destination.write(text)
    finally:
        destination.close()
    print(f"Đã lưu văn bản tại: {txt_path}")

In [None]:
# RUN THE PIPELINE
# ffmpeg preprocessing -> VAD split -> Whisper transcription.
input_file = "/content/au1.m4a"
processed = preprocess_audio(input_file)
segments, timestamps = split_audio_by_vad(processed, vad_model)
text_output, segment_data = transcribe_segments(segments, timestamps)

# WRITE THE ORIGINAL (VIETNAMESE) TRANSCRIPT
export_text_output(text_output, "transcript_vi_full.txt")

# EXPORT SRT
export_srt(segment_data, "transcript_vi.srt")

# SPLIT INTO SENTENCES, TRANSLATE, AND WRITE THE BILINGUAL FILE
sentences_vi = split_sentences(text_output)
sentences_en = translate_sentences(sentences_vi)
export_bilingual(sentences_vi, sentences_en)

# PRINT THE FIRST 5 SENTENCE PAIRS AS A SAMPLE
for vi, en in zip(sentences_vi[:5], sentences_en[:5]):
    print("- VI:", vi)
    print("- EN:", en)
    print()

In [None]:
# TRANSCRIBE WITH WHISPER DIRECTLY: NO VAD, NO AUDIO NORMALIZATION
# Relies on `input_file` set in the pipeline cell and overwrites `text_output`.

result = model.transcribe(input_file, language="vi")
text_output = result["text"]

print("Văn bản nhận diện:")
print(text_output)

export_text_output(text_output, "raw_transcript.txt")

In [None]:
# TRANSCRIBE WITH WHISPER: WITH VAD, WITH AUDIO NORMALIZATION
# Same three steps as the pipeline cell; rebinds `processed`, `segments`,
# `timestamps`, `text_output` and `segment_data`.

processed = preprocess_audio(input_file)
segments, timestamps = split_audio_by_vad(processed, vad_model)
text_output, segment_data = transcribe_segments(segments, timestamps)

print("Văn bản nhận diện:")
print(text_output)

export_text_output(text_output, "whisper_vad.txt")

In [None]:
# TRANSCRIBE WITH WHISPER: NO VAD, WITH AUDIO NORMALIZATION
# The whole preprocessed file is passed to Whisper in one call; `segment_data`
# is not refreshed here and keeps whatever an earlier cell left in it.

processed = preprocess_audio(input_file)
result = model.transcribe(processed, language="vi")
text_output = result["text"]

print("Văn bản nhận diện:")
print(text_output)

export_text_output(text_output, "whisper_no_vad.txt")

In [None]:
# Install the packages used by the API server cell below
# (FastAPI app, uvicorn server, nest_asyncio, pyngrok tunnel, multipart uploads).
!pip install fastapi uvicorn nest_asyncio pyngrok python-multipart

In [None]:
import shutil
import os
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import FileResponse
import nest_asyncio
from pyngrok import ngrok, conf
import uvicorn
import threading

# Disconnect every tunnel that is currently open
for tunnel in ngrok.get_tunnels():
    ngrok.disconnect(tunnel.public_url)

# Apply nest_asyncio so uvicorn can be run
nest_asyncio.apply()

# Create the FastAPI application
app = FastAPI()

# Directories for uploaded files and for generated results
UPLOAD_DIR = "/content/uploads"
RESULT_DIR = "/content/results"
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)

# Global holding the public ngrok URL; assigned once the tunnel is opened
# and read by the /transcribe/ handler when it builds download links.
public_url = None

@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and produce transcript, SRT and bilingual outputs.

    The upload is stored in ``UPLOAD_DIR``, converted with ``preprocess_audio``,
    split with ``split_audio_by_vad`` and transcribed with
    ``transcribe_segments``. Three result files are written to ``RESULT_DIR``.

    Args:
        file: The uploaded audio file.

    Returns:
        A dict with a status ``message``, ``preview_text`` (first 300
        characters of the transcript) and ``download_links`` built from the
        module-level ``public_url``.
    """
    # The client controls the filename: keep only its last path component so
    # it cannot point outside UPLOAD_DIR / RESULT_DIR.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "upload"

    # Store the upload
    upload_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(upload_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Reuse the notebook's processing functions
    processed_path = preprocess_audio(upload_path, output_path=os.path.join(UPLOAD_DIR, "processed.wav"))
    # split_audio_by_vad returns (paths, timestamps) and transcribe_segments
    # requires both, so the pair must be unpacked before the call.
    segment_paths, speech_timestamps = split_audio_by_vad(
        processed_path, vad_model, output_dir=os.path.join(UPLOAD_DIR, "segments")
    )
    text_output, segment_data = transcribe_segments(segment_paths, speech_timestamps)

    base_name = safe_name.rsplit('.', 1)[0]
    vi_txt = os.path.join(RESULT_DIR, f"transcript_vi_{base_name}.txt")
    srt_file = os.path.join(RESULT_DIR, f"transcript_vi_{base_name}.srt")
    bilingual_txt = os.path.join(RESULT_DIR, f"bilingual_transcript_{base_name}.txt")

    export_text_output(text_output, vi_txt)
    export_srt(segment_data, srt_file)
    sentences_vi = split_sentences(text_output)
    sentences_en = translate_sentences(sentences_vi)
    export_bilingual(sentences_vi, sentences_en, bilingual_txt)

    return {
        "message": "Xử lý thành công!",
        "preview_text": text_output[:300],
        "download_links": {
            "transcript_vi": f"{public_url}/download/{os.path.basename(vi_txt)}",
            "subtitles_srt": f"{public_url}/download/{os.path.basename(srt_file)}",
            "bilingual_txt": f"{public_url}/download/{os.path.basename(bilingual_txt)}"
        }
    }

@app.get("/download/{filename}")
async def download_file(filename: str):
    """Serve a file by name from ``RESULT_DIR`` (checked first) or ``UPLOAD_DIR``.

    Args:
        filename: Bare file name to download; values containing a directory
            component, or equal to ``.`` / ``..``, are treated as not found.

    Returns:
        A ``FileResponse`` for the first matching regular file, otherwise a
        404 JSON response with an ``"error"`` message.
    """
    # Local import: only FileResponse is imported at file level.
    from fastapi.responses import JSONResponse

    # Accept plain file names only, so the lookup cannot leave the two directories.
    if filename == os.path.basename(filename) and filename not in ("", ".", ".."):
        for dir_ in [RESULT_DIR, UPLOAD_DIR]:
            file_path = os.path.join(dir_, filename)
            # isfile rather than exists: exists() is also true for directories,
            # which cannot be sent as a file response.
            if os.path.isfile(file_path):
                return FileResponse(file_path, filename=filename)
    # Same JSON body as before, but with a 404 status instead of 200.
    return JSONResponse(status_code=404, content={"error": "File không tồn tại"})

# Start the server (used as the target of a background thread)
def run():
    """Serve the module-level FastAPI ``app`` with uvicorn on 0.0.0.0:8000."""
    listen_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **listen_options)

# Configure the ngrok auth token
# SECURITY: never store the token in the notebook. It is read from the
# NGROK_AUTHTOKEN environment variable, falling back to an interactive prompt.
# Any token that was ever committed to a notebook should be revoked in the
# ngrok dashboard and replaced.
import getpass

NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_TOKEN)
# Open a tunnel to the port that run() makes uvicorn listen on.
public_url = ngrok.connect(8000).public_url

print(f"Server đang chạy tại: {public_url}")
print(f"API docs: {public_url}/docs")

# Daemon thread so the server does not keep the process alive on its own.
threading.Thread(target=run, daemon=True).start()