In [None]:
from vllm import LLM, SamplingParams

# Load the Tri-1.8B translation model into an in-process vLLM engine.
llm = LLM(
    model="trillionlabs/Tri-1.8B-Translation",
    dtype="float16",               # recommended when running on a GPU
    tensor_parallel_size=1,        # pin to a single GPU
    enforce_eager=True,            # avoid CUDA graph capture issues
    gpu_memory_utilization=0.5,   # leave memory headroom
    disable_log_stats=True
)

In [None]:
# Near-greedy decoding, capped at 512 generated tokens.
sp = SamplingParams(temperature=0.1, max_tokens=512)

# NOTE(review): `target` is never interpolated below; both prompts hard-code "ko".
target = "ko"
text = """There's so much to do on a day like this, lots of things to do, but maybe later when you're having dinner."""

# Earlier prompt variant that embeds `text` plus a Korean continuation.
# It is not passed to the model anywhere in this cell.
prompt_old = f"""
Translate into ko\n
{text}<ko>
이런 날에는 할 일이 너무 많고 할 일도 많지만, 나중에 저녁을 먹을 때쯤이면.
"""

# Prompt actually sent. The string is not raw, so the literal `\n` is an escape
# and adds one more newline after the instruction line.
prompt = f"""
Translate into ko\n
There will be a chance to reflect.<ko>
"""
# Send the prompt as a single user turn and show the stripped text of the first output.
out = llm.chat([{"role": "user", "content": prompt}], sampling_params=sp)
out[0].outputs[0].text.strip()

In [None]:
# Show the raw `outputs` list of the first result from the previous cell.
out[0].outputs

In [None]:
from vllm import LLM, SamplingParams

# Second translation engine: GPTQ Int8 build of Seed-X-PPO-7B.
model_path = "ByteDance-Seed/Seed-X-PPO-7B-GPTQ-Int8"

# NOTE(review): the earlier `llm` engine asks for gpu_memory_utilization=0.5; if both
# engines are meant to live on the same GPU, 0.5 + 0.9 exceeds it — confirm the first
# engine is released (or the kernel restarted) before this cell runs.
model = LLM(
    model=model_path,
    max_num_seqs=512,
    tensor_parallel_size=1,
    enable_prefix_caching=True, 
    gpu_memory_utilization=0.9
)

In [None]:
# One raw prompt: the English source (two sentences), the <ko> tag, and a Korean
# translation of the first sentence already filled in, so the model only has to
# continue from there.
messages = [
    """
Translate the following English sentence into Korean:
So I think Yeah, that stays in your memory you keep remembering how big Manchester United is and so when I heard they were interested it was like Yeah, childhood dream.
In my team in Antwerp there were some players who were also in England and when I talked to them about Manchester United you could directly see their face change.
<ko>
그래서 저는 그래요, 그건 당신의 기억에 남아있어요. 당신은 맨체스터 유나이티드가 얼마나 큰지 계속 기억하고 있고, 그래서 저가 그들이 관심이 있다는 것을 들었을 때는 마치 어린 시절의 꿈 같았어요.""",
]

# Sampling: near-greedy, at most 512 new tokens, special tokens stripped from the text.
decoding_params = SamplingParams(temperature=0.1,
                                 max_tokens=512,
                                 skip_special_tokens=True)

# `generate` takes the prompts as plain strings (no chat template is applied here).
results = model.generate(messages, decoding_params)
responses = [res.outputs[0].text.strip() for res in results]

print(responses)

In [None]:
# Same experiment without any pre-filled Korean: a single sentence followed by <ko>.
messages = [
    """
Translate the following English sentence into Korean:
In my team in Antwerp there were some players who were also in England and when I talked to them about Manchester United you could directly see their face change.
<ko>
""",
]

# Sampling: near-greedy, at most 512 new tokens, special tokens stripped from the text.
decoding_params = SamplingParams(temperature=0.1,
                                 max_tokens=512,
                                 skip_special_tokens=True)

import time
# Wall-clock time of one full `generate` call, printed in seconds.
st = time.time()
results = model.generate(messages, decoding_params)
print(time.time() - st)
responses = [res.outputs[0].text.strip() for res in results]

print(responses)

In [None]:
import os
from typing import Callable
from openai import OpenAI
import time

# The API key comes from the environment so it never gets written into the notebook.
OPENAI_KEY = os.environ.get("OPENAI_KEY")

client = OpenAI(api_key=OPENAI_KEY)

# Latency benchmark: send the same translation request 12 times with a fixed
# `prompt_cache_key`, print the wall-clock time of every request, and print the sum.
# NOTE(review): the system prompt says "[English] → [Korean]" but then talks about
# translating "Korean speech" — confirm which direction is intended.
totals = 0
for _ in range(12):
    st = time.time()
    response = client.chat.completions.create(
        model='gpt-4.1-mini',  # latest lightweight model
        messages=[
            {"role": "system", "content": "You are a professional translator specializing in [English] → [Korean] translation. Your job is to incrementally translate Korean speech as it comes in."},
            {"role": "user", "content": f"""
Translate the following English sentence into Korean:
So I think Yeah, that stays in your memory you keep remembering how big Manchester United is and so when I heard they were interested it was like Yeah, childhood dream.<ko>
그래서 저는 그래요, 그건 당신의 기억에 남아있어요. 당신은 맨체스터 유나이티드가 얼마나 큰지 계속 기억하고 있고, 그래서 저가 그들이 관심이 있다는 것을 들었을 때는 마치...
"""}
        ],
        temperature=0.4,
        user="k2e-translator-v1-hojinkhj6051230808",
        prompt_cache_key="k2e-translator-v1-hojinkhj6051230808",
        # stream=True,
        # stream_options={"include_usage": True},
    )

    # Read the clock exactly once, right after the request returns, so the printed
    # value and the accumulated total agree and neither includes the time spent
    # printing the response.
    elapsed = time.time() - st
    print(elapsed, "\n", response)
    totals += elapsed
print(totals)

In [None]:
# Quick check: character count of a sample sentence.
len("And there's a dimension of human intelligence.")

In [None]:
import re

def text_pr(old, new):
    """Merge a revised transcript `new` onto the already-shown transcript `old`.

    The two strings are compared character by character, ignoring case and
    skipping spaces and commas. The result keeps `old` verbatim up to the end
    of the shared prefix and appends whatever follows that prefix in `new`.

    Args:
        old: Text that has already been emitted (may be empty).
        new: Latest full hypothesis for the same utterance.

    Returns:
        `old` up to the last character that matches `new`, followed by the
        rest of `new`. With an empty `old` this is simply `new`.
    """
    ignored = " ,"

    # Scan cursors into the raw strings, plus the raw offsets just past the last
    # matched character. Tracking raw offsets (rather than offsets into a cleaned
    # copy) is what keeps the final slices aligned with the actual text.
    old_pos = new_pos = 0
    old_cut = new_cut = 0

    while True:
        # Skip characters that do not take part in the comparison.
        while old_pos < len(old) and old[old_pos] in ignored:
            old_pos += 1
        while new_pos < len(new) and new[new_pos] in ignored:
            new_pos += 1

        if old_pos >= len(old) or new_pos >= len(new):
            break
        if old[old_pos].lower() != new[new_pos].lower():
            break

        old_pos += 1
        new_pos += 1
        old_cut, new_cut = old_pos, new_pos

    # Keep old only up to the common part; take everything after it from new.
    return old[:old_cut] + new[new_cut:]

In [None]:
import re

def text_pr(old, new):
    """Merge a revised transcript `new` onto the already-shown transcript `old`.

    The two strings are compared character by character, ignoring case and
    skipping spaces, commas and periods. The result keeps `old` verbatim up to
    the end of the shared prefix and appends whatever follows that prefix in
    `new`, so cosmetic differences inside the shared part ("U S" vs "US") do
    not rewrite text that was already shown.

    Args:
        old: Text that has already been emitted (may be empty).
        new: Latest full hypothesis for the same utterance.

    Returns:
        `old` up to the last character that matches `new`, followed by the
        rest of `new`. With an empty `old` this is simply `new`.
    """
    ignored = " ,."

    # Scan cursors into the raw strings, plus the raw offsets just past the last
    # matched character. The prefix length measured on a cleaned copy cannot be
    # used to slice the raw strings: removed characters shift every index.
    old_pos = new_pos = 0
    old_cut = new_cut = 0

    while True:
        # Skip characters that do not take part in the comparison.
        while old_pos < len(old) and old[old_pos] in ignored:
            old_pos += 1
        while new_pos < len(new) and new[new_pos] in ignored:
            new_pos += 1

        if old_pos >= len(old) or new_pos >= len(new):
            break
        if old[old_pos].lower() != new[new_pos].lower():
            break

        old_pos += 1
        new_pos += 1
        old_cut, new_cut = old_pos, new_pos

    # Keep old only up to the common part; take everything after it from new.
    return old[:old_cut] + new[new_cut:]

# Example pair that differs only in the spacing of "U S" vs "US".
old = 'So you stop saving in U S government bonds and you start saving in the hardest money around, which is Bitcoin.'
new = 'So you stop saving in US government bonds and you start saving in the hardest money around, which is Bitcoin.'

text_pr(old, new)

In [None]:
# Successive hypotheses for one utterance; later entries revise earlier ones
# (for example "US" / "U.S." / "U S", or a trailing period that comes and goes).
texts = ['So you stop saving in US government.',
'So you stop saving in US government bonds.',
'So you stop saving in US government bonds and you',
'So you stop saving in US government bonds and you start',
'So you stop saving in US government bonds and you start saving.',
'So you stop saving in US government bonds and you start saving',
'So you stop saving in US government bonds and you start saving',
'So you stop saving in U.S. government bonds and you start saving in the hard',
'So you stop saving in U.S. government bonds and you start saving in the hardest way.',
'So you stop saving in US government bonds and you start saving in the hardest money around.',
'So you stop saving in U.S. government bonds and you start saving in the hardest money around.',
'So you stop saving in U.S. government bonds and you start saving in the hardest money around, which is Bitcoin.',
'So you stop saving in US government bonds and you start saving in the hardest money around, which is Bitcoin.',
'So you stop saving in US government bonds and you start saving in the hardest money around, which is Bitcoin.',
'So you stop saving in U S government bonds and you start saving in the hardest money around, which is Bitcoin.',
]

# Fold every hypothesis into the running text with text_pr and print each step.
# NOTE(review): the name `tt` is reused by a later cell for a function definition.
tt = ''
for t in texts:
    tt = text_pr(tt, t)
    print(tt)

In [None]:
ss = ['weaf', '222', '333', '444', '555']

# Wrap each entry in angle brackets and put one tag per line.
text = "\n".join(map("<{}>".format, ss))
text

In [None]:
# Membership test against a dict's keys; `'de' in {'de': 123}` gives the same answer
# without the `.keys()` call.
'de' in {'de': 123}.keys()

In [None]:
from stt.asr import load_asr_backend
# Create the project's ASR backend, selecting kind "nemo" on the CUDA device.
ASR = load_asr_backend(kind="nemo", device='cuda')

In [None]:
import librosa
import torch
import time
import numpy as np

# Load the test clip as mono audio resampled to 16 kHz.
audio, sr = librosa.load("./utils/individualAudio.wav", mono=True, sr=16000)
# audio = torch.tensor(audio).to('cuda')
# Convert the float samples to raw 16-bit PCM bytes: clip to [-1, 1], scale by 32767, cast to int16.
pcm_bytes = (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()

# Time a single transcription call. The return value is discarded; only latency is printed.
# NOTE(review): the positional `1` is presumably the channel count — confirm against transcribe_pcm.
st = time.time()
ASR.transcribe_pcm(pcm_bytes, sr, 1, language="english")
print(time.time() - st)

In [None]:
# Shape of the array most recently bound to `audio`.
audio.shape

In [None]:
from llm.openai_test import translate_simple
import os
from typing import Callable
from openai import OpenAI

# Read the API key from the environment and build the client used by translate_simple below.
# `os.environ.get` returns None when the OPENAI_KEY variable is not set.
OPENAI_KEY = os.environ.get("OPENAI_KEY")

client = OpenAI(api_key=OPENAI_KEY)

def translate_simple(prevScripts: "list[str] | str", current_scripted_sentence: str):
    """Stream a Korean → casual spoken English translation and return the full text.

    Each received text fragment is printed together with the seconds elapsed
    since the request was sent, so time-to-first-token and total latency can be
    read from the output.

    Args:
        prevScripts: Earlier utterances used as context. Normally a sequence of
            strings; a bare string is treated as one utterance, and an empty
            string or empty sequence means "no history".
        current_scripted_sentence: The Korean utterance to translate.

    Returns:
        The concatenation of all streamed content fragments.
    """
    # Local import so the function does not depend on another cell having imported `time`.
    import time

    # A plain string would otherwise be iterated character by character.
    if isinstance(prevScripts, str):
        utterances = [prevScripts] if prevScripts else []
    else:
        utterances = list(prevScripts)
    hist = "\n".join(f" me:{x}," for x in utterances)

    # NOTE(review): the instructions below describe a "<speaking english>" tag, but the
    # INPUT section labels the utterance "<speaking korean>" — confirm and align the prompt.
    st = time.time()
    response = client.chat.completions.create(
        model='gpt-4.1-mini',  # latest lightweight model
        messages=[
            {"role": "system", "content": "You are a professional translator specializing in [Korean] → [English] translation."},
            {"role": "user", "content": f"""
지금 계속 한글로 말하는걸 영어로 번역하고 있어.
<previous utterances>는 현재 문장 이전에 이야기하던 문장이야. 번역을 위한 맥락 파악에 사용할 수 있어.
<speaking english>은 번역해야하는 현재 발화야.

말을 하는걸 script로 만든 input이기 때문에, 발음 문제로 인해서 텍스트가 잘못 들어왔을 수 있어. 그걸 감안해서 번역해줘.

출력 english를 일반 글 문장보다는 실제로 사람이 말하는 것 같은 구어체로 적어줘. 예를 들어, Oh, Ah, uhm..을 쓰거나 아님 같은 단어를 두번 쓰거나 이런 것들 있잖아?
Translate into casual spoken English. 근데 너무 심하게 하진 말고, 없는 말을 만들거나 들어온 input을 왜곡하면 안돼.

-- INPUT --
<previous utterances>{hist}
<speaking korean> : {current_scripted_sentence}
<english> : 
"""}
        ],
        temperature=0.3,
        user="k2e-translator-v1-hojinkhj6051230808",
        prompt_cache_key="k2e-translator-v1-hojinkhj6051230808",
        stream=True,
    )

    sent = ''
    for chunk in response:
        # Usage-only chunks are skipped as before; chunks with an empty `choices`
        # list carry no text and would raise IndexError on `choices[0]`.
        if chunk.usage or not chunk.choices:
            continue

        choice = chunk.choices[0]
        piece = choice.delta.content
        if piece:  # skips both None and ''
            print(time.time() - st, "-", piece)
            sent += piece
        if choice.finish_reason is not None:
            print("END RETURN!", time.time() - st)
            return sent

    return sent

def tt(token):
    """No-op placeholder that accepts one argument and returns None.

    Nothing in the visible cells calls it.
    """
    pass

# End-to-end timing of one translation with an empty history.
# `result` is reused by the TTS cell further down as the text to synthesize.
sts = time.time()
result = translate_simple("", "아 아 그건 좀 아닌 것 같은데.. 오늘은 뭐 먹을까?ㅋㅋ 맛난거 먹자")
print(result, time.time() - sts)

In [None]:
from IPython.display import Audio

# NOTE(review): `load_infer_context` and `generate_sentence` are not imported in any
# visible cell — confirm which module provides them and import it here.
# Build the inference context for the "zipvoice" model from a plain config dict.
ctx = load_infer_context({
    "model_name": "zipvoice",
    "model_dir": None,
    "checkpoint_name": "model.pt",
    "vocoder_path": None,
    "tokenizer": "emilia",
    "lang": "en-us",
    "num_step": 32,
    "guidance_scale": None,
    "feat_scale": 0.1,
    "speed": 0.9,
    "t_shift": 0.5,
    "target_rms": 0.1,
})

# Synthesize `result` (the translation produced by an earlier cell), using
# ./denoised.wav and its transcript as the prompt.
wav, info = generate_sentence(
    prompt_text='Ahh you flipped on me, Oh, that smooth. Honestly, Pretty chill, just existing, you know.',
    prompt_wav_path='./denoised.wav',
    text=result,
    ctx=ctx,
)

# Play the waveform; the playback rate is fixed at 24 kHz here.
# NOTE(review): presumably the model's output rate — confirm against `info`.
display(Audio(wav, rate=24000))

In [None]:
from pydub import AudioSegment

# Load the clip and play the original file for reference.
audio = AudioSegment.from_file("./tts/voice.wav")
display(Audio("./tts/voice.wav"))
# Keep the leading part of the clip: the slice bound is 800, i.e. the first 800 ms.
# NOTE(review): the variable name `first_100ms` is stale — the slice is 800 ms, not 100 ms.
first_100ms = audio[:800]  # in milliseconds

# Save as a new file
first_100ms.export("output.wav", format="wav")
display(Audio("output.wav"))

In [None]:
from pydub import AudioSegment
# Load the .m4a recording and drop its final 500 ms (negative slice bound).
# NOTE(review): despite the name, `first_100ms` holds everything except the last 500 ms.
audio = AudioSegment.from_file("./sam.m4a")
first_100ms = audio[:-500]  # in milliseconds
# Overwrites the output.wav written by the previous cell.
first_100ms.export("output.wav", format="wav")
display(Audio("output.wav"))

In [None]:
from zipvoice.tokenizer.tokenizer import EmiliaTokenizer

# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
tokenizer = EmiliaTokenizer(token_file="/workspace/ttssocketserver/tts/tokens.txt")

# Tokenize one mixed Korean / English / Chinese string and print the resulting tokens.
print(tokenizer.texts_to_tokens(["안녕하세요, what's happening? 霍...啦啦啦超过"]))

In [None]:
import librosa
import noisereduce as nr
import soundfile as sf

# Load at the file's own sample rate (sr=None keeps the native rate, no resampling).
y, sr = librosa.load("./output.wav", sr=None)

# Use the first 0.3 s of the clip as the noise profile.
noise_clip = y[:int(sr*0.3)]

reduced_audio = nr.reduce_noise(y=y, sr=sr, y_noise=noise_clip)

sf.write("denoised.wav", reduced_audio, sr)

# Play back at the rate the samples really have. A hard-coded 48000 changes the pitch
# and speed whenever the source file is not 48 kHz.
display(Audio(reduced_audio, rate=sr))

In [None]:
from llm.openai_test import translate_simple
import time

# Reference time shared by the callback and the final print below.
st = time.time()

def tes(tk):
    """Print the seconds elapsed since the module-level `st`, followed by `tk`."""
    print(time.time() - st, tk)

# This call uses the translate_simple imported from llm.openai_test at the top of this
# cell, which is invoked with four positional arguments (unlike the two-argument
# version defined in an earlier cell).
# NOTE(review): presumably the fourth argument is a per-token callback — confirm.
res = translate_simple("", "안녕하세요 밥이나 잡수시죠?", "", tes)
print(time.time() - st)
print(res)

In [None]:
import librosa

# Reload the clip resampled to 24 kHz and print its shape and sample value range.
y, sr = librosa.load("./output.wav", sr=24000)
print(y.shape, y.min(), y.max())

In [None]:
from librosa.util import normalize

# Normalize `y` with librosa's `normalize`, scale the result by 0.95, and print the new range.
yy = normalize(y) * 0.95
print(yy.min(), yy.max())

In [None]:
import torch
import librosa

# Filler clips to preload.
filler_audios_path = ["./utils/hmhm.wav", "./utils/uhuh.wav", "./utils/ohoh.wav", "./utils/uhmuhm.wav"]
filler_audios = []
# Load each clip as mono 24 kHz and keep it as a torch tensor.
# `audiod` and `sr` stay bound to the last file after the loop; the next cell reads `audiod`.
for p in filler_audios_path:
    audiod, sr = librosa.load(p, sr=24000, mono=True)
    audiod = torch.tensor(audiod)
    filler_audios.append(audiod)

In [None]:
# Shape of the last filler clip loaded by the loop in the previous cell.
audiod.shape

In [None]:
import random
import queue

aa = queue.Queue()

# A plain `get()` on an empty queue blocks forever and hangs the kernel.
# `get_nowait()` raises queue.Empty instead, which is turned into None here.
try:
    item = aa.get_nowait()
except queue.Empty:
    item = None
item

In [None]:
from chatterbox_infer.mtl_tts import ChatterboxMultilingualTTS
# Load the multilingual Chatterbox TTS model onto the GPU.
tts_model = ChatterboxMultilingualTTS.from_pretrained(device="cuda")

In [None]:
import torch

# One-shot (non-streaming) synthesis of an English sentence, using ./a3.wav as the audio prompt.
# NOTE(review): `res` was bound to a translation string in an earlier cell; here it is rebound
# to the TTS output.
res = tts_model.generate(
        "Hm.. I'm planning to head to San Francisco... around next week, and..",
        language_id='en',
        audio_prompt_path="./a3.wav"
    )

In [None]:
from IPython.display import Audio

# Move the generated audio to the CPU, convert it to a NumPy array, and play it at 24 kHz.
# NOTE(review): 24000 is presumably the model's output sample rate — confirm.
display(Audio(res.cpu().numpy(), rate=24000))

In [None]:
import torch
import time

# Measure the time from request to the "eos" event of the streaming generator.
# Top-level `async for` needs a kernel that supports top-level await.
start_time = time.time()
async for event in tts_model.generate_stream(
        "I don't know. where are you going?",
        language_id='en',
        audio_prompt_path="./hmhm.wav"
    ):
    # Only the event whose "type" is "eos" is reported; all other events are ignored.
    if event.get("type") == "eos":
        print(time.time() - start_time)

In [None]:
import torch
import time

# Repeat of the previous measurement (identical call): time from request to the "eos" event.
# Top-level `async for` needs a kernel that supports top-level await.
start_time = time.time()
async for event in tts_model.generate_stream(
        "I don't know. where are you going?",
        language_id='en',
        audio_prompt_path="./hmhm.wav"
    ):
    # Only the event whose "type" is "eos" is reported; all other events are ignored.
    if event.get("type") == "eos":
        print(time.time() - start_time)

In [None]:
from chatterbox_infer.tts import ChatterboxTTS
# Rebind `tts_model` to the ChatterboxTTS model imported above (replacing the multilingual one).
tts_model = ChatterboxTTS.from_pretrained(device="cuda")

In [None]:
import torch
import time

# Per-event latency: print the gap between consecutive events from the streaming generator.
i=0
start_time = time.time()
async for event in tts_model.generate_stream(
        "I don't know. where are you going?",
        audio_prompt_path="./hmhm.wav"
    ):
    print(f"{i}th - ", time.time() - start_time)
    i += 1
    # Restart the clock so the next print shows the time since this event, not since the request.
    start_time = time.time()

In [None]:
import torch
import time

# Per-event latency, this time passing `language_id`.
# NOTE(review): `tts_model` was last bound to ChatterboxTTS, and the previous cell calls
# generate_stream without `language_id` — confirm the loaded model accepts this argument.
i=0
start_time = time.time()
async for event in tts_model.generate_stream(
        "I don't know. where are you going?",
        language_id='en',
        audio_prompt_path="./hmhm.wav"
    ):
    print(f"{i}th - ", time.time() - start_time)
    i += 1
    # Restart the clock so the next print shows the time since this event, not since the request.
    start_time = time.time()