# Preliminaries


In [1]:
import torch
import gc


# unassign the model and free up GPU RAM
def freeup_VRAM(*args):
    """Unbind the named global variables and release cached GPU memory.

    Args:
        *args: Names (strings) of variables to delete from the globals of the
            module that defines this function (the notebook namespace when the
            function is defined in a cell), e.g.
            ``freeup_VRAM('model', 'tokenizer')``. Names that are not currently
            defined are skipped, so the call is safe on a fresh kernel.

    Prints one line per deleted name, followed by the change in memory
    reserved by PyTorch on CUDA device 0 (bytes / 1024**3). Without a usable
    CUDA device the names are still deleted and 0.0 is reported.
    """
    cuda_ready = torch.cuda.is_available()
    memory_used_before = torch.cuda.memory_reserved(0) / 1024**3 if cuda_ready else 0.0

    namespace = globals()
    for var in args:
        try:
            del namespace[var]
        except (KeyError, TypeError):
            # KeyError: the name is not defined (yet); TypeError: unhashable
            # argument. Anything else, e.g. KeyboardInterrupt, propagates.
            continue
        print(f"'{var}' deleted from memory.")

    # Run the garbage collector before emptying the allocator cache so that
    # objects kept alive only by reference cycles are released first.
    gc.collect()
    if cuda_ready:
        torch.cuda.empty_cache()

    memory_used_after = torch.cuda.memory_reserved(0) / 1024**3 if cuda_ready else 0.0
    print(f"Freed up {memory_used_before - memory_used_after:.1f} GB of VRAM.")


# Language Model


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM


def generate_text(
    prompt, do_sample=True, max_new_tokens=50, temperature=0.9, top_k=50, top_p=0.95, eos_token_id=None
):
    """Continue ``prompt`` with the globally loaded ``model`` and ``tokenizer``.

    Args:
        prompt: Text to condition on.
        do_sample: Forwarded to ``model.generate``.
        max_new_tokens: Maximum number of tokens generated after the prompt.
        temperature, top_k, top_p: Sampling settings forwarded to
            ``model.generate``.
        eos_token_id: Token id that ends generation. ``None`` falls back to
            ``model.config.eos_token_id``; an explicit ``0`` is honoured.

    Returns:
        The decoded first output sequence with special tokens skipped. The
        sequence returned by ``model.generate`` is decoded as a whole, so
        callers slice the prompt off themselves if they only want the
        continuation.
    """
    # `is None` rather than a truthiness test: token id 0 is a valid id.
    if eos_token_id is None:
        eos_token_id = model.config.eos_token_id

    encoded_input = tokenizer(prompt, return_tensors="pt")
    # Follow the model's own device instead of assuming the default GPU, so
    # the function also works when the model was placed elsewhere.
    device = model.device

    generate_kwargs = {
        "input_ids": encoded_input["input_ids"].to(device),
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "eos_token_id": eos_token_id,
    }
    # Pass the tokenizer's mask explicitly when it provides one, so generate()
    # does not have to infer it from the pad token id.
    attention_mask = encoded_input.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    output_sequences = model.generate(**generate_kwargs)
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


# Hub ids of the candidate checkpoints, ordered by the parameter count in
# their names; the loading cell selects one by index.
model_names = [
    "EleutherAI/" + checkpoint
    for checkpoint in ("gpt-neo-125M", "gpt-neo-1.3B", "gpt-neo-2.7B", "gpt-j-6B")
]


In [3]:
# Configuration: which entry of `model_names` to load, and at what precision.
model_idx = 0
load_in_8bit = False
half_precision = True

# Drop any previously loaded model/tokenizer first, so re-running this cell
# with other settings does not keep the old objects alive next to the new ones.
freeup_VRAM('model', 'tokenizer')

model_name = model_names[model_idx]
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the loading options once instead of duplicating the from_pretrained call.
load_kwargs = {"device_map": "auto"}
if load_in_8bit:
    # NOTE(review): newer transformers releases prefer passing a
    # BitsAndBytesConfig via `quantization_config` — confirm against the
    # installed version before changing this.
    load_kwargs["load_in_8bit"] = True
else:
    load_kwargs["torch_dtype"] = torch.float16 if half_precision else "auto"
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

if load_in_8bit:
    precision = "int8"
elif half_precision:
    precision = "fp16"
else:
    # torch_dtype="auto" defers to the checkpoint, which is not necessarily
    # float32, so report the dtype that was actually loaded.
    precision = str(model.dtype)
print(f"Loaded model {model_name} in {precision}")

# Inference only: put the model in evaluation mode.
model.eval()

# The model config defines an EOS token but no dedicated PAD token, so reuse
# the EOS id for padding in both the model config and the generation config.
model.config.pad_token_id = model.config.eos_token_id
model.generation_config.pad_token_id = model.config.eos_token_id


Freed up 0.0 GB of VRAM.
Loaded model EleutherAI/gpt-neo-125M in fp16


# Text-to-Speech


In [5]:
from TTS.api import TTS

# Release a previously loaded TTS model before loading another one.
freeup_VRAM('tts')

# Candidate TTS models. Pick one with `tts_model_idx`, mirroring the
# `model_names` / `model_idx` pattern of the language-model cell, instead of
# commenting constructor calls in and out.
tts_model_names = [
    "tts_models/en/ek1/tacotron2",
    "tts_models/en/ljspeech/tacotron2-DDC_ph",
    "tts_models/en/ljspeech/vits",
    "tts_models/en/ljspeech/vits--neon",
    "tts_models/en/vctk/vits",  # speaker: p243, p259, p263, p270, p306
    "tts_models/zh-CN/baker/tacotron2-DDC-GST",
    "tts_models/en/ljspeech/tacotron2-DDC",
    "tts_models/en/ljspeech/glow-tts",
    "tts_models/en/ljspeech/speedy-speech",
    "tts_models/en/ljspeech/tacotron2-DCA",
    "tts_models/en/ljspeech/fast_pitch",
    "tts_models/en/ljspeech/overflow",
    "tts_models/en/ljspeech/neural_hmm",
    "tts_models/en/vctk/fast_pitch",
    "tts_models/en/sam/tacotron-DDC",
    "tts_models/en/blizzard2013/capacitron-t2-c50",
    "tts_models/en/blizzard2013/capacitron-t2-c150_v2",
]
tts_model_idx = 4  # tts_models/en/vctk/vits

# NOTE(review): the cells further down pass speaker='p306', which is one of
# the speakers listed for vctk/vits above — revisit those calls when selecting
# a different model.
tts = TTS(model_name=tts_model_names[tts_model_idx], gpu=True)


Freed up 0.0 GB of VRAM.
 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


# Application


In [7]:
import IPython.display as ipd


# Few-shot examples prepended to every chat prompt. Each example is a
# "Chat:" line, a "Reply:" line and a "###" separator line. Written as
# adjacent literals so the trailing space after "really." stays visible.
few_shot_training_prompt = (
    "Chat: I like the weather. It's great, isn't it?\n"
    "Reply: Yes, I totally agree. I wish I could have it all the year.\n"
    "###\n"
    "Chat: Hey, you've got your hair cut. Looks nice!\n"
    "Reply: Thank you! I like it, too. You know what? It only costs me 15 bucks.\n"
    "###\n"
    "Chat: Where have you been these days? I haven't seen you around.\n"
    "Reply: I was travelling abroad with my husband. It's a lot of fun, really. \n"
    "###\n"
)

In [None]:
# Interactive chat: read a line, generate a reply with the language model,
# print the running transcript and speak the reply.
chat_history = ''
sample_chat = """Do you think I'm annoying?"""

# Token id of "###", the separator used between the few-shot examples;
# passing it as eos_token_id makes generation stop after one reply.
stop_token_id = 21017
stop_marker = "###"

while True:
    try:
        input_chat = input("You say: ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel (or a closed stdin) while waiting for input
        # ends the conversation cleanly instead of with a traceback.
        break

    # An empty line falls back to the sample question.
    chat_history += "Chat: " + (input_chat if input_chat else sample_chat) + "\nReply:"
    prompt = few_shot_training_prompt + chat_history

    # generate_text returns prompt + continuation; keep only the continuation.
    completion = generate_text(prompt, eos_token_id=stop_token_id)[len(prompt):]
    # Cut at the separator only if it is really there. When generation stops
    # at max_new_tokens, no "###" was emitted, and blindly dropping the last
    # three characters would truncate the reply itself.
    text = completion.split(stop_marker, 1)[0]

    chat_history += text + "###\n"
    print(chat_history)

    # Output the reply as audio; a blank reply has nothing to speak.
    if text.strip():
        wav = tts.tts(text=text, speaker='p306')
        # 22050 Hz is the sample_rate reported by the audio processor when the
        # TTS model was loaded. NOTE(review): adjust if another TTS model with
        # a different output rate is selected.
        ipd.display(ipd.Audio(wav, rate=22050, autoplay=True))

# Playground

In [60]:
# prompt = """If I were a cat, I would"""
# prompt = '''Once upon a time,'''
# prompt = '''飞船登机口处，一群人在那儿拼命地挥手，船上的人也在欢呼，只是每个人的目光都是那么的恋恋不舍。这是第五批运向诺顿星开拓者，仙女星系的三个星球是人类移民的天堂，那里经过三百年的建设已经形成了完备高级的生活设施，而且各方面的福利，教育，以及军事防御都是太阳系标准，甚至更高，但诺顿星……还太原始了些，尤其是还有无孔不入的扎戈族，它们就像地球上的某种古董级生物蟑螂一样的顽强，而且随着人类的进化而进化。这次是诺顿星进行第二期开发计划，愿意去这种地方的多是生活不怎么样的，希望用诺顿星的两年来改善生活，年纪都是在三四十岁甚至更老。可是在人群中却有一个看来相当相当年轻的身影，看起来顶多十四五岁，男孩望着送别区，一个红鼻子老头拼命地挥舞着自己的帽子，老人的眼圈都红了，显然是哭过，是什么理由把自己这么年轻的孩子送上这样的飞船？'''
# Reuse the shared few-shot examples rather than a second hand-copied block:
# the copy here had drifted from the original ("I with I" instead of
# "I wish I"), so the playground was not testing the prompt the chat loop uses.
prompt = few_shot_training_prompt + "Chat: Do you think I'm annoying?\nReply:"

# eos_token_id 21017 refers to "###", so generation stops after one reply.
text = generate_text(prompt, eos_token_id=21017)
print(text)


Chat: I like the weather. It's great, isn't it?
Reply: Yes, I totally agree. I with I could have it all the year.
###
Chat: Hey, you've got your hair cut. Looks nice!
Reply: Thank you! I like it, too. You know what? It only costs me 15 bucks.
###
Chat: Where have you been these days? I haven't seen you around.
Reply: I was travelling abroad with my husband. It's a lot of fun, really. 
###
Chat: Do you think I'm annoying?
Reply: Of course you are.
###


In [None]:
# for speaker in ['p243', 'p259', 'p263', 'p270', 'p306']:
#     # speaker = 'VCTK_' + speaker
#     tts.tts_to_file(text=text, file_path=f"output-{speaker}.wav", speaker=speaker)
    

In [50]:
# Synthesize the current value of `text` and write it to output.wav using
# speaker 'p306'.
# The commented-out variant omits `speaker`; presumably it is for
# single-speaker models — confirm before switching.
# tts.tts_to_file(text=text, file_path="output.wav")
tts.tts_to_file(text=text, file_path="output.wav", speaker='p306')


 > Text splitted to sentences.
['description: a red button that says stop', 'code: <button style=color:white; background-color:red;>Stop', '###', 'description: a blue box that contains yellow circles with red borders', 'code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>', '###', 'description: a Headline saying Welcome to AI', 'code: <p style=color:white;>Welcome to AI', '###', 'description: a simple text with blue and red backgrounds', 'code: <p style=color:white; background-color: blue;>Hello world!', '</']
 > Processing time: 1.025214433670044
 > Real-time factor: 0.022072012144621782


In [97]:
from IPython.lib.display import Audio

# Play output.wav inline; this must stay the last expression of the cell so
# the player is rendered as the cell output.
# NOTE(review): the commented-out in-memory variant below uses rate=27000,
# which differs from the 22050 Hz used by the chat loop — confirm the rate
# before re-enabling it.
# wav = tts.tts(text=text, speaker='p306')
# Audio(wav, rate=27000, autoplay=True)
Audio('output.wav', autoplay=True)

In [9]:
# Print PyTorch's CUDA memory report to see how much GPU memory the loaded
# models currently occupy.
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    5353 MB |    5457 MB |   22603 MB |   17249 MB |
|       from large pool |    5346 MB |    5449 MB |    9080 MB |    3733 MB |
|       from small pool |       7 MB |      44 MB |   13522 MB |   13515 MB |
|---------------------------------------------------------------------------|
| Active memory         |    5353 MB |    5457 MB |   22603 MB |   17249 MB |
|       from large pool |    5346 MB |    5449 MB |    9080 MB |    3733 MB |
|       from small pool |       7 MB |      44 MB |   13522 MB |   13515 MB |
|---------------------------------------------------------------

In [55]:
# Tokenize the "###" separator to look up the token id that the generate_text
# calls pass as eos_token_id.
encoded_text = tokenizer("###", return_tensors='pt')

In [56]:
# Show the encoding of "###" produced in the previous cell.
print(encoded_text)

{'input_ids': tensor([[21017]]), 'attention_mask': tensor([[1]])}


In [57]:
# The model's default end-of-sequence id, which generate_text falls back to
# when no eos_token_id is passed.
print(model.config.eos_token_id)

50256
