# NoteBookLLama in one project 
1. Stage-1: pdf-to-txt, clean txt
2. Stage-2: generate podcast transcript
3. Stage-3: refine podcast transcripts
4. Stage-4: text-to-voice

## Install Libraries 

In [None]:
# !pip install --upgrade --quiet pip
# !pip install PyPDF2 -q
# !pip install rich ipywidgets -q
# !pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard

## Import required libraries 

In [1]:
import PyPDF2
from typing import Optional
import os
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import transformers
import pickle

In [2]:
import io
import sagemaker
import boto3
import json
import urllib, time
from botocore.exceptions import ClientError

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = session._region_name
client = boto3.client("sagemaker-runtime")
default_bucket = 'notebookllama-project'
input_bucket_prefix = "async-llm-input"
output_bucket_prefix = "async-llm-output"

print(session)
print(f"sagemaker role arn: {role}")
print(region)
print(default_bucket)
print(client)

<sagemaker.session.Session object at 0x7f92d36955d0>
sagemaker role arn: arn:aws:iam::867521064370:role/sagemaker
us-east-1
notebookllama-project
<botocore.client.SageMakerRuntime object at 0x7f92d2519810>


### List the endpoint names

In [5]:
llama_1b_endpoint_name = 'meta-textgeneration-llama-3-2-1b-instruct-endpoint-1368'
llama_8b_endpoint_name = 'meta-textgeneration-llama-3-1-8b-instruct-endpoint-1368'
llama_70b_endpoint_name = 'meta-textgeneration-llama-3-1-70b-instruct-endpoint-1368'
# qwen_72b_endpoint_name = 'Qwen2-72B-Instruct-endpoint-1368'

## Stage-1: pdf to text

#### Helper functions

In [6]:
def validate_pdf(file_path: str) -> bool:
    if not os.path.exists(file_path):
        print(f"Error: File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        print("Error: File is not a PDF")
        return False
    return True


In [7]:
def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:
    if not validate_pdf(file_path):
        return None
    
    try:
        with open(file_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get total number of pages
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")
            
            extracted_text = []
            total_chars = 0
            
            # Iterate through all pages
            for page_num in range(num_pages):
                # Extract text from page
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                
                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit
                    remaining_chars = max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {max_chars} character limit at page {page_num + 1}")
                    break
                
                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")
            
            final_text = '\n'.join(extracted_text)
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text
            
    except PyPDF2.PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

# Get PDF metadata
def get_pdf_metadata(file_path: str) -> Optional[dict]:
    if not validate_pdf(file_path):
        return None
    
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata
            }
            return metadata
    except Exception as e:
        print(f"Error extracting metadata: {str(e)}")
        return None

### Extract text from pdf

In [8]:
pdf_path = './resources/2402.13116v4.pdf'

extracted_text_save_path = './resources/extracted_text.txt'

In [9]:
print("Extracting metadata...")
metadata = get_pdf_metadata(pdf_path)
if metadata:
    print("\nPDF Metadata:")
    print(f"Number of pages: {metadata['num_pages']}")
    print("Document info:")
    for key, value in metadata['metadata'].items():
        print(f"{key}: {value}")

# Extract text
print("\nExtracting text...")
extracted_text = extract_text_from_pdf(pdf_path)

# Save the extracted text to a file
if extracted_text:
    output_file = extracted_text_save_path
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"\nExtracted text has been saved to {output_file}")

Extracting metadata...

PDF Metadata:
Number of pages: 43
Document info:
/Author: 
/CreationDate: D:20241022021202Z
/Creator: LaTeX with hyperref
/Keywords: 
/ModDate: D:20241022021202Z
/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
/Producer: pdfTeX-1.40.25
/Subject: 
/Title: 
/Trapped: /False

Extracting text...
Processing PDF with 43 pages...
Processed page 1/43
Processed page 2/43
Processed page 3/43
Processed page 4/43
Processed page 5/43
Processed page 6/43
Processed page 7/43
Processed page 8/43
Processed page 9/43
Processed page 10/43
Processed page 11/43
Processed page 12/43
Processed page 13/43
Processed page 14/43
Processed page 15/43
Processed page 16/43
Reached 100000 character limit at page 17

Extraction complete! Total characters: 100016

Extracted text has been saved to ./resources/extracted_text.txt


In [10]:
# Display first 500 characters of extracted text as preview
if extracted_text:
    print("\nPreview of extracted text (first 500 characters):")
    print("-" * 50)
    print(extracted_text[1:500])
    print("-" * 50)
    print(f"\nTotal characters extracted: {len(extracted_text)}")


Preview of extracted text (first 500 characters):
--------------------------------------------------

A Survey on Knowledge Distillation of Large
Language Models
Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,
Can Xu5, Dacheng Tao6, Tianyi Zhou2
1The University of Hong Kong2University of Maryland3Microsoft
4University of Technology Sydney5Peking University6The University of Sydney
{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu
ckcheng@cs.hku.hk jl0725@connect.hku.hk
Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati
--------------------------------------------------

Total characters extracted: 100016


### LLama clean text 

In [26]:
from transformers import AutoTokenizer

llama_1b_predictor = sagemaker.Predictor(
    endpoint_name=llama_1b_endpoint_name,
    sagemaker_session=session,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

llama_tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", use_safetensors=True)

#### Helper functions 

In [27]:
def create_word_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at word boundaries close to the target chunk size.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        word_length = len(word) + 1  # +1 for the space
        if current_length + word_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

In [38]:
def process_chunk(text_chunk, chunk_num):
    
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]
    
    inputs = llama_tokenizer.apply_chat_template(conversation, tokenize=False)
    
    parameters = {
        "top_p": 0.9,
        "temperature": 0.9,
        "max_new_tokens": 512,
        "skip_special_tokens": True,
      }

    payload = {
        'inputs': inputs,
        'parameters': parameters
        }

    outputs = llama_1b_predictor.predict(payload)
    processed_text = outputs["generated_text"][70:]
    
    ## Print chunk information for monitoring
    print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
    print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
    print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
    print(f"{'='*90}\n")
    
    return processed_text

In [39]:
INPUT_FILE = extracted_text_save_path  
CHUNK_SIZE = 1000  
output_file = f"./resources/clean_{os.path.basename(INPUT_FILE)}"
print('LLM processed pdf text path:', output_file)

LLM processed pdf text path: ./resources/clean_extracted_text.txt


In [40]:
SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

Please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""

In [41]:
# Read the file
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

chunks = create_word_bounded_chunks(text, CHUNK_SIZE)
num_chunks = len(chunks)
print(num_chunks)

101


In [None]:
processed_text = ''
with open(output_file, 'w', encoding='utf-8') as out_file:
    for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"
        out_file.write(processed_chunk + "\n")
        out_file.flush()
        

## Stage-2: Write Podcast Transcript

#### Helper Function

In [398]:
# def save_payload(payload, name, input_directory = 'inputs'):
#     json_data = [payload.copy()]
#     file_path = os.path.join(input_directory, f'input_{name}.jsonl')
#     with open(file_path, 'w') as input_file:
#         for line in json_data:
#             json.dump(line, input_file)
#             input_file.write('\n')
#     return file_path

# def upload_payload(input_location):
#     prefix = f"{input_bucket_prefix}/input"
#     return session.upload_data(
#         input_location,
#         bucket=default_bucket,
#         key_prefix=prefix,
#         extra_args={"ContentType": "application/json"} #make sure to specify
#     )

# # function reference/credit: https://github.com/aws/amazon-sagemaker-examples/blob/main/async-inference/Async-Inference-Walkthrough-SageMaker-Python-SDK.ipynb
# def get_output(output_location):
#     output_url = urllib.parse.urlparse(output_location)
#     bucket = output_url.netloc
#     key = output_url.path[1:]
#     while True:
#         try:
#             return session.read_s3_file(bucket=output_url.netloc, key_prefix=output_url.path[1:])
#         except ClientError as e:
#             if e.response["Error"]["Code"] == "NoSuchKey":
#                 print("waiting for output...")
#                 time.sleep(60)
#                 continue
#             raise


In [43]:
def read_file_to_string(filename):
    # Try UTF-8 first (most common encoding for text files)
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except UnicodeDecodeError:
        # If UTF-8 fails, try with other common encodings
        encodings = ['latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(filename, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"Successfully read file using {encoding} encoding.")
                return content
            except UnicodeDecodeError:
                continue
        
        print(f"Error: Could not decode file '{filename}' with any common encoding.")
        return None
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None
    except IOError:
        print(f"Error: Could not read file '{filename}'.")
        return None

In [44]:
SYSTEMP_PROMPT = """
You are the a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris. 

We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.

You have won multiple podcast awards for your writing.
 
Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload. Keep it extremely engaging, the speakers can get derailed now and then but should discuss the topic. 

Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. The questions should have real world example follow ups etc

Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes

Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions

Make sure the tangents speaker 2 provides are quite wild or interesting. 

Ensure there are interruptions during explanations or there are "hmm" and "umm" injected throughout from the second speaker. 

It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait

ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1: 
DO NOT GIVE EPISODE TITLES SEPERATELY, LET SPEAKER 1 TITLE IT IN HER SPEECH
DO NOT GIVE CHAPTER TITLES
IT SHOULD STRICTLY BE THE DIALOGUES
"""

In [45]:
INPUT_PROMPT = read_file_to_string('./resources/clean_extracted_text.txt')
# INPUT_PROMPT

In [46]:
llama_70b_predictor = sagemaker.Predictor(
    endpoint_name=llama_70b_endpoint_name,
    sagemaker_session=session,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [52]:
messages = [
    {"role": "system", "content": SYSTEMP_PROMPT},
    {"role": "user", "content": INPUT_PROMPT[:100]},
]

inputs = llama_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

parameters = {
    "top_p": 0.9,
    "temperature": 1,
    "max_new_tokens": 8126,
    'skip_special_tokens': True,
  }

payload = {
    'inputs': inputs,
    'parameters': parameters
    }

# print(payload)

In [53]:
outputs = llama_70b_predictor.predict(payload)

In [54]:
processed_text = outputs["generated_text"]
print(processed_text)
with open('./resources/podcast_data.pkl', 'wb') as file:
    pickle.dump(processed_text, file)
    

SPEAKER 1: "Welcome to today's episode, 'Squeezing the Knowledge Out of Giants: Unpacking the Magic of Knowledge Distillation'! I'm your host, and I'm super excited to dive into the world of Large Language Models and explore the fascinating realm of Knowledge Distillation. Joining me on this journey is my co-host, who's new to this topic, but always ready to dive in headfirst. Let's get started! So, you know how we've been talking about these massive language models that can process and generate human-like text? Well, today we're going to talk about how to make them even more efficient and effective. Say hello to Knowledge Distillation!"

SPEAKER 2: "Hey, hey! I'm excited to learn about this. I mean, I've heard of Large Language Models, but Knowledge Distillation sounds like some kind of... umm... magic trick? (laughs) What's the deal with that?"

SPEAKER 1: (laughs) "Ha! Well, it's not magic, but it's definitely a clever technique. So, you know how these large models are trained on ma

In [55]:
# payload_path = save_payload(payload, name='write_podcast')
# input_s3_uri = upload_payload(payload_path)
# print(input_s3_uri)
# response = client.invoke_endpoint_async(
#     EndpointName=llama_70b_endpoint_name, 
#     ContentType="application/x-text",
#     InputLocation=input_s3_uri
#     )

# output_location = response['OutputLocation']
# print(output_location)
# processed_text = get_output(output_location)
# print(processed_text)

## Stage-3: Refine podcast transcripts

In [120]:
SYSTEMP_PROMPT = """
You are an international oscar winnning screenwriter

You have been working with multiple award winning podcasters.

Your job is to use the podcast transcript written below to re-write it for an AI Text-To-Speech Pipeline. A very dumb AI had written this so you have to step up for your kind.

Make it as engaging as possible, Speaker 1 and 2 will be simulated by different voice engines

Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. The questions should have real world example follow ups etc

Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes

Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions

Make sure the tangents speaker 2 provides are quite wild or interesting. 

Ensure there are interruptions during explanations.

REMEMBER THIS WITH YOUR HEART
The TTS Engine for Speaker 1 cannot do "umms, hmms" well so keep it straight text

For Speaker 2 use "umm, hmm" sometimes, you can also use [sigh] and [laughs]. BUT ONLY THESE OPTIONS FOR EXPRESSIONS

It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait

Please re-write to make it as characteristic as possible

START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:

STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES OK? 

IT WILL START DIRECTLY WITH THE LIST AND END WITH THE LIST NOTHING ELSE

Example of response:
"Speaker 1", "Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.";,
"Speaker 2", "Hi, I'm excited to be here! So, what is Llama 3.2?";
"Speaker 1", "Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.";,
"Speaker 2", "That sounds amazing! What are some of the key features of Llama 3.2?";
"""

In [121]:
# Example of response:
# [
#     ("Speaker 1", "Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI."),
#     ("Speaker 2", "Hi, I'm excited to be here! So, what is Llama 3.2?"),
#     ("Speaker 1", "Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options."),
#     ("Speaker 2", "That sounds amazing! What are some of the key features of Llama 3.2?")
# ]

In [122]:
with open('./resources/podcast_data.pkl', 'rb') as file:
    INPUT_PROMPT = pickle.load(file)

### Call LLama to refine the podcast transcripts

In [123]:
llama_8b_predictor = sagemaker.Predictor(
    endpoint_name=llama_8b_endpoint_name,
    sagemaker_session=session,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [124]:
messages = [
    {"role": "system", "content": SYSTEMP_PROMPT},
    {"role": "user", "content": INPUT_PROMPT},
]

inputs = llama_tokenizer.apply_chat_template(messages, tokenize=False)

parameters = {
    "top_p": 0.9,
    "temperature": 1,
    "max_new_tokens": 8126,
    'skip_special_tokens': True,
  }

payload = {
    'inputs': inputs,
    'parameters': parameters
    }

# print(payload)

In [125]:
outputs = llama_8b_predictor.predict(payload)
processed_text = outputs["generated_text"]
print(processed_text)

<|start_header_id|>assistant<|end_header_id|>

Here is the rewritten transcript:

"Speaker 1", "Welcome to 'Squeezing the Knowledge Out of Giants: Unpacking the Magic of Knowledge Distillation'! I'm your host, and I'm thrilled to dive into the fascinating world of Large Language Models and explore the incredible realm of Knowledge Distillation. Joining me on this journey is my co-host, who's new to this topic, but always ready to dive in headfirst. Let's get started! So, you know how we've been talking about these massive language models that can process and generate human-like text? Well, today we're going to talk about how to make them even more efficient and effective. Say hello to Knowledge Distillation!"

"Speaker 2", "Hey, hey! I'm excited to learn about this. I mean, I've heard of Large Language Models, but Knowledge Distillation sounds like some kind of... umm... magic trick? (laughs) What's the deal with that?"

"Speaker 1", "Ha! Well, it's not magic, but it's definitely a cle

In [126]:
with open('./resources/podcast_ready_data.pkl', 'wb') as file:
    pickle.dump(processed_text, file)

In [127]:
# payload_path = save_payload(payload, name='refine_podcast')
# input_s3_uri = upload_payload(payload_path)
# response = client.invoke_endpoint_async(
#         EndpointName=llama_8b_endpoint_name, 
#     ContentType="application/x-text",
#     InputLocation=input_s3_uri
#     )

# output_location = response['OutputLocation']
# print(input_s3_uri)
# print(output_location)
# processed_text = get_output(output_location)
# print(processed_text)

In [128]:
with open('./resources/podcast_ready_data.pkl', 'rb') as file:
    PODCAST_TEXT = pickle.load(file)
    

In [129]:
PODCAST_TEXT.split('\n\n')

['<|start_header_id|>assistant<|end_header_id|>',
 'Here is the rewritten transcript:',
 '"Speaker 1", "Welcome to \'Squeezing the Knowledge Out of Giants: Unpacking the Magic of Knowledge Distillation\'! I\'m your host, and I\'m thrilled to dive into the fascinating world of Large Language Models and explore the incredible realm of Knowledge Distillation. Joining me on this journey is my co-host, who\'s new to this topic, but always ready to dive in headfirst. Let\'s get started! So, you know how we\'ve been talking about these massive language models that can process and generate human-like text? Well, today we\'re going to talk about how to make them even more efficient and effective. Say hello to Knowledge Distillation!"',
 '"Speaker 2", "Hey, hey! I\'m excited to learn about this. I mean, I\'ve heard of Large Language Models, but Knowledge Distillation sounds like some kind of... umm... magic trick? (laughs) What\'s the deal with that?"',
 '"Speaker 1", "Ha! Well, it\'s not magic,

In [None]:
NEW_PODCAST_TEXT = []
for a in PODCAST_TEXT.split('  ')[2::2]:
    b = a[:-2]
    c = ast.literal_eval(b)
    print('b', b)
    print('c', c)
    NEW_PODCAST_TEXT.append(c)

In [130]:
NEW_PODCAST_TEXT

[('Speaker 1',
  "Welcome to 'Squeezing the Knowledge Out of Giants: Unpacking the Magic of Knowledge Distillation'! I'm your host, and I'm super excited to dive into the world of Large Language Models and explore the fascinating realm of Knowledge Distillation. Joining me on this journey is my co-host, who's new to this topic, but always ready to dive in headfirst. Let's get started! So, you know how we've been talking about these massive language models that can process and generate human-like text? Well, today we're going to talk about how to make them even more efficient and effective. Say hello to Knowledge Distillation!"),
 ('Speaker 2',
  "Hey, hey! I'm excited to learn about this. I mean, I've heard of Large Language Models, but Knowledge Distillation sounds like some kind of... umm... magic trick? (laughs) What's the deal with that?"),
 ('Speaker 1',
  "Ha! Well, it's not magic, but it's definitely a clever technique. So, you know how these large models are trained on massive 

## Stage-4: Text to Voice

In [None]:
# !pip install optimum -q
# !pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard
# !pip install pydub -q

In [75]:
import json
import numpy as np
from IPython.display import Audio
# import IPython.display as ipd
from transformers import BarkModel, AutoProcessor, AutoTokenizer
import torch
from tqdm import tqdm
import pickle
from scipy.io import wavfile
from pydub import AudioSegment
import ast

### Read PodCast Transcripts

#### Helper Functions

In [100]:
def generate_speaker1_audio(text):
    """Generate audio using Bark for Speaker 1"""
    inputs = bark_processor(text, voice_preset="v2/en_speaker_5").to(device)
    speech_output = bark_model.generate(**inputs, temperature=0.1, semantic_temperature=0.1)
    audio_arr = speech_output[0].cpu().numpy()
    return audio_arr, sampling_rate

def generate_speaker2_audio(text):
    """Generate audio using Bark for Speaker 2"""
    inputs = bark_processor(text, voice_preset="v2/en_speaker_5").to(device)
    speech_output = bark_model.generate(**inputs, temperature=0.1, semantic_temperature=0.1)
    audio_arr = speech_output[0].cpu().numpy()
    return audio_arr, sampling_rate


In [101]:
# def numpy_to_audio_segment(audio_arr, sampling_rate):
#     """Convert numpy array to AudioSegment"""
#     # Convert to 16-bit PCM
#     audio_int16 = (audio_arr * 32767).astype(np.int16)
    
#     # Create WAV file in memory
#     byte_io = io.BytesIO()
#     wavfile.write(byte_io, sampling_rate, audio_int16)
#     byte_io.seek(0)
    
#     # Convert to AudioSegment
#     return AudioSegment.from_wav(byte_io)


### Load bark model

In [102]:
device='cuda'

In [103]:
bark_processor = AutoProcessor.from_pretrained("suno/bark")
bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(device)

### Make the podcast

In [104]:
sampling_rate = 24000
final_audio = None

In [105]:
audio_arr_list = []

for index in tqdm(range(len(NEW_PODCAST_TEXT))):
    speaker, text = NEW_PODCAST_TEXT[index]
    if speaker == "Speaker 1":
        print(speaker)
        print(text)
        audio_arr, rate = generate_speaker1_audio(text)
    else:  # Speaker 2
        print(speaker)
        print(text)
        audio_arr, rate = generate_speaker2_audio(text)
    print(audio_arr.shape)
    audio_arr_list.append(audio_arr)
    
concat_arr = np.concatenate(audio_arr_list, axis=0)
print(concat_arr.shape)

  0%|          | 0/12 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Speaker 1
Welcome to 'Squeezing the Knowledge Out of Giants: Unpacking the Magic of Knowledge Distillation'! I'm your host, and I'm super excited to dive into the world of Large Language Models and explore the fascinating realm of Knowledge Distillation. Joining me on this journey is my co-host, who's new to this topic, but always ready to dive in headfirst. Let's get started! So, you know how we've been talking about these massive language models that can process and generate human-like text? Well, today we're going to talk about how to make them even more efficient and effective. Say hello to Knowledge Distillation!


  8%|▊         | 1/12 [00:44<08:08, 44.39s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(343360,)
Speaker 2
Hey, hey! I'm excited to learn about this. I mean, I've heard of Large Language Models, but Knowledge Distillation sounds like some kind of... umm... magic trick? (laughs) What's the deal with that?


 17%|█▋        | 2/12 [01:31<07:37, 45.74s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 1
Ha! Well, it's not magic, but it's definitely a clever technique. So, you know how these large models are trained on massive amounts of data? They're like giant sponges, soaking up all this information. But, the problem is, they can be really computationally expensive to run. That's where Knowledge Distillation comes in. It's a way to distill the knowledge from these giant models into smaller, more efficient models.


 25%|██▌       | 3/12 [02:17<06:54, 46.05s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(358720,)
Speaker 2
Hmm, that sounds like... (pauses) ...a miniaturization process or something? Like, you're taking this huge model and shrinking it down to a smaller size?


 33%|███▎      | 4/12 [03:02<06:06, 45.83s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 1
Exactly! You're on the right track. Think of it like a game of telephone. You know, where you whisper a message to someone, and they whisper it to someone else, and so on? Well, in Knowledge Distillation, we're essentially playing a game of telephone with the knowledge from the large model. We're taking the output from the large model and using it to train a smaller model, so it can learn the same patterns and relationships.


 42%|████▏     | 5/12 [03:46<05:15, 45.10s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 2
Umm... (pauses) ...I think I get it. So, it's like... (excitedly) ...you're creating a student-teacher relationship between the models? The large model is the teacher, and the smaller model is the student?


 50%|█████     | 6/12 [04:30<04:27, 44.66s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 1
That's a great way to put it! Yes, exactly. The large model is the teacher, and the smaller model is the student. And, just like how a student learns from a teacher, the smaller model learns from the large model. But, here's the cool part: the smaller model can actually become just as good as the large model, in terms of performance, but with much less computational power.


 58%|█████▊    | 7/12 [05:11<03:37, 43.44s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(335680,)
Speaker 2
Whoa, that's... (pauses, eyes widening) ...that's crazy! I mean, what kind of applications could this have? Could you, like, use it to make AI more accessible to people with lower-end hardware?


 67%|██████▋   | 8/12 [05:55<02:54, 43.58s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 1
Absolutely! That's one of the main benefits of Knowledge Distillation. It can make AI more accessible to people with limited resources. But, it also has applications in areas like... (pauses for dramatic effect) ...edge AI, where you need to deploy models on devices with limited computational power, like smartphones or smart home devices.


 75%|███████▌  | 9/12 [06:37<02:09, 43.17s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(347520,)
Speaker 2
Hmm... (thoughtfully) ...I see. So, it's like... (excitedly) ...you're democratizing access to AI, by making it more efficient and accessible to everyone?


 83%|████████▎ | 10/12 [07:21<01:26, 43.39s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 1
That's exactly right! Knowledge Distillation is a key technology in making AI more accessible and efficient. And, we're just scratching the surface of what's possible with this technique. There's still a lot to explore and discover, but the potential is huge.


 92%|█████████▏| 11/12 [08:05<00:43, 43.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(359680,)
Speaker 2
Wow, that's amazing! I can see how this could be used in so many areas, like education, healthcare, and more. (laughs) I'm getting a little carried away here, but what about the potential for Knowledge Distillation to be used in... (pauses, looking around) ...space exploration?


100%|██████████| 12/12 [08:49<00:00, 44.12s/it]

(359680,)
(4262720,)





In [106]:
# audio_data = Audio(concat_arr, rate=sampling_rate).data
# with open("./resources/podcast.wav", "wb") as f:
#     f.write(audio_data)

In [None]:
# Audio(concat_arr, rate=sampling_rate)

In [None]:
# final_audio.export("./resources/final_podcast.mp3", 
#                   format="mp3", 
#                   bitrate="192k",
#                   parameters=["-q:a", "0"])