# Avatar Presentation

## Transcript Generation


In [None]:
# Generate a spoken-style transcript for each slide of a .pptx presentation using an LLM
from pptx import Presentation
import openai
import os
import json
from dotenv import load_dotenv

# Transcript generation: read every slide of the presentation, ask the LLM for
# a spoken-style transcript of its text, and save the results as JSON.

# Load environment variables (e.g. from a local .env file) so the API key does
# not have to be hard-coded in the notebook.
load_dotenv()
# Initialize OpenAI client. The key is taken from OPENAI_API_KEY; when it is
# missing the client reports the problem up front instead of sending a
# placeholder key with every request.
llm = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load the presentation
pptx_path = "MASc_Seminar_final.pptx"
presentation = Presentation(pptx_path)

# Process each slide (slide numbers are 1-based)
transcripts = []
for idx, slide in enumerate(presentation.slides, 1):
    # Extract all non-blank text from the slide. Shapes that expose no
    # ``text`` attribute are ignored.
    slide_content = []
    for shape in slide.shapes:
        shape_text = (getattr(shape, "text", "") or "").strip()
        if shape_text:
            slide_content.append(shape_text)

    # Nothing to narrate on a slide without text.
    if not slide_content:
        continue

    # Join all text content
    full_content = "\n".join(slide_content)

    # Generate transcript using LLM
    prompt = f"""Please generate a natural, conversational transcript for the following presentation slide content. 
    Make it sound like someone giving a presentation, with proper transitions and explanations.
    
    Slide content:
    {full_content}
    
    Transcript:"""

    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
    )

    # The message content may be None (no text returned); guard against that
    # instead of crashing on ``None.strip()`` and losing all earlier work.
    content = response.choices[0].message.content
    transcript = (content or "").strip()
    if not transcript:
        print(f"No transcript returned for slide {idx}; skipping")
        continue

    # Store results
    slide_data = {
        "slide_number": idx,
        "original_content": full_content,
        "transcript": transcript
    }
    transcripts.append(slide_data)

    print(f"Processed slide {idx}")

# Save transcripts to a JSON file (UTF-8, non-ASCII characters kept readable)
output_file = "presentation_transcripts.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(transcripts, f, indent=2, ensure_ascii=False)

print(f"\nTranscripts have been saved to {output_file}")

# Print first transcript as example
if transcripts:
    print("\nExample transcript for first slide:")
    print(transcripts[0]["transcript"])

## Voice Generation

In [None]:
import json
import requests
import os
import time
from pathlib import Path

# Voice generation: turn every saved slide transcript into an MP3 file using
# the Minimax text-to-audio API.

# Load the transcripts
with open("presentation_transcripts.json", "r", encoding="utf-8") as f:
    transcripts = json.load(f)

# Create output directory for audio files if it doesn't exist
output_dir = Path("audio_files")
output_dir.mkdir(exist_ok=True)

# Minimax TTS API configuration
api_url = "https://api.minimax.chat/v1/t2a_v2"
# Prefer the MINIMAX_API_KEY environment variable; the placeholder is kept as
# a fallback so the key can still be pasted in here directly.
api_key = os.getenv("MINIMAX_API_KEY", "your-api-key")

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Upper bound (seconds) for a single request so a stalled connection cannot
# hang the whole run.
request_timeout = 60

# Process each transcript
for slide_data in transcripts:
    slide_number = slide_data["slide_number"]
    transcript = slide_data["transcript"]

    # There is nothing to synthesize for a missing or blank transcript.
    if not isinstance(transcript, str) or not transcript.strip():
        print(f"No transcript text for slide {slide_number}; skipping")
        continue

    # Prepare the request payload
    payload = {
        "model": "speech-02-hd",
        "text": transcript,
        "stream": False,
        "language_boost": "auto",
        "output_format": "hex",
        "voice_setting": {
            "voice_id": "male-qn-qingse",
            "speed": 1,
            "vol": 1,
            "pitch": 0,
            "emotion": "happy"
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": "mp3"
        }
    }

    # Make the API request
    try:
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=request_timeout
        )
        response.raise_for_status()  # Raise an exception for HTTP errors

        body = response.json()
        if not isinstance(body, dict):
            body = {}

        # The T2A v2 response nests the hex-encoded audio under "data";
        # a top-level "audio" key is still honoured as a fallback.
        data = body.get("data")
        audio_data = data.get("audio") if isinstance(data, dict) else None
        if not audio_data:
            audio_data = body.get("audio")

        if audio_data:
            # Convert hex to binary and save to file
            audio_binary = bytes.fromhex(audio_data)
            output_file = output_dir / f"slide_{slide_number}.mp3"

            with open(output_file, "wb") as f:
                f.write(audio_binary)

            print(f"Generated audio for slide {slide_number}")

            # Add a small delay to avoid rate limiting
            time.sleep(1)
        else:
            # NOTE(review): API-level failures appear to be reported in
            # "base_resp" on an HTTP 200 response -- confirm against the
            # Minimax docs. Surface it when present to aid debugging.
            base_resp = body.get("base_resp")
            detail = f" (API response: {base_resp})" if base_resp else ""
            print(f"No audio data received for slide {slide_number}{detail}")

    # Network/HTTP failures, undecodable JSON or hex, and file-write errors
    # are reported per slide so one bad slide does not stop the rest.
    except (requests.RequestException, ValueError, OSError) as e:
        print(f"Error generating audio for slide {slide_number}: {str(e)}")

print("Audio generation complete!")