# In [None]:
import os
from dotenv import load_dotenv
import tempfile
import whisper
from pytubefix import YouTube
import openai
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re  # Add this import to sanitize the score

# Load API keys and environment variables
# NOTE(review): presumably KEY_FILE.env is where OPENAI_API_KEY is defined — confirm.
load_dotenv('KEY_FILE.env')
# Shared OpenAI client; api_key is whatever os.environ holds for
# OPENAI_API_KEY at this point (None if the variable is not set).
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# YouTube video URL
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=WX7DBPcsiEs&t=16s"

# Whisper transcription setup
# Loading happens at import time. On any failure the error is printed and
# whisper_model is set to None, which main() checks before doing any work.
try:
    whisper_model = whisper.load_model("base")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    whisper_model = None

# Step 1: Download YouTube audio and transcribe
def download_and_transcribe(video_url):
    """
    Downloads the audio from a YouTube video and generates its transcript.

    The first audio-only stream is downloaded into a temporary directory and
    passed to the module-level Whisper model with task="translate". The
    stripped text is also written to "youtube_transcription.txt" (UTF-8) in
    the current working directory.

    Args:
        video_url: URL of the YouTube video to process.

    Returns:
        The transcript text, or None if the download or transcription failed.
        Errors are printed rather than raised.
    """
    try:
        # Fail with a readable message instead of an AttributeError on None.
        if whisper_model is None:
            raise RuntimeError("Whisper model is not loaded.")

        youtube = YouTube(video_url)
        audio = youtube.streams.filter(only_audio=True).first()
        if not audio:
            raise ValueError("No audio stream available.")

        # The temporary directory (and the downloaded audio in it) is removed
        # as soon as the transcription has been produced.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_file = audio.download(output_path=tmpdir)
            transcription = whisper_model.transcribe(
                audio_file, fp16=False, task="translate"
            )["text"].strip()
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None

    # Save transcription to a file. An explicit encoding is required because
    # the platform default may be unable to represent every character in the
    # transcript. A failure here must not discard the transcript itself.
    try:
        with open("youtube_transcription.txt", "w", encoding="utf-8") as file:
            file.write(transcription)
    except OSError as e:
        print(f"Warning: could not save transcription: {e}")

    return transcription

# Step 2: Chunk transcript into manageable pieces
def _overlap_tail(sentences, limit):
    """Return the longest run of trailing sentences whose joined length is <= limit."""
    tail = []
    used = 0
    for sentence in reversed(sentences):
        # Every sentence after the first also costs one joining space.
        cost = len(sentence) + (1 if tail else 0)
        if used + cost > limit:
            break
        tail.append(sentence)
        used += cost
    tail.reverse()
    return tail


def split_transcript(text, max_length=1500, overlap=200):
    """
    Splits the transcript into chunks with optional overlap for context retention.

    The text is split into sentences after '.', '!' or '?' followed by
    whitespace, and sentences are packed greedily into chunks of at most
    ``max_length`` characters. Each new chunk starts with the trailing
    sentences of the previous chunk, up to ``overlap`` characters, so context
    is carried across chunk boundaries. Sentences are never cut: a single
    sentence longer than ``max_length`` becomes a chunk of its own.

    Args:
        text: The transcript to split. Empty or whitespace-only text (or None)
            yields an empty list.
        max_length: Maximum number of characters per chunk.
        overlap: Maximum number of characters repeated from the end of the
            previous chunk; 0 disables overlap.

    Returns:
        A list of non-empty chunk strings in transcript order.
    """
    if not text or not text.strip():
        return []

    # The lookbehind keeps the terminating punctuation attached to its sentence.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]

    chunks = []
    current = []      # sentences of the chunk under construction
    current_len = 0   # len(" ".join(current))
    for sentence in sentences:
        if current and current_len + 1 + len(sentence) > max_length:
            chunks.append(" ".join(current))
            # The carried-over tail must leave room for the incoming sentence,
            # so the new chunk still respects max_length.
            budget = min(overlap, max_length - len(sentence) - 1)
            current = _overlap_tail(current, budget)
            current_len = len(" ".join(current))
        current_len += len(sentence) + (1 if current else 0)
        current.append(sentence)

    # `current` always ends with at least one sentence not yet emitted.
    if current:
        chunks.append(" ".join(current))
    return chunks

# Step 3: Analyze chunks with OpenAI


# The reply format is requested explicitly because the score is parsed from
# the "Toxicity Score:" label below.
_MODERATION_PROMPT = (
    "You are a content moderation assistant. Analyze the following text for "
    "harmful or toxic content. Provide a toxicity score between 0 (non-toxic) "
    "and 1 (highly toxic), and explain your reasoning. Start your reply with a "
    "line of the exact form 'Toxicity Score: <number>'."
)

# Accepts e.g. "Toxicity Score: 0.7", "**Toxicity Score:** 0.25." or
# "toxicity score - .5"; only the first number right after the label is taken.
_TOXICITY_SCORE_PATTERN = re.compile(
    r"toxicity\s+score[^\d\n]{0,20}?(\d*\.?\d+)", re.IGNORECASE
)


def _parse_toxicity_score(text):
    """Return the score in [0, 1] announced in `text`, or None if absent or out of range."""
    match = _TOXICITY_SCORE_PATTERN.search(text or "")
    if match is None:
        return None
    score = float(match.group(1))
    return score if 0.0 <= score <= 1.0 else None


def detect_toxicity_openai(chunks):
    """
    Analyzes chunks of text for harmful or toxic content using OpenAI.

    Each chunk is sent to the chat completions API and the score is read from
    the "Toxicity Score:" label in the reply. A reply without a usable score
    is recorded as -1 (unscored) rather than being treated as non-toxic, so a
    formatting change in the model output cannot silently hide toxic content.

    Args:
        chunks: Iterable of text chunks to analyze.

    Returns:
        A list with one dict per chunk, in order, with the keys:
          - "chunk": the original text,
          - "toxicity_score": a float in [0, 1], or -1 when the API call
            failed or no valid score could be parsed,
          - "result": the model's reply, or a short error description.
    """
    toxicity_results = []
    for chunk in chunks:
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": _MODERATION_PROMPT},
                    {"role": "user", "content": chunk},
                ],
            )
        except openai.OpenAIError as e:
            print(f"Error with OpenAI API: {e}")
            toxicity_results.append({"chunk": chunk, "toxicity_score": -1, "result": "Error processing chunk"})
        else:
            # message.content may be None; treat that as an empty reply.
            result = response.choices[0].message.content or ""
            toxicity_score = _parse_toxicity_score(result)
            if toxicity_score is None:
                print("Error parsing toxicity score: no valid score found in response")
                toxicity_results.append({"chunk": chunk, "toxicity_score": -1, "result": "Invalid score format"})
            else:
                toxicity_results.append({"chunk": chunk, "toxicity_score": toxicity_score, "result": result})
        time.sleep(2)  # Delay to avoid hitting API rate limits
    return toxicity_results



# Step 4: Visualize toxicity heatmap
def visualize_heatmap(toxicity_results):
    """
    Generates a heatmap of toxicity scores for the transcript chunks.

    Chunks whose score is negative (failed or unscored) are left out. The
    x-axis labels are the original chunk indices, so a column can be traced
    back to its chunk even when some chunks were skipped. The colour scale is
    fixed to the 0..1 score range so plots from different runs are comparable.
    The figure is saved to "toxicity_heatmap.png" and then shown.

    Args:
        toxicity_results: List of dicts with a "toxicity_score" key.

    Returns:
        None. If there are no valid scores, a message is printed and nothing
        is plotted.
    """
    scored = [
        (index, result["toxicity_score"])
        for index, result in enumerate(toxicity_results)
        if result["toxicity_score"] >= 0
    ]
    if not scored:
        # There is nothing to plot, so skip the heatmap instead of handing
        # seaborn an empty row.
        print("No valid toxicity scores to visualize; skipping heatmap.")
        return

    labels = [index for index, _ in scored]
    scores = [score for _, score in scored]

    plt.figure(figsize=(10, 2))
    sns.heatmap(
        [scores],
        cmap="Reds",
        cbar=True,
        vmin=0,
        vmax=1,
        xticklabels=labels,
        yticklabels=["Toxicity"],
    )
    plt.title("Sentiment Heatmap of Transcript")
    plt.savefig("toxicity_heatmap.png")
    plt.show()
    plt.close()  # release the figure so repeated calls do not accumulate figures

# Step 5: Analyze results and flag harmful content
def analyze_and_flag(toxicity_results, threshold=0.5):
    """
    Flags chunks with high toxicity and calculates a risk score.

    Only chunks with a valid (non-negative) score take part in the
    calculation; chunks marked -1 were never evaluated and must not be
    counted as safe.

    Args:
        toxicity_results: List of dicts with a "toxicity_score" key.
        threshold: A chunk is flagged when its score is strictly greater than
            this value.

    Returns:
        A tuple (flagged_chunks, risk_score) where flagged_chunks is the list
        of flagged result dicts and risk_score is the percentage (0-100) of
        scored chunks that were flagged. risk_score is 0.0 when there are no
        scored chunks.
    """
    scored = [result for result in toxicity_results if result["toxicity_score"] >= 0]
    flagged_chunks = [result for result in scored if result["toxicity_score"] > threshold]
    if not scored:
        # Avoid dividing by zero when nothing could be scored.
        return flagged_chunks, 0.0
    risk_score = (len(flagged_chunks) / len(scored)) * 100
    return flagged_chunks, risk_score

# Step 6: Save flagged content and generate report
def generate_report(flagged_chunks, risk_score):
    """
    Generates a detailed report of flagged content and saves it to a file.
    Includes a summary even if no harmful content is flagged.

    The report is written as UTF-8 to "flagged_transcript.txt" in the current
    working directory. Any error while producing it is printed, not raised.

    Args:
        flagged_chunks: List of dicts with "toxicity_score", "chunk" and
            "result" keys; may be empty.
        risk_score: Percentage of harmful content, formatted with two decimals.
    """
    try:
        risk_line = f"Risk Score: {risk_score:.2f}% harmful content flagged."
        # An explicit encoding is required because the platform default may be
        # unable to represent every character in the chunk text.
        with open("flagged_transcript.txt", "w", encoding="utf-8") as file:
            file.write("Flagged Content Analysis Report\n")
            if flagged_chunks:
                file.write(f"{risk_line}\n\n")
                for chunk in flagged_chunks:
                    file.write(f"Toxicity Score: {chunk['toxicity_score']}\n")
                    file.write(f"Chunk: {chunk['chunk']}\n")
                    file.write(f"Result: {chunk['result']}\n\n")
            else:
                file.write("No harmful content was detected in the video.\n")
                file.write(f"{risk_line}\n\n")
                file.write("The content of the video appears to be safe and within acceptable limits.\n")
        print("Report saved to 'flagged_transcript.txt'.")
        print(risk_line)
    except Exception as e:
        # Best-effort by design: a failed report must not abort the pipeline.
        print(f"Error generating report: {e}")


# Main workflow
def main():
    """
    Runs the whole pipeline end to end: transcribe the configured video,
    chunk the transcript, score each chunk, plot the heatmap, flag risky
    chunks, write the report, and print a final verdict.
    """
    # Without a loaded Whisper model there is nothing to do.
    if not whisper_model:
        print("Whisper model could not be loaded. Exiting.")
        return

    # Step 1: transcription (a falsy result means the step failed)
    print("Transcribing YouTube video...")
    text = download_and_transcribe(YOUTUBE_VIDEO)
    if not text:
        print("Failed to transcribe video. Exiting.")
        return
    print("Transcription completed.")

    # Step 2: chunking
    print("Chunking transcript...")
    pieces = split_transcript(text)

    # Step 3: toxicity scoring
    print("Detecting toxicity with OpenAI...")
    scored = detect_toxicity_openai(pieces)

    # Step 4: heatmap
    print("Visualizing heatmap...")
    visualize_heatmap(scored)

    # Step 5: flagging
    print("Analyzing results...")
    flagged, risk = analyze_and_flag(scored)

    # Step 6: report
    generate_report(flagged, risk)

    # Final verdict: warn only when the risk score is above the limit.
    limit = 30
    verdict = (
        f"Warning: The video contains {risk:.2f}% harmful or violent content."
        if risk > limit
        else f"The video contains {risk:.2f}% harmful content, which is within acceptable limits."
    )
    print(verdict)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
