In [None]:
pip install openai-whisper librosa scikit-learn

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nv

In [None]:

import os

import librosa
import numpy as np
import whisper
from moviepy.editor import AudioFileClip
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Step 1: Load Whisper Model for Speech Recognition
# "base" is the model name handed to whisper.load_model(). The result is stored in
# the module-level `model`, which transcribe_audio() reads when it is called, so
# this line has to run before any transcription.
model = whisper.load_model("base")

# Step 2: Transcribe Speech using Whisper
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper ``model``.

    The recognized text is echoed to stdout as a side effect.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.

    Returns:
        A ``(text, language)`` tuple taken from the ``"text"`` and
        ``"language"`` entries of the transcription result.
    """
    whisper_output = model.transcribe(audio_path)
    spoken_text = whisper_output["text"]
    detected_language = whisper_output["language"]

    print("Transcription: ", spoken_text)
    return spoken_text, detected_language

# Step 3: Extract Audio Features (MFCCs and Pitch) for Emotion Recognition
def extract_audio_features(audio_path, sample_rate=16000, n_mfcc=13, n_pitch=13):
    """Build a fixed-length feature vector (MFCC means + pitch summary) for one file.

    Args:
        audio_path: Path of the audio file; loaded with ``librosa.load``.
        sample_rate: Rate the audio is resampled to on load (default 16000).
        n_mfcc: Number of MFCC coefficients to average over time (default 13).
        n_pitch: Number of pitch features to emit (default 13). They are evenly
            spaced percentiles (0..100) of the per-frame dominant pitch.

    Returns:
        1-D ``numpy.ndarray`` of length ``n_mfcc + n_pitch`` (26 with the
        defaults): the time-averaged MFCCs followed by the pitch percentiles
        in Hz. The pitch part is all zeros when no frame has a detected pitch.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    y, sr = librosa.load(audio_path, sr=sample_rate)
    if y.size == 0:
        raise ValueError(f"No audio samples found in {audio_path!r}")

    # MFCC matrix is (n_mfcc, frames); average over the time axis.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc_features = np.mean(mfcc, axis=1)

    # piptrack returns two (frequency_bin, frame) matrices. Averaging over time
    # and slicing the first 13 rows (the previous approach) only looked at the
    # lowest frequency bins, which with librosa's default FFT size and fmin lie
    # below the pitch search range and are therefore always zero. Instead, pick
    # the pitch of the strongest bin in every frame to get a pitch contour.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    frame_index = np.arange(pitches.shape[1])
    dominant_pitch = pitches[np.argmax(magnitudes, axis=0), frame_index]

    # Frames without any detected peak report 0 Hz; drop them before summarising.
    voiced_pitch = dominant_pitch[dominant_pitch > 0]

    if voiced_pitch.size:
        pitch_features = np.percentile(voiced_pitch, np.linspace(0, 100, n_pitch))
    else:
        pitch_features = np.zeros(n_pitch)

    return np.concatenate([mfcc_features, pitch_features])

# Step 4: Train a Simple Emotion Recognition Model (You can train this model beforehand with labeled data)
def train_emotion_classifier(X=None, y=None, random_state=42):
    """Fit a linear SVM emotion classifier and the label encoder for its classes.

    Args:
        X: Optional feature matrix, one row per sample. When omitted (together
            with ``y``), 100 rows of 26 random values are generated as
            placeholder data, so predictions carry no real meaning.
        y: Optional sequence of emotion labels matching the rows of ``X``.
            When omitted, labels are drawn at random from
            'happy', 'sad', 'angry' and 'neutral'.
        random_state: Seed for the placeholder data generator, so repeated runs
            give the same classifier. Ignored when ``X`` and ``y`` are given.

    Returns:
        ``(classifier, le)``: the fitted ``SVC`` and the fitted ``LabelEncoder``
        needed to map predicted indices back to label strings.

    Raises:
        ValueError: If only one of ``X`` and ``y`` is provided.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together")

    if X is None:
        # Placeholder data only: seeded so a Restart & Run All is reproducible.
        rng = np.random.default_rng(random_state)
        X = rng.random((100, 26))  # 13 MFCC + 13 pitch features
        y = rng.choice(['happy', 'sad', 'angry', 'neutral'], size=100)

    # Encode string labels as integers for the classifier
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Train a simple classifier (Support Vector Machine)
    classifier = SVC(kernel='linear')
    classifier.fit(X, y_encoded)

    return classifier, le

# Step 5: Classify Emotion Based on Extracted Features
def classify_emotion(features, classifier, le):
    """Predict the emotion label for a single feature vector.

    Args:
        features: Feature vector for one sample.
        classifier: Object whose ``predict`` accepts a list of feature vectors.
        le: Object whose ``inverse_transform`` maps class indices to labels.

    Returns:
        The decoded label for ``features``.
    """
    # predict() works on batches, so wrap the single sample in a list.
    single_sample_batch = [features]
    predicted_indices = classifier.predict(single_sample_batch)

    decoded_labels = le.inverse_transform([predicted_indices[0]])
    return decoded_labels[0]

# Step 6: Convert MP4 to WAV using MoviePy
def convert_mp4_to_wav(mp4_path, wav_path):
    """Extract the audio track of a media file and write it as 16-bit PCM WAV.

    Args:
        mp4_path: Path of the source media file opened with ``AudioFileClip``.
        wav_path: Destination path for the WAV file.

    Returns:
        None. The result is the file written at ``wav_path``.
    """
    audio_clip = AudioFileClip(mp4_path)
    try:
        audio_clip.write_audiofile(wav_path, codec='pcm_s16le')
    finally:
        # Always release the clip's reader, even if encoding fails.
        audio_clip.close()

# Step 7: Combine Everything for Emotion-Aware Speech Recognition
def emotion_aware_speech_recognition(mp4_path, wav_path=None):
    """Run the full pipeline on one video: extract audio, transcribe, classify emotion.

    Reads the module-level ``emotion_classifier`` and ``label_encoder``, so the
    classifier must have been trained before this function is called.

    Args:
        mp4_path: Path of the input video file.
        wav_path: Optional destination for the extracted WAV audio. When
            omitted, the input's file name with a ``.wav`` extension is used,
            written to the current working directory.

    Returns:
        None. The detected emotion, transcription and language are printed.
    """
    # Step 7.1: Convert MP4 to WAV
    if wav_path is None:
        # Derive the WAV name from the input instead of a fixed file name, so
        # different videos do not overwrite each other's audio.
        stem = os.path.splitext(os.path.basename(mp4_path))[0]
        wav_path = stem + ".wav"
    convert_mp4_to_wav(mp4_path, wav_path)

    # Step 7.2: Transcribe Audio
    transcription, language = transcribe_audio(wav_path)

    # Step 7.3: Extract Audio Features for Emotion Detection
    audio_features = extract_audio_features(wav_path)

    # Step 7.4: Classify Emotion Based on Audio Features
    emotion = classify_emotion(audio_features, emotion_classifier, label_encoder)

    # Step 7.5: Output Result
    print(f"Detected Emotion: {emotion}")
    print(f"Transcription: {transcription}")
    print(f"Language Detected: {language}")

# Step 8: Train the Emotion Classifier (Only once)
# emotion_aware_speech_recognition() reads these two module-level names, so this
# line must run before the pipeline is called. By default the classifier is fitted
# on randomly generated placeholder data, so the emotion it reports is not meaningful.
emotion_classifier, label_encoder = train_emotion_classifier()

# Example usage
# NOTE: despite its name, `audio_path` holds the path of an .mp4 video; the
# pipeline converts it to WAV itself before transcribing.
audio_path = "/content/Ducky_Happy_Juice_😇🐤_#animation_#ducky_#happyjuice_#funny(360p).mp4"  # Replace with your actual file path
emotion_aware_speech_recognition(audio_path)

  if event.key is 'enter':

100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 217MiB/s]


MoviePy - Writing audio in Ducky_Happy_Juice_😇🐤_#animation_#ducky_#happyjuice_#funny(360p).wav





MoviePy - Done.
Transcription:   Happy juice. Happy juice. Say it louder. Happy juice. So happy juice. Happy juice. Is this mom is happy juice?
Detected Emotion: happy
Transcription:  Happy juice. Happy juice. Say it louder. Happy juice. So happy juice. Happy juice. Is this mom is happy juice?
Language Detected: en
