In [None]:
!pip install pydub

In [None]:
!pip install ffprobe-python

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import numpy as np
from pydub import AudioSegment
from ffprobe import FFProbe
import torch
from torch.nn.functional import softmax
import os
import pandas as pd

In [None]:
# Hugging Face Hub identifiers: the audio-classification checkpoint used for
# prediction, and the checkpoint whose preprocessing config builds the
# feature extractor.
EMOTION_MODEL_CHECKPOINT = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
FEATURE_EXTRACTOR_CHECKPOINT = "facebook/wav2vec2-large-xlsr-53"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(FEATURE_EXTRACTOR_CHECKPOINT)
model1 = AutoModelForAudioClassification.from_pretrained(EMOTION_MODEL_CHECKPOINT)

In [None]:
!unzip /content/segment_audios.zip

In [None]:
def predict_emotion(audio_file):
    """Classify the emotion expressed in an audio file.

    The file is decoded with pydub, downmixed to mono, resampled to 16 kHz,
    passed through the module-level ``feature_extractor`` and ``model1``, and
    the resulting logits are converted to softmax percentages.

    Args:
        audio_file: Path (or file-like object) accepted by
            ``AudioSegment.from_file``.

    Returns:
        dict: Emotion label -> percentage (0-100), ordered from the highest
        to the lowest percentage.
    """
    sound = AudioSegment.from_file(audio_file)
    # get_array_of_samples() interleaves the channels of multi-channel audio,
    # so downmix to mono first; otherwise a stereo file would be fed to the
    # model as a single signal made of alternating left/right samples.
    sound = sound.set_channels(1).set_frame_rate(16000)
    sound_array = np.array(sound.get_array_of_samples(), dtype=np.float32)

    inputs = feature_extractor(
        raw_speech=sound_array,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt")
    input_values = inputs.input_values.type(torch.float32)

    # Inference only: disable autograd so no computation graph is kept alive.
    with torch.no_grad():
        logits = model1(input_values).logits

    # Softmax over the class dimension, then scale to percentages
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    percentages = probabilities.cpu().numpy().flatten() * 100

    # Class index -> emotion label
    id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }

    # Combine the labels with their corresponding percentages
    emotion_percentages = {id2label[str(i)]: percentage for i, percentage in enumerate(percentages)}

    # Order the mapping from the most to the least likely emotion
    sorted_emotion_percentages = dict(sorted(emotion_percentages.items(), key=lambda item: item[1], reverse=True))

    return sorted_emotion_percentages

In [None]:
def convert_logits_to_percentages(prediction):
  """Apply softmax to the values of a label -> score mapping.

  Despite the name, the returned values are softmax fractions in [0, 1]
  (they are not multiplied by 100).

  Args:
    prediction: Mapping of label -> numeric score.

  Returns:
    dict: label -> softmax value, ordered from the largest value to the
    smallest (ties keep their input order).
  """
  labels = list(prediction)
  scores = torch.tensor([prediction[label] for label in labels])

  # Softmax across the single dimension holding all scores
  fractions = softmax(scores, dim=0).numpy()

  # Pair every label with its softmax value and rank in descending order
  ranked = sorted(zip(labels, fractions), key=lambda pair: pair[1], reverse=True)

  return dict(ranked)

In [None]:
base_directory = '/content/segment_audios'


# Iterate over each 'youtubeX_segments' folder and write one CSV per folder
for folder in sorted(os.listdir(base_directory)):
    folder_path = os.path.join(base_directory, folder)
    # Make sure we're working with a directory
    if not os.path.isdir(folder_path):
        continue
    print("Processing " + folder)

    # List all mp3 files in the folder
    files = sorted(f for f in os.listdir(folder_path) if f.endswith('.mp3'))

    # One record per segment, keyed by emotion label, so every value is bound
    # to its own column regardless of the order of the prediction dict (which
    # is sorted by score and therefore differs from segment to segment).
    csv_rows = []
    for file in files:
        # Full path to the segment file
        audio_file_path = os.path.join(folder_path, file)
        # predict_emotion already returns softmax percentages (0-100); scale
        # them to probabilities (0-1) rather than applying softmax again.
        emotion_prediction = predict_emotion(audio_file=audio_file_path)
        row = {'segment_file': f'{folder}_{file}'}
        row.update({emotion: float(percentage) / 100.0
                    for emotion, percentage in emotion_prediction.items()})
        csv_rows.append(row)
        print("Add row for " + f'{folder}_{file}')

    # Nothing to write for a folder without mp3 segments
    if not csv_rows:
        print("No mp3 segments found in " + folder)
        continue

    # Convert the records to a DataFrame with the emotion columns in
    # alphabetical order after the file name
    df = pd.DataFrame(csv_rows)
    emotion_columns = sorted(column for column in df.columns if column != 'segment_file')
    df = df[['segment_file'] + emotion_columns]

    # Save to CSV
    csv_name = "emotion_predictions " + f'{folder}.csv'
    df.to_csv('/content/' + csv_name, index=False)

In [None]:
import pandas as pd
import glob

# NOTE(review): the prediction loop writes its CSVs to /content/, so the files
# matched here were presumably moved or post-processed by hand - confirm.
file_pattern = "/content/audio_emotion/*.csv"
# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible across runs.
file_list = sorted(glob.glob(file_pattern))
if not file_list:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; fail early and name the pattern that matched nothing.
    raise FileNotFoundError(f"No CSV files match {file_pattern!r}")

# Read each of the CSV files into a list of DataFrames
dfs = [pd.read_csv(file) for file in file_list]

# Concatenate all the DataFrames into one; it is written to disk in a later cell
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Map the emotion labels to their audio_* column names in a single rename
# call; labels that are not present in the frame are left untouched.
# NOTE(review): 'calm' has no entry here - confirm that is intentional.
audio_column_names = {
    'angry': 'audio_anger',
    'disgust': 'audio_disgust',
    'fearful': 'audio_fear',
    'happy': 'audio_happiness',
    'neutral': 'audio_neutral',
    'sad': 'audio_sadness',
    'surprised': 'audio_surprise',
}
combined_df = combined_df.rename(columns=audio_column_names)

In [None]:
# Preview the first rows of the combined frame to check the renamed columns
combined_df.head()

Unnamed: 0,video_id,audio_anger,audio_disgust,audio_fear,audio_happiness,audio_neutral,audio_sadness,audio_surprise
0,youtube8_segments_segment_0.mp3,0.541744,0.080928,0.079554,0.047709,0.029272,0.021951,0.020403
1,youtube8_segments_segment_1.mp3,0.436936,0.104587,0.084405,0.060152,0.032314,0.027693,0.018113
2,youtube8_segments_segment_2.mp3,0.510702,0.106545,0.095304,0.049503,0.034129,0.024019,0.019587
3,youtube8_segments_segment_3.mp3,0.464592,0.128863,0.110034,0.060575,0.030673,0.024038,0.022259
4,youtube8_segments_segment_4.mp3,0.477547,0.110294,0.083829,0.051282,0.031823,0.025334,0.023851


In [None]:
# Write the combined, renamed predictions to disk; index=False leaves the
# DataFrame index out of the file
combined_df.to_csv('combined_file.csv', index=False)