In [5]:
# Import necessary libraries
import os
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf
import numpy as np

# Configuration for displaying plots
# `%matplotlib inline` is an IPython magic, so this cell only runs inside
# IPython/Jupyter; it is not valid syntax for a plain Python interpreter.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the figures created after this call.
sns.set(style="whitegrid")

# Path to the annotations CSV file (relative to the current working directory)
annotations_file = 'data/annotations/annotations.csv'

# Fail early with an actionable message: a relative path only resolves when the
# notebook runs from the directory that contains `data/`, and the default error
# does not report which directory was actually searched.
if not os.path.isfile(annotations_file):
    raise FileNotFoundError(
        f"Annotations file not found: '{os.path.abspath(annotations_file)}' "
        f"(current working directory: '{os.getcwd()}'). "
        "Start the notebook from the directory that contains 'data/' "
        "or update `annotations_file`."
    )

# Load the annotations
annotations = pd.read_csv(annotations_file)

# Display the first few rows of the annotations
print("Annotations:")
display(annotations.head())

# Bar chart: how many participants fall into each Korean proficiency level
level_fig, level_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=annotations, x='korean_level', ax=level_ax)
level_ax.set_title('Distribution of Korean Proficiency Levels')
level_ax.set_xlabel('Korean Proficiency Level')
level_ax.set_ylabel('Number of Participants')
plt.show()

# Bar chart: how many participants come from each country of origin
country_fig, country_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=annotations, x='country_of_origin', ax=country_ax)
country_ax.set_title('Distribution of Countries of Origin')
country_ax.set_xlabel('Country of Origin')
country_ax.set_ylabel('Number of Participants')
# Tilt the tick labels of the axes that were just drawn
plt.xticks(rotation=45)
plt.show()

# Path to the processed audio files (relative to the current working directory).
# In this cell it is only passed to os.listdir and used as the first argument of
# os.path.join, so the trailing slash does not affect the resulting paths.
processed_audio_dir = 'data/processed_audio/'

# Function to plot the waveform and spectrogram of an audio file
def plot_audio_features(file_path):
    """Draw two figures for one audio file: its waveform and its mel spectrogram.

    The file is loaded at its native sampling rate (``sr=None``). Each figure
    is shown immediately with ``plt.show()``; nothing is returned.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.
    """
    signal, sr = librosa.load(file_path, sr=None)

    # Mel power spectrogram converted to dB, relative to its maximum value
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sr)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Figure 1: waveform
    _, wave_ax = plt.subplots(figsize=(12, 4))
    librosa.display.waveshow(signal, sr=sr, ax=wave_ax)
    wave_ax.set_title('Waveform')
    wave_ax.set_xlabel('Time (s)')
    wave_ax.set_ylabel('Amplitude')
    plt.show()

    # Figure 2: spectrogram with a dB colorbar
    spec_fig, spec_ax = plt.subplots(figsize=(12, 4))
    mesh = librosa.display.specshow(
        mel_db, sr=sr, x_axis='time', y_axis='mel', ax=spec_ax
    )
    # The mappable is passed explicitly because the axes are passed explicitly
    spec_fig.colorbar(mesh, ax=spec_ax, format='%+2.0f dB')
    spec_ax.set_title('Spectrogram')
    spec_ax.set_xlabel('Time (s)')
    spec_ax.set_ylabel('Frequency (Hz)')
    plt.show()

# Example of exploring an audio file
# Join only the file name: `processed_audio_dir` already holds the directory,
# so repeating 'data/processed_audio/' here would point at a nested path
# ('data/processed_audio/data/processed_audio/...') that does not exist.
example_file = os.path.join(processed_audio_dir, 'Participant01_easy1_processed.wav')
plot_audio_features(example_file)

# Function to get the duration of an audio file
def get_audio_duration(file_path):
    """Return the duration of an audio file in seconds.

    The file is loaded at its native sampling rate (``sr=None``) and the
    duration is computed from the loaded signal.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        The duration reported by ``librosa.get_duration`` (seconds).
    """
    audio, sample_rate = librosa.load(file_path, sr=None)
    # Pass the signal by keyword: librosa >= 0.10 accepts the arguments of
    # get_duration as keywords only, so a positional `audio` raises TypeError.
    # `y=` also works on older releases.
    return librosa.get_duration(y=audio, sr=sample_rate)

# Collect every .wav file in the processed-audio directory and measure it
audio_files = [name for name in os.listdir(processed_audio_dir) if name.endswith('.wav')]
audio_paths = [os.path.join(processed_audio_dir, name) for name in audio_files]
durations = [get_audio_duration(path) for path in audio_paths]

# Histogram (with KDE overlay) of the measured durations
duration_fig, duration_ax = plt.subplots(figsize=(10, 6))
sns.histplot(durations, bins=20, kde=True, ax=duration_ax)
duration_ax.set_title('Distribution of Audio File Durations')
duration_ax.set_xlabel('Duration (s)')
duration_ax.set_ylabel('Number of Files')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'data/annotations/annotations.csv'