# Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pathlib import Path

# Constants

In [None]:
# Show every column and use a wide console so printed frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Directory containing the per-language '<code>_features.csv' files read by load_csv().
FEATURES_DIR = Path('../data/features')
# When True, load_csv() passes every loaded frame through filter_out_teens().
SKIP_TEENS = False
# NOTE(review): not referenced by any cell visible in this notebook section — confirm it is still needed.
SEPARATE_TEENS_PLOTS = False

# Line/bar colours keyed by gender.
COLORS = {'male': '#4682B4', 'female': '#F3A5B1'}

# Languages to analyse; every analysis cell iterates over this list.
# NOTE(review): 'id' used to be listed twice, which loaded and plotted that
# language twice. The duplicate was removed; if the second entry was a typo for
# another language code, add that code here.
LANGUAGE_CODES = [
    'ar',
    'es',
    'hi',
    'id',
    'ja',
    'ru',
    'sw',
    'tr',
    'zh-CN'
]

In [None]:
def load_csv(language_code):
    """Read the feature table of a single language.

    The file is expected at ``FEATURES_DIR / '<language_code>_features.csv'``.

    Args:
        language_code: Language code used as the file-name prefix.

    Returns:
        The loaded DataFrame (passed through ``filter_out_teens`` when
        ``SKIP_TEENS`` is truthy), or ``None`` after printing a hint when the
        file does not exist.
    """
    csv_path = FEATURES_DIR / f'{language_code}_features.csv'

    if csv_path.exists():
        features = pd.read_csv(csv_path)
        # Optional global switch: drop the teen rows right after loading.
        return filter_out_teens(features, lang_code=language_code) if SKIP_TEENS else features

    print(f"File {language_code}_features.csv does not exist. Please run the feature extraction script first.")
    return None


def split_by_gender(df: pd.DataFrame, gender_column='gender'):
    """Split a frame into male and female rows.

    A row is male when its gender label starts with ``'male'`` and female when
    it starts with ``'female'`` (case-sensitive prefix match). Rows with any
    other label, or with a missing label, end up in neither frame.

    Args:
        df: Frame containing a string gender column.
        gender_column: Name of that column.

    Returns:
        Tuple ``(df_male, df_female)``.
    """
    gender = df[gender_column]

    # na=False: a missing gender must yield False, otherwise the mask contains
    # NaN and boolean indexing raises.
    df_male = df[gender.str.startswith('male', na=False)]
    df_female = df[gender.str.startswith('female', na=False)]

    return df_male, df_female


def filter_out_teens(df, age_column='age', age_group='youth', lang_code=''):
    """Drop the rows whose age label contains ``age_group`` (case-insensitive).

    Rows with a missing age label are kept.

    Args:
        df: Frame containing a string age column.
        age_column: Name of that column.
        age_group: Pattern looked for inside the age label (interpreted as a
            regular expression by ``Series.str.contains``).
            NOTE(review): the summary cell of this notebook counts teens via
            the label 'teens' — confirm 'youth' is the label present in the data.
        lang_code: Only used in the printed message.

    Returns:
        The filtered frame; the input frame is not modified.
    """
    rows = len(df)

    # na=False: a missing age counts as "not in the group"; without it the mask
    # holds NaN and the `~` inversion raises a TypeError.
    in_age_group = df[age_column].str.contains(age_group, case=False, na=False)
    df_filtered = df[~in_age_group]

    print(f"Dropped {rows - len(df_filtered)} records of {age_group} from the dataset for {lang_code}")

    return df_filtered


def calculate_histogram_and_bins_for_genders(df_male, df_female, column, bins=30):
    """Compute density histograms of ``column`` separately for both genders.

    Each gender is binned independently with ``np.histogram(..., density=True)``,
    so the two results generally do not share bin edges.

    Args:
        df_male: Frame with the male rows.
        df_female: Frame with the female rows.
        column: Name of the numeric column to bin.
        bins: Forwarded to ``np.histogram``.

    Returns:
        ``((male_density, female_density), (male_bin_centers, female_bin_centers))``.
    """
    def _density_and_centers(values):
        density, edges = np.histogram(values, bins=bins, density=True)
        # Midpoint of every pair of neighbouring bin edges.
        midpoints = 0.5 * (edges[1:] + edges[:-1])
        return density, midpoints

    male_counts, male_bin_centers = _density_and_centers(df_male[column])
    female_counts, female_bin_centers = _density_and_centers(df_female[column])

    return (male_counts, female_counts), (male_bin_centers, female_bin_centers)

# Basic Data Analysis

In this step we load the data and perform some basic data analysis, like:
- counting the number of males and females
- calculating male to female ratio
- counting the number of teens (age < 20) and adults (age >= 20)

Then we plot the counts to visualize how the rows are distributed across languages and genders.

In [None]:
# Build one summary row per language: total rows, gender counts,
# male-to-female percentage and teen/adult counts.
summary_data = []

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # load_csv() returns None when the CSV is missing; skip that language.
    if not isinstance(df_features, pd.DataFrame):
        continue

    num_rows = len(df_features)

    df_male, df_female = split_by_gender(df_features)

    male_count = len(df_male)
    female_count = len(df_female)

    # Males per 100 females. NaN (instead of a ZeroDivisionError) when the
    # language has no female rows at all.
    male_to_female_ratio = male_count / female_count * 100 if female_count else np.nan

    # Age classification: a row is a teen when its age label contains 'teens'
    # (case-insensitive); every other row, including missing ages, is an adult.
    is_teen = df_features['age'].astype(str).str.lower().str.contains('teens', regex=False)
    teens_count = int(is_teen.sum())
    adults_count = num_rows - teens_count

    summary_data.append([
        lang_code, num_rows, male_count, female_count, male_to_female_ratio, teens_count, adults_count
    ])

summary_columns = ['lang_code', 'num_rows', 'male_count', 'female_count', 'male_to_female_ratio', 'teens_count', 'adults_count']

summary_df = pd.DataFrame(summary_data, columns=summary_columns)

In [None]:
print(summary_df)

# Plot: Number of Rows by Language
# Stacked bars: the male count sits at the bottom, the female count on top of it.
fig, ax = plt.subplots(figsize=(9, 6))

languages = summary_df['lang_code']
male_bar_heights = summary_df['male_count']

ax.bar(languages, male_bar_heights, label='Male', color=COLORS['male'])
ax.bar(languages, summary_df['female_count'], bottom=male_bar_heights, label='Female', color=COLORS['female'])

ax.set_title('Number of rows by language')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
ax.legend()

plt.show()

# Data Exploration

## 3.1 Pitch

### 3.1.1 Pitch per language

#### 3.1.1.1 Histograms

In [None]:
# TODO - NORMALIZE PLOT RANGES

# One figure per language: density of 'pitch_mean' for males and females.
marker_line = dict(linewidth=2, marker='o', markersize=4)

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    (male_counts, female_counts), (male_bin_centers, female_bin_centers) = \
        calculate_histogram_and_bins_for_genders(df_male, df_female, 'pitch_mean')

    plt.plot(male_bin_centers, male_counts, label=f'Male {lang_code}', color=COLORS['male'], **marker_line)
    plt.plot(female_bin_centers, female_counts, label=f'Female {lang_code}', color=COLORS['female'], **marker_line)

    plt.title(f'Mean Pitch Distribution for {lang_code}')
    plt.xlabel('Mean Pitch')
    plt.ylabel('Density')
    plt.legend()

    plt.show()

#### 3.1.1.2 Box plots

In [None]:
# One box plot per language comparing 'pitch_mean' of females and males.
gender_palette = {'Female': COLORS['female'], 'Male': COLORS['male']}

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    # Long format: all female values first, then all male values, each with its label.
    pitch_values = pd.concat([df_female['pitch_mean'], df_male['pitch_mean']], ignore_index=True)
    gender_labels = ['Female'] * len(df_female) + ['Male'] * len(df_male)
    df_gender_pitch = pd.DataFrame({'Pitch Mean': pitch_values, 'Gender': gender_labels})

    # TODO - print out the mean, median, std etc. because the boxplot is not very informative for some languages

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x='Gender', y='Pitch Mean', data=df_gender_pitch, hue='Gender', dodge=False,
                palette=gender_palette, legend=False, ax=ax)

    ax.set_title(f'Mean pitch distribution by gender for {lang_code}')
    ax.set_xlabel('Gender')
    ax.set_ylabel('Mean pitch')

    plt.show()


### 3.1.2 Pitch across all languages

#### 3.1.2.1 Histograms

In [None]:
# Two side-by-side panels (males / females) with one density line per language.
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

ax_male: plt.Axes = axes[0]
ax_female: plt.Axes = axes[1]

ax_male.set_title('Mean pitch distribution for males across all languages')
ax_female.set_title('Mean pitch distribution for females across all languages')

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    counts, centers = calculate_histogram_and_bins_for_genders(df_male, df_female, 'pitch_mean')

    male_counts, female_counts = counts
    male_bin_centers, female_bin_centers = centers

    ax_male.plot(male_bin_centers, male_counts, label=lang_code, linewidth=2, marker='o', alpha=0.7, markersize=4)
    ax_female.plot(female_bin_centers, female_counts, label=lang_code, linewidth=2, marker='o', alpha=0.7, markersize=4)

# Label each panel explicitly: plt.xlabel()/plt.ylabel() only touch the current
# axes and would leave the other panel without labels.
for ax in (ax_male, ax_female):
    ax.set_xlabel('Mean Pitch')
    ax.legend()

# The y axis is shared, so a single label on the left panel is enough.
ax_male.set_ylabel('Density')

plt.tight_layout()
plt.show()

#### 3.1.2.1 Heatmaps

In [None]:
# Average 'pitch_mean' per language (rows) and gender (columns), shown as a heatmap.
pitch_rows = {}

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    male_pitch_mean = df_male['pitch_mean'].mean()
    female_pitch_mean = df_female['pitch_mean'].mean()

    pitch_rows[lang_code] = [female_pitch_mean, male_pitch_mean]

# Build the frame in one go instead of growing an empty (object-dtype) frame
# row by row, so the values handed to the heatmap are plain numeric columns.
pitch_means = pd.DataFrame.from_dict(pitch_rows, orient='index', columns=['Female', 'Male'])

plt.figure(figsize=(10, 6))
sns.heatmap(pitch_means, annot=True, cmap='viridis', cbar_kws={'label': 'Mean Pitch (Hz)'}, fmt=".2f")

plt.title('Mean Pitch by Gender and Language')
plt.xlabel('Gender')
plt.ylabel('Language')

plt.tight_layout()
plt.show()


#### 3.1.2.2 Violin plots

In [None]:
# Violin plots of 'pitch_mean' per language, one panel per gender.
language_frames = []

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    language_frames.append(
        df_features.assign(language=lang_code)[['language', 'pitch_mean', 'gender']]
    )

# Concatenate once (instead of once per language) and renumber the rows.
if language_frames:
    all_data = pd.concat(language_frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=['language', 'pitch_mean', 'gender'])

# Match on the START of the label: a plain substring search for 'male' also
# hits 'female' and would put the female rows into the male panel.
is_male = all_data['gender'].str.match('male', case=False, na=False)
is_female = all_data['gender'].str.match('female', case=False, na=False)

_, axes = plt.subplots(2, 1, figsize=(15, 12))

ax_male: plt.Axes = axes[0]
ax_female: plt.Axes = axes[1]

sns.violinplot(x='language', y='pitch_mean', hue='language',
               data=all_data[is_male],
               palette='muted', legend=False, ax=ax_male)

sns.violinplot(x='language', y='pitch_mean', hue='language',
               data=all_data[is_female],
               palette='muted', legend=False, ax=ax_female)

ax_male.set_title('Pitch Mean Distribution by Language (Male)')
ax_female.set_title('Pitch Mean Distribution by Language (Female)')

# Label both panels explicitly; the plotted quantity is the pitch mean.
for ax in (ax_male, ax_female):
    ax.set_xlabel('Language')
    ax.set_ylabel('Pitch Mean')

plt.tight_layout()
plt.show()

### 3.1.3 Pitch mean statistics

In [None]:
# TODO - plot is not very informative, maybe use a different plot type or style

# (Re)create the shared per-language statistics table; later cells add their
# own columns to it.
summary_stats = pd.DataFrame()

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    pitch_column = df_features['pitch_mean']

    summary_stats.loc[lang_code, 'pitch_mean'] = pitch_column.mean()
    summary_stats.loc[lang_code, 'pitch_min'] = pitch_column.min()
    summary_stats.loc[lang_code, 'pitch_max'] = pitch_column.max()

pitch_stats = summary_stats[["pitch_mean", "pitch_min", "pitch_max"]]

# Pass the size to DataFrame.plot(): it opens its own figure when no `ax` is
# given, so a preceding plt.figure() would only leave an empty figure behind.
ax = pitch_stats.plot(kind='bar', figsize=(8, 5))

ax.set_title("Mean, Min, Max of Pitch by Language")
ax.set_ylabel("Pitch (Hz)")
ax.legend(loc='upper right')

# Write each bar's height above the bar.
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}',
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                fontsize=9, color='black', rotation=0)

plt.tight_layout()
plt.show()


## 3.2 Spectral Centroid

### 3.2.1 Spectral Centroid per language

#### 3.2.1.1 Histograms/Plots

In [None]:
# TODO - normalize the plot ranges

# One figure per language: density of 'spectral_centroid_mean' for females and males.
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    # This section is about the spectral centroid, so bin that column
    # (the cell previously binned 'pitch_mean' under spectral-centroid labels).
    counts, centers = calculate_histogram_and_bins_for_genders(df_male, df_female, 'spectral_centroid_mean')

    male_counts, female_counts = counts
    male_bin_centers, female_bin_centers = centers

    plt.plot(female_bin_centers, female_counts, label=f'Female {lang_code}', color=COLORS['female'], linewidth=2, marker='o', markersize=4)
    plt.plot(male_bin_centers, male_counts, label=f'Male {lang_code}', color=COLORS['male'], linewidth=2, marker='o', markersize=4)

    plt.title(f'Mean Spectral Centroid Distribution for {lang_code}')
    plt.xlabel('Mean Spectral Centroid')
    plt.ylabel('Density')
    plt.legend()

    plt.show()

### 3.2.2 Spectral Centroid Across All Languages

#### 3.2.2.1 Histograms/Plots

In [None]:
# Two side-by-side panels (males / females) with one density line per language.
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

ax_male: plt.Axes = axes[0]
ax_female: plt.Axes = axes[1]

ax_male.set_title('Mean spectral centroid distribution for males across all languages')
ax_female.set_title('Mean spectral centroid distribution for females across all languages')

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    counts, centers = calculate_histogram_and_bins_for_genders(df_male, df_female, 'spectral_centroid_mean')

    male_counts, female_counts = counts
    male_bin_centers, female_bin_centers = centers

    ax_male.plot(male_bin_centers, male_counts, label=lang_code, linewidth=2, marker='o', alpha=0.7, markersize=4)
    ax_female.plot(female_bin_centers, female_counts, label=lang_code, linewidth=2, marker='o', alpha=0.7, markersize=4)

# The x axis shows the spectral centroid (it was labelled 'Mean Pitch').
# Label each panel explicitly: plt.xlabel()/plt.ylabel() only touch the current axes.
for ax in (ax_male, ax_female):
    ax.set_xlabel('Mean Spectral Centroid')
    ax.legend()

# The y axis is shared, so a single label on the left panel is enough.
ax_male.set_ylabel('Density')

plt.tight_layout()
plt.show()


In [None]:
# Kernel density estimate of 'spectral_centroid_mean', one curve per language,
# restricted to rows whose gender label contains 'male' or 'female'.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Mean Spectral Centroid Distribution Across All Languages')
ax.set_xlabel('Mean Spectral Centroid')
ax.set_ylabel('Density')
sns.set_style('whitegrid')

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    has_gender = df_features['gender'].str.contains('male|female', case=False, regex=True)
    df_filtered = df_features[has_gender]

    sns.kdeplot(df_filtered['spectral_centroid_mean'], label=lang_code, linewidth=2, alpha=0.7, ax=ax)

ax.legend(title='Language')
ax.grid(True)

plt.show()

#### 3.2.2.2 Normalized plots

In [None]:
def _peak_normalized_histogram(values, bins=50):
    """Return ``(bin_centers, counts / counts.max())`` for a non-empty, NaN-free series."""
    counts, bin_edges = np.histogram(values, bins=bins, density=False)
    bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])
    return bin_centers, counts / counts.max()


# Load every language once and keep only what both figures need. Languages
# without a CSV or without usable values are left out here, so they can neither
# cause a division by zero nor leave a blank panel in the grid below.
centroid_by_lang = {}

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    if not isinstance(df_features, pd.DataFrame):
        continue

    # Filter for only male and female (rows with a missing gender are dropped).
    df_filtered = df_features[df_features['gender'].str.contains('male|female', case=False, regex=True, na=False)]
    centroid_values = df_filtered['spectral_centroid_mean'].dropna()

    if centroid_values.empty:
        continue

    centroid_by_lang[lang_code] = centroid_values

# --- Figure 1: all languages in one axes, lightly smoothed -------------------
plt.figure(figsize=(14, 7))
plt.title('Peak-Normalized Spectral Centroid Distribution Across All Languages')
plt.xlabel('Spectral Centroid')
plt.ylabel('Normalized Density (Peak = 1.0)')
sns.set_style('whitegrid')

# Define a color palette and line styles for variety
palette = sns.color_palette("husl", len(centroid_by_lang))
line_styles = ['-', '--', '-.', ':']

for idx, (lang_code, centroid_values) in enumerate(centroid_by_lang.items()):
    bin_centers, peak_normalized_counts = _peak_normalized_histogram(centroid_values)

    # Apply a simple 3-bin rolling mean for smoothness
    smooth_counts = np.convolve(peak_normalized_counts, np.ones(3) / 3, mode='same')

    plt.plot(bin_centers, smooth_counts, label=lang_code,
             color=palette[idx], linestyle=line_styles[idx % len(line_styles)],
             linewidth=2, alpha=0.8)

# Add legend and limit x-axis for focus on main data
plt.legend(title='Language', loc='upper right')
plt.xlim(0, 5000)
plt.grid(True)
plt.show()

# --- Figure 2: one small panel per language ----------------------------------
num_languages = len(centroid_by_lang)
cols = 3  # Adjust the number of columns as desired
rows = max(1, -(-num_languages // cols))  # ceiling division, at least one row

fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 4), sharex=True, sharey=True, squeeze=False)
axes = axes.flatten()  # Flatten the axes array for easy iteration

for ax, (lang_code, centroid_values) in zip(axes, centroid_by_lang.items()):
    bin_centers, peak_normalized_counts = _peak_normalized_histogram(centroid_values)

    ax.plot(bin_centers, peak_normalized_counts, label=lang_code, linewidth=2, alpha=0.7)
    ax.set_title(lang_code)
    ax.set_xlabel('Spectral Centroid')
    ax.set_ylabel('Normalized Density (Peak = 1.0)')
    ax.grid(True)

# Remove every panel that did not receive a language.
for unused_ax in axes[num_languages:]:
    fig.delaxes(unused_ax)

# Set the overall title and layout
plt.suptitle('Peak-Normalized Spectral Centroid Distribution by Language', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

#### 3.2.2.3 Feature distribution by language

In [None]:
# Violin plots of 'spectral_centroid_mean' per language, one panel per gender.
language_frames = []

for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    # Keep only the plotted columns and tag every row with its language.
    language_frames.append(
        df_features.assign(language=lang_code)[['language', 'spectral_centroid_mean', 'gender']]
    )

# Concatenate once (instead of once per language) and renumber the rows.
if language_frames:
    all_data = pd.concat(language_frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=['language', 'spectral_centroid_mean', 'gender'])

# Match on the START of the label: a plain substring search for 'male' also
# hits 'female' and would put the female rows into the male panel.
is_male = all_data['gender'].str.match('male', case=False, na=False)
is_female = all_data['gender'].str.match('female', case=False, na=False)

# Set up the plot: 2 rows, 1 column
fig, (ax_male, ax_female) = plt.subplots(2, 1, figsize=(15, 12))

# First plot for Males
sns.violinplot(x='language', y='spectral_centroid_mean', hue='language',
               data=all_data[is_male],
               palette='muted', legend=False, ax=ax_male)
ax_male.set_title('Spectral Centroid Mean Distribution by Language (Male)')
ax_male.set_xlabel('Language')
ax_male.set_ylabel('Spectral Centroid Mean')

# Second plot for Females
sns.violinplot(x='language', y='spectral_centroid_mean', hue='language',
               data=all_data[is_female],
               palette='muted', legend=False, ax=ax_female)
ax_female.set_title('Spectral Centroid Mean Distribution by Language (Female)')
ax_female.set_xlabel('Language')
ax_female.set_ylabel('Spectral Centroid Mean')

# Adjust layout
plt.tight_layout()
plt.show()

### 3.2.3 Spectral Centroid Mean Statistics

In [None]:
# Create a new DataFrame to store summary statistics if it doesn't exist
if 'summary_stats' not in locals():
    summary_stats = pd.DataFrame()

# Loop through each language
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    centroid_column = df_features['spectral_centroid_mean']

    # Add the summary statistics for this language to the shared table
    summary_stats.loc[lang_code, 'spectral_centroid_mean'] = centroid_column.mean()
    summary_stats.loc[lang_code, 'spectral_centroid_min'] = centroid_column.min()
    summary_stats.loc[lang_code, 'spectral_centroid_max'] = centroid_column.max()

# Select the spectral centroid columns of the shared table
spectral_centroid_stats = summary_stats[["spectral_centroid_mean", "spectral_centroid_min", "spectral_centroid_max"]]

# Plot spectral centroid statistics. The size goes to DataFrame.plot(): it opens
# its own figure when no `ax` is given, so a preceding plt.figure() would only
# leave an empty figure behind.
ax = spectral_centroid_stats.plot(kind='bar', figsize=(8, 5))

# Customize the plot
ax.set_title("Mean, Min, Max of Spectral Centroid by Language")
ax.set_ylabel("Spectral Centroid (Hz)")
ax.legend(loc='upper right')

# Add data labels: each bar's height, written above the bar
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}',
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                fontsize=9, color='black', rotation=0)

plt.tight_layout()
plt.show()


## 3.3 Spectral Bandwidth

In [None]:
# Reuse the shared statistics table when an earlier cell created it; otherwise start a new one.
try:
    summary_stats
except NameError:
    summary_stats = pd.DataFrame()

# Per language: mean, min and max of the 'spectral_bandwidth_mean' column.
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    bandwidth_column = df_features['spectral_bandwidth_mean']
    bandwidth_summary = (
        ('mean', bandwidth_column.mean()),
        ('min', bandwidth_column.min()),
        ('max', bandwidth_column.max()),
    )

    for stat_name, stat_value in bandwidth_summary:
        summary_stats.loc[lang_code, f'spectral_bandwidth_{stat_name}'] = stat_value

# Select the spectral bandwidth columns of the shared table.
bandwidth_columns = ['spectral_bandwidth_mean', 'spectral_bandwidth_min', 'spectral_bandwidth_max']
spectral_bandwidth_stats = summary_stats[bandwidth_columns]

# Grouped bar chart: three bars (mean, min, max) per language.
ax = spectral_bandwidth_stats.plot(kind='bar', figsize=(15, 8))

ax.set_title("Mean, Min, Max of Spectral Bandwidth by Language")
ax.set_ylabel("Spectral Bandwidth (Hz)")
ax.legend(loc='upper right')

# Write each bar's height above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{bar_height:.2f}',
                (bar_middle, bar_height),
                ha='center', va='bottom',
                fontsize=9, color='black', rotation=0)

plt.show()

### 3.3.2 Range plot

In [None]:
# Range plot: one horizontal min-max bar per language with the mean marked on it.
# Reads `spectral_bandwidth_stats`, which the previous cell creates.
fig, ax = plt.subplots(figsize=(15, 8))

# Plot the horizontal line for the range (min to max) for each language
for i, lang_code in enumerate(spectral_bandwidth_stats.index):
    min_val = spectral_bandwidth_stats.loc[lang_code, 'spectral_bandwidth_min']
    max_val = spectral_bandwidth_stats.loc[lang_code, 'spectral_bandwidth_max']
    mean_val = spectral_bandwidth_stats.loc[lang_code, 'spectral_bandwidth_mean']

    # Plot the range; only the first bar gets a label so the legend has a single entry
    ax.hlines(y=i, xmin=min_val, xmax=max_val, color='lightblue', lw=10, label='Min-Max Range' if i == 0 else "")

    # Plot the mean (same single-legend-entry trick)
    ax.plot(mean_val, i, 'ro', label='Mean' if i == 0 else "")

# Customize the y-axis with language names
ax.set_yticks(range(len(spectral_bandwidth_stats.index)))
ax.set_yticklabels(spectral_bandwidth_stats.index)

# Add labels and title
ax.set_title("Range of Spectral Bandwidth by Language with Mean Values")
ax.set_xlabel("Spectral Bandwidth (Hz)")
ax.set_ylabel("Language")
ax.legend(loc='upper right')

# Annotate the mean values
for i, mean in enumerate(spectral_bandwidth_stats['spectral_bandwidth_mean']):
    ax.annotate(f'{mean:.2f}', (mean, i), ha='center', va='center', color='black')

# Show the plot (the call itself was missing after this comment)
plt.show()

## 3.4 MFCC

### 3.4.1.1 Mean, Min, Max statistics across languages

In [None]:
# Create a new DataFrame to store summary statistics if it doesn't exist
if 'summary_stats' not in locals():
    summary_stats = pd.DataFrame()

# Loop through each language
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    # Convert the MFCC string arrays to numerical arrays.
    # Assumes each cell is a bracketed, whitespace-separated list such as
    # '[1.0 2.0 3.0]' with the same length in every row — TODO confirm against
    # the feature extraction script.
    mfcc_vectors = df_features['mfcc_mean'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))

    # Stack once into a (rows, coefficients) matrix and reduce over the rows.
    mfcc_matrix = np.stack(mfcc_vectors.values)
    mfcc_means = mfcc_matrix.mean(axis=0)
    mfcc_mins = mfcc_matrix.min(axis=0)
    mfcc_maxs = mfcc_matrix.max(axis=0)

    # Store the results in the summary statistics DataFrame (coefficients are numbered from 1)
    for i, (mean, min_val, max_val) in enumerate(zip(mfcc_means, mfcc_mins, mfcc_maxs), start=1):
        summary_stats.loc[lang_code, f'mfcc_{i}_mean'] = mean
        summary_stats.loc[lang_code, f'mfcc_{i}_min'] = min_val
        summary_stats.loc[lang_code, f'mfcc_{i}_max'] = max_val

# Select the MFCC columns of the shared table
mfcc_stats = summary_stats.filter(regex='mfcc_.*_(mean|min|max)')

# Plot MFCC statistics for the first MFCC as an example
ax = mfcc_stats.filter(regex='mfcc_1_').plot(kind='bar', ax=plt.gca(), figsize=(15, 6))

# Customize the plot. MFCCs are cepstral coefficients, not frequencies, so the
# axis label must not claim Hz.
ax.set_title("Mean, Min, Max of First Mel-Frequency Cepstral Coefficient by Language")
ax.set_ylabel("MFCC value")
ax.legend(loc='upper right')

# Add data labels: each bar's height, written above the bar
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}',
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                fontsize=9, color='black', rotation=0)

plt.tight_layout()
plt.show()


 ### 3.4.1.2 MFCC - specific coefficients across languages

In [None]:
# List of selected MFCC coefficients to plot (1-based)
selected_mfccs = [1, 2, 3, 4, 5]  # Adjust this list to include the coefficients you're interested in

# One long-format frame (Language, MFCC, Value) per language and coefficient;
# they are concatenated once after the loop instead of growing a frame inside it.
mfcc_frames = []

# Loop through each language and collect MFCC values
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    # Convert the MFCC string arrays to numerical arrays
    mfcc_vectors = df_features['mfcc_mean'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))

    # Collect MFCC values for the selected coefficients; rows whose vector is
    # too short for a coefficient contribute NaN.
    for mfcc in selected_mfccs:
        mfcc_values = mfcc_vectors.apply(lambda x: x[mfcc - 1] if len(x) >= mfcc else np.nan).tolist()
        mfcc_frames.append(pd.DataFrame({'Language': lang_code, 'MFCC': f'MFCC_{mfcc}', 'Value': mfcc_values}))

if mfcc_frames:
    mfcc_df = pd.concat(mfcc_frames, ignore_index=True)
else:
    mfcc_df = pd.DataFrame(columns=['Language', 'MFCC', 'Value'])

# Plotting the selected MFCCs for each language
plt.figure(figsize=(12, 8))
sns.boxplot(x='MFCC', y='Value', hue='Language', data=mfcc_df, palette='Set2', dodge=True)

# Customize the plot
plt.title('Distribution of Selected MFCCs by Language')
plt.xlabel('MFCC Coefficient')
plt.ylabel('Value')
plt.legend(title='Language', loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()


### 3.4.1.3 Violin plots

In [None]:
# List of selected MFCC coefficients to plot (1-based)
selected_mfccs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  # Adjust this list to include the coefficients you're interested in

# One long-format frame (Language, MFCC, Value) per language and coefficient;
# they are concatenated once after the loop.
mfcc_frames = []

# Loop through each language and collect MFCC values
for lang_code in LANGUAGE_CODES:
    df_features = load_csv(lang_code)

    # Missing CSV -> load_csv() returned None.
    if not isinstance(df_features, pd.DataFrame):
        continue

    # Convert the MFCC string arrays to numerical arrays
    mfcc_vectors = df_features['mfcc_mean'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))

    # Collect MFCC values for the selected coefficients; rows whose vector is
    # too short for a coefficient contribute NaN.
    for mfcc in selected_mfccs:
        mfcc_values = mfcc_vectors.apply(lambda x: x[mfcc - 1] if len(x) >= mfcc else np.nan).tolist()
        # The loop variable is `lang_code`; the name `language` is not defined
        # in this cell and raised a NameError.
        mfcc_frames.append(pd.DataFrame({'Language': lang_code, 'MFCC': f'MFCC_{mfcc}', 'Value': mfcc_values}))

if mfcc_frames:
    mfcc_df = pd.concat(mfcc_frames, ignore_index=True)
else:
    mfcc_df = pd.DataFrame(columns=['Language', 'MFCC', 'Value'])

# Set up the plot with multiple subplots for each MFCC
num_mfccs = len(selected_mfccs)
plt.figure(figsize=(12, 5 * num_mfccs))

# Loop through each selected MFCC and create a subplot
for i, mfcc in enumerate(selected_mfccs):
    plt.subplot(num_mfccs, 1, i + 1)  # Create a subplot for each MFCC
    sns.violinplot(x='Language', y='Value', data=mfcc_df[mfcc_df['MFCC'] == f'MFCC_{mfcc}'], palette='Set2', dodge=True, hue='Language')

    # Customize each subplot
    plt.title(f'Distribution of MFCC_{mfcc} by Language')
    plt.xlabel('Language')
    plt.ylabel('Value')
    plt.grid(True)

# Adjust layout
plt.tight_layout()
plt.show()