# **Data Exploration and Analysis**

This notebook conducts a comprehensive data exploration of frequency-related voice characteristics across different languages. We focus on examining differences between genders within each language, using various statistical properties and visualizations.

## **1. Setup and Imports**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.preprocessing import StandardScaler


# Show every column and allow wide output so DataFrame previews are not truncated or wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

## **2. Constants**

In [None]:
# Directory holding one `<language_code>_features.csv` file per language.
FEATURES_DIR = Path('../data/features')
# Root directory for saved figures; created up front so later savefig calls can rely on it.
PLOTS_DIR = Path('../data/plots')
PLOTS_DIR.mkdir(exist_ok=True, parents=True)

# When True, rows whose age label contains 'teen' are dropped while loading features.
SKIP_TEENS = False
# When True, the plotting helpers also write their figures below PLOTS_DIR.
SAVE_PLOTS = True

# Bar colours used for the male/female record counts.
COLORS = {
    'male': '#4682B4',
    'female': '#F3A5B1'
    }

# Language codes to analyse; entries whose feature file is missing are skipped at load time.
LANGUAGE_CODES = ["es",
                  "ru",
                  "pl",
                #   "en",
                  "de",
                  "zh-TW",
                  "yue",
                  "ar",
                  "sw",
                  "tr",
                  "ja",
                  "uz"]


# Columns that are always read in addition to the explicitly selected feature columns.
BASE_COLUMNS = ['gender', 'age']

## **3. Data Normalization Function**

In [None]:
def normalize_features(df: pd.DataFrame, feature_columns):
    """Return a copy of ``df`` with ``feature_columns`` standardised to z-scores.

    Each listed column is scaled independently with ``StandardScaler``
    (zero mean, unit variance).

    Args:
        df: Frame containing the columns to scale. It is not modified.
        feature_columns: Names of the numeric columns to standardise.

    Returns:
        A new DataFrame. When ``feature_columns`` is empty or ``df`` has no
        rows the copy is returned unscaled, because the scaler cannot be
        fitted on an empty input.
    """
    feature_columns = list(feature_columns)

    # Work on a copy: writing into a filtered slice of another frame would
    # trigger SettingWithCopyWarning and could silently leave it unscaled.
    scaled = df.copy()

    if not feature_columns or scaled.empty:
        return scaled

    scaled[feature_columns] = StandardScaler().fit_transform(scaled[feature_columns])

    return scaled

## **4. Data Loading and Filtering Functions**

In [None]:
def load_data_frame(file_path: Path, selected_columns: list = None) -> pd.DataFrame:
    """Read a feature CSV, optionally restricted to a subset of columns.

    Args:
        file_path: Location of the CSV file.
        selected_columns: Feature columns to read. When given (and non-empty),
            only these plus ``BASE_COLUMNS`` are loaded; otherwise the whole
            file is read.

    Returns:
        The loaded DataFrame.
    """
    read_options = {}

    if selected_columns:
        read_options['usecols'] = BASE_COLUMNS + selected_columns

    return pd.read_csv(file_path, **read_options)


def filter_out_age_group(df: pd.DataFrame, age_column='age', age_group='', lang_code=''):
    """Drop the rows whose age label contains ``age_group`` (case-insensitive).

    Args:
        df: Frame to filter.
        age_column: Name of the column holding the age labels.
        age_group: Pattern to look for in the age label. Note that the empty
            default matches every non-missing label.
        lang_code: Language code, used only in the printed summary.

    Returns:
        The rows of ``df`` that do not match. Rows with a missing age are kept.
    """
    initial_rows = len(df)

    # na=False: a missing age yields NaN in the mask by default, and negating
    # such an object-dtype mask raises instead of filtering.
    matches_group = df[age_column].str.contains(age_group, case=False, na=False)
    df_filtered = df[~matches_group]

    print(f'Dropped {initial_rows - len(df_filtered)} records of {age_group} from the dataset for {lang_code}')

    return df_filtered


def select_numeric_features(df: pd.DataFrame, selected_columns: list = None) -> list:
    """Return the scalar (single-value) feature names to work with.

    Args:
        df: Unused; kept for interface compatibility.
        selected_columns: Optional whitelist. When given (and non-empty), only
            the known scalar features that appear in it are returned.

    Returns:
        List of scalar feature names, in their canonical order.
    """
    known_scalar_features = [
        'pitch_mean',
        'hnr_mean',
        'spectral_centroid_mean',
        'spectral_bandwidth_mean',
        'spectral_flatness_mean',
        'zcr_mean',
    ]

    if not selected_columns:
        return known_scalar_features

    return [name for name in known_scalar_features if name in selected_columns]


def normalize_selected_features(df: pd.DataFrame, selected_columns: list = None) -> pd.DataFrame:
    """Standardise the scalar feature columns chosen by ``selected_columns``.

    The frame is returned untouched when no scalar feature is selected.
    """
    columns_to_scale = select_numeric_features(df, selected_columns)

    if not columns_to_scale:
        return df

    return normalize_features(df, columns_to_scale)


def select_list_type_features(selected_columns: list = None) -> list:
    """Return the names of the list-valued feature columns to work with.

    Args:
        selected_columns: Optional whitelist. When given (and non-empty), only
            the known list-valued features that appear in it are returned.

    Returns:
        List of list-valued feature names, in their canonical order.
    """
    known_list_features = ['mfcc_mean', 'spectral_contrast_mean', 'chroma_mean']

    if not selected_columns:
        return known_list_features

    return [name for name in known_list_features if name in selected_columns]


def expand_numeric_list_columns(df: pd.DataFrame, list_columns: list) -> pd.DataFrame:
    """Replace list-valued columns by one numeric column per list position.

    A column ``name`` becomes ``name_0``, ``name_1``, ...; shorter rows are
    padded with NaN and the original column is dropped. Cells may be strings
    such as ``"[1.0 2.0 3.0]"`` (whitespace separated, optional brackets) or
    array-likes. Missing cells (None/NaN) and cells that cannot be parsed
    expand to all-NaN.

    Args:
        df: Frame to expand. It is modified in place and also returned.
        list_columns: Names of the columns to expand; names that are not
            present in ``df`` are ignored.

    Returns:
        ``df`` with the expanded columns appended.
    """
    def parse_cell(value, column_name):
        try:
            if isinstance(value, str):
                return np.array([float(token) for token in value.strip('[]').split()])

            parsed = np.asarray(value, dtype=float)
        except (ValueError, TypeError) as e:
            print(f"Failed to parse {column_name} with value: {value} - Error: {e}")
            return np.array([])

        if parsed.ndim == 0:
            # A scalar NaN/None marks a missing cell; a 0-d array has no len(),
            # so it has to be turned into a proper 1-d array here.
            return np.array([]) if np.isnan(parsed) else parsed.reshape(1)

        return parsed.ravel()

    for col in list_columns:
        if col not in df.columns:
            continue

        parsed_rows = [parse_cell(value, col) for value in df[col]]

        # default=0 keeps an empty frame from failing on max() of nothing.
        width = max((len(row) for row in parsed_rows), default=0)

        # Fill one NaN-padded matrix instead of running one apply() per position.
        matrix = np.full((len(parsed_rows), width), np.nan)
        for row_number, row in enumerate(parsed_rows):
            matrix[row_number, :len(row)] = row

        for i in range(width):
            df[f'{col}_{i}'] = matrix[:, i]

        df.drop(columns=[col], inplace=True)

    return df


def expand_list_type_features(df: pd.DataFrame, selected_columns: list = None) -> pd.DataFrame:
    """Expand the selected list-valued feature columns into per-index columns.

    The frame is returned untouched when no list-valued feature is selected.
    """
    columns_to_expand = select_list_type_features(selected_columns)

    if not columns_to_expand:
        return df

    return expand_numeric_list_columns(df, columns_to_expand)


def normalize_expanded_features(df: pd.DataFrame, selected_columns: list = None) -> pd.DataFrame:
    """Standardise every column derived from a selected list-valued feature.

    A column qualifies when its name contains the name of one of the selected
    list-valued features (for example ``mfcc_mean_3`` for ``mfcc_mean``).
    """
    base_names = select_list_type_features(selected_columns)
    matching_columns = [
        column for column in df.columns
        if any(base_name in column for base_name in base_names)
    ]

    if not matching_columns:
        return df

    return normalize_features(df, matching_columns)


def truncate_expanded_column_name(column_name: str) -> str:
    """Map an expanded column name back to its list-valued base feature.

    ``mfcc_mean_4`` becomes ``mfcc_mean``; names that do not start with a
    known list-valued feature are returned unchanged.
    """
    base_features = ('mfcc_mean', 'spectral_contrast_mean', 'chroma_mean')

    return next(
        (base for base in base_features if column_name.startswith(base)),
        column_name,
    )


def load_features(language_code, selected_columns=None):
    """Load, optionally filter, and standardise the features of one language.

    Args:
        language_code: Language code; the file read is
            ``FEATURES_DIR / '<language_code>_features.csv'``.
        selected_columns: Optional feature names to load. Expanded names such
            as ``mfcc_mean_3`` are mapped back to their base column first.

    Returns:
        The processed DataFrame, or ``None`` (after printing a hint) when the
        feature file does not exist.
    """
    csv_path = FEATURES_DIR / f'{language_code}_features.csv'

    if not csv_path.exists():
        print(f'File {csv_path.name} does not exist. Please run the feature extraction script first.')
        return None

    if selected_columns:
        selected_columns = [truncate_expanded_column_name(name) for name in selected_columns]

    frame = load_data_frame(csv_path, selected_columns)

    if SKIP_TEENS:
        frame = filter_out_age_group(frame, age_group='teen', lang_code=language_code)

    # Order matters: scalar features are scaled first, list features are
    # expanded next, and only then can the expanded columns be scaled.
    processing_steps = (
        normalize_selected_features,
        expand_list_type_features,
        normalize_expanded_features,
    )
    for step in processing_steps:
        frame = step(frame, selected_columns)

    return frame


def split_by_gender(df: pd.DataFrame, gender_column='gender'):
    """Split ``df`` into its male and female rows.

    Args:
        df: Frame to split.
        gender_column: Name of the column holding the gender label.

    Returns:
        ``(df_male, df_female)``: the rows whose label starts with ``'male'``
        and ``'female'`` respectively. Rows with a missing or different label
        end up in neither frame.
    """
    labels = df[gender_column].str

    # na=False: a missing label would otherwise put NaN into the mask, which
    # pandas refuses to use for boolean indexing.
    df_male = df[labels.startswith('male', na=False)]
    df_female = df[labels.startswith('female', na=False)]

    return df_male, df_female


def filter_gender_male_female(df: pd.DataFrame, gender_column='gender'):
    """Keep only the rows whose gender label starts with ``'male'`` or ``'female'``.

    Rows with a missing label are dropped; ``na=False`` keeps them from
    putting NaN into the mask, which would make the boolean indexing fail.
    """
    is_male_or_female = df[gender_column].str.startswith(('male', 'female'), na=False)

    return df[is_male_or_female]

## **5. Basic Data Analysis**

In this step, we load the data and perform some basic data analysis, such as:
- counting the number of males and females
- calculating male to female ratio
- counting the number of teens (age < 20) and adults (age >= 20)

In [None]:
# Build one summary row per language: record total, gender split and age split.
summary_columns = ['Language', 'Total', 'Male', 'Female', 'Teens', 'Adults']
summary_data = []

for lang_code in LANGUAGE_CODES:
    df_features = load_features(lang_code)

    # load_features returns None when the feature file is missing.
    if not isinstance(df_features, pd.DataFrame):
        continue

    df_male, df_female = split_by_gender(df_features)

    # Any age label that does not contain 'teens' (missing labels included)
    # is counted as 'adults'.
    is_teen = df_features['age'].map(lambda label: 'teens' in str(label).lower())
    df_features['age_group'] = ['teens' if flag else 'adults' for flag in is_teen]

    age_group_counts = df_features['age_group'].value_counts().to_dict()

    summary_data.append([
        lang_code,
        len(df_features),
        len(df_male),
        len(df_female),
        age_group_counts.get('teens', 0),
        age_group_counts.get('adults', 0),
    ])

summary_df = pd.DataFrame(summary_data, columns=summary_columns)

summary_df


In [None]:
# Stacked bar chart: male counts at the bottom, female counts stacked on top.
fig, ax = plt.subplots(figsize=(10, 6))

languages = summary_df['Language']
male_counts = summary_df['Male']

ax.bar(languages, male_counts, label='Male', color=COLORS['male'])
ax.bar(languages, summary_df['Female'], bottom=male_counts, label='Female', color=COLORS['female'])

ax.set_title('Number of Records by Language and Gender')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
ax.legend()

if SAVE_PLOTS:
    fig.savefig(PLOTS_DIR / 'record_count_by_gender.svg')

plt.show()

## **6. Data Exploration**

This section covers in-depth data exploration using KDE plots, boxplots, heatmaps, and violin plots.

### **6.1 KDE Plots**

In [None]:
import math

def plot_kde_per_language(feature: str, gender: str):
    """Draw a grid of KDE plots of ``feature``, one panel per language.

    Only rows whose gender label starts with ``gender`` are used. Languages
    without a feature file are skipped and leave no empty panel behind. When
    ``SAVE_PLOTS`` is set the figure is also written below
    ``PLOTS_DIR / 'kde'``.

    Args:
        feature: Name of the (possibly expanded) feature column to plot.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.
    """
    num_cols = 3
    # At least one row, so the grid can be built even with no languages.
    num_rows = max(1, math.ceil(len(LANGUAGE_CODES) / num_cols))

    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, 4 * num_rows), sharey=True, squeeze=False
    )
    axes = axes.flatten()

    # Panels are filled consecutively, so a skipped language leaves no hole.
    panels_used = 0

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code, [feature])

        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        df_gender = df_features[df_features['gender'].str.startswith(gender, na=False)]

        ax = axes[panels_used]
        panels_used += 1

        sns.kdeplot(data=df_gender, x=feature, fill=False, bw_adjust=1.5, ax=ax)
        ax.set_title(f'{feature.capitalize()} KDE in {lang_code}', fontsize=10)

    # Remove every panel that was not drawn on.
    for unused_ax in axes[panels_used:]:
        fig.delaxes(unused_ax)

    plt.suptitle(f'{feature.capitalize()} KDE Plot for {gender.capitalize()} across Languages', fontsize=16, y=1.02)
    plt.tight_layout()

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'kde' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        # bbox_inches='tight': the suptitle sits above the figure (y=1.02) and
        # would otherwise be cut off in the saved file.
        plt.savefig(save_path / f'kde_{feature}_{gender}_all_languages_subplot.svg', bbox_inches='tight')

    plt.show()


def plot_combined_kde(feature: str, gender: str):
    """Overlay the KDE curves of ``feature`` for every language in one plot.

    Only rows whose gender label starts with ``gender`` (case-insensitive) are
    used. When ``SAVE_PLOTS`` is set the figure is also written below
    ``PLOTS_DIR / 'kde'``.

    Args:
        feature: Name of the (possibly expanded) feature column to plot.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.
    """
    plt.figure(figsize=(12, 8))

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code, [feature])
        if df_features is None:
            continue

        # Prefix match, not substring match: 'male' is a substring of
        # 'female', so a contains() test would mix both genders into the
        # male plot.
        gender_labels = df_features['gender'].str.lower()
        df_gender = df_features[gender_labels.str.startswith(gender.lower(), na=False)]

        sns.kdeplot(data=df_gender, x=feature, fill=False, bw_adjust=1.5, label=lang_code)

    plt.title(f'{feature} KDE Plot for {gender.capitalize()} (All Languages)')
    plt.legend(title='Language')

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'kde' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'kde_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_kde_for_feature(feature: str):
    """Produce the per-language and the combined KDE plots for both genders."""
    for gender in ('male', 'female'):
        print(f"Processing {feature} for {gender.capitalize()}...")

        # Grid of per-language panels first, then all languages overlaid.
        plot_kde_per_language(feature, gender)
        plot_combined_kde(feature, gender)

In [None]:
# KDE plots for the scalar features.
analyze_kde_for_feature('pitch_mean')
analyze_kde_for_feature('hnr_mean')
analyze_kde_for_feature('spectral_centroid_mean')
analyze_kde_for_feature('spectral_bandwidth_mean')
analyze_kde_for_feature('spectral_flatness_mean')
analyze_kde_for_feature('zcr_mean')

# KDE plots for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    analyze_kde_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    analyze_kde_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    analyze_kde_for_feature(f'chroma_mean_{i}')

### **6.2 Boxplots**

In [None]:
def plot_boxplot_per_language(feature: str, gender: str):
    """Draw one boxplot of ``feature`` per language for a single gender.

    Only rows whose gender label starts with ``gender`` are used. Languages
    without a feature file are skipped; if nothing could be loaded a message
    is printed and no figure is produced. When ``SAVE_PLOTS`` is set the
    figure is also written below ``PLOTS_DIR / 'boxplot'``.

    Args:
        feature: Name of the (possibly expanded) feature column to plot.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        # Load only the requested feature instead of parsing and scaling
        # every column of the file for each plot.
        df_features = load_features(lang_code, [feature])

        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        gender_mask = df_features['gender'].str.startswith(gender, na=False)
        df_gender = df_features.loc[gender_mask, [feature]].copy()
        df_gender['language'] = lang_code

        combined_data.append(df_gender)

    if not combined_data:
        # pd.concat raises on an empty list, so stop early.
        print(f"No data available for {feature} ({gender}); skipping boxplot.")
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    plt.figure(figsize=(15, 6))
    sns.boxplot(data=combined_df, x='language', y=feature)

    plt.title(f'{feature.capitalize()} Boxplot for {gender.capitalize()} across Languages', fontsize=16)
    plt.xlabel('Language')
    plt.ylabel(feature.capitalize())

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'boxplot' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'boxplot_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_boxplot_for_feature(feature: str):
    """Produce the cross-language boxplot of ``feature`` for both genders."""
    for gender in ('male', 'female'):
        print(f"Processing {feature} for {gender.capitalize()}...")
        plot_boxplot_per_language(feature, gender)

In [None]:
# Boxplots for the scalar features.
analyze_boxplot_for_feature('pitch_mean')
analyze_boxplot_for_feature('hnr_mean')
analyze_boxplot_for_feature('spectral_centroid_mean')
analyze_boxplot_for_feature('spectral_bandwidth_mean')
analyze_boxplot_for_feature('spectral_flatness_mean')
analyze_boxplot_for_feature('zcr_mean')

# Boxplots for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    analyze_boxplot_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    analyze_boxplot_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    analyze_boxplot_for_feature(f'chroma_mean_{i}')

### **6.3 Heatmaps**

In [None]:
def plot_heatmap_per_language(feature: str, gender: str):
    """Draw a heatmap of the per-language mean of ``feature`` for one gender.

    The heatmap has one row per language and a single column holding the mean
    value. Only rows whose gender label starts with ``gender`` are used.
    Languages without a feature file are skipped; if nothing could be loaded
    a message is printed and no figure is produced. When ``SAVE_PLOTS`` is set
    the figure is also written below ``PLOTS_DIR / 'heatmap'``.

    Args:
        feature: Name of the (possibly expanded) feature column to plot.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        # Load only the requested feature instead of parsing and scaling
        # every column of the file for each plot.
        df_features = load_features(lang_code, [feature])

        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        gender_mask = df_features['gender'].str.startswith(gender, na=False)
        df_gender = df_features.loc[gender_mask, [feature]].copy()
        df_gender['language'] = lang_code

        combined_data.append(df_gender)

    if not combined_data:
        # pd.concat raises on an empty list, so stop early.
        print(f"No data available for {feature} ({gender}); skipping heatmap.")
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    pivot_df = combined_df.pivot_table(index='language', values=feature, aggfunc='mean')

    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_df, annot=True, cmap='coolwarm', cbar_kws={'label': feature.capitalize()})

    plt.title(f'{feature.capitalize()} Heatmap for {gender.capitalize()} across Languages', fontsize=16)
    # The pivot table is indexed by language, so languages run along the
    # y axis and the averaged value is the single column on the x axis.
    plt.xlabel('Average Value')
    plt.ylabel('Language')

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'heatmap' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'heatmap_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_heatmap_for_feature(feature: str):
    """Produce the cross-language heatmap of ``feature`` for both genders."""
    for gender in ('male', 'female'):
        print(f"Processing {feature} for {gender.capitalize()}...")
        plot_heatmap_per_language(feature, gender)

In [None]:
# Heatmaps for the scalar features.
analyze_heatmap_for_feature('pitch_mean')
analyze_heatmap_for_feature('hnr_mean')
analyze_heatmap_for_feature('spectral_centroid_mean')
analyze_heatmap_for_feature('spectral_bandwidth_mean')
analyze_heatmap_for_feature('spectral_flatness_mean')
analyze_heatmap_for_feature('zcr_mean')

# Heatmaps for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    analyze_heatmap_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    analyze_heatmap_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    analyze_heatmap_for_feature(f'chroma_mean_{i}')

### **6.4 Violin Plots**

In [None]:
def plot_violinplot_per_language(feature: str, gender: str):
    """Draw one violin plot of ``feature`` per language for a single gender.

    Only rows whose gender label starts with ``gender`` are used. Languages
    without a feature file are skipped; if nothing could be loaded a message
    is printed and no figure is produced. When ``SAVE_PLOTS`` is set the
    figure is also written below ``PLOTS_DIR / 'violinplot'``.

    Args:
        feature: Name of the (possibly expanded) feature column to plot.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        # Load only the requested feature instead of parsing and scaling
        # every column of the file for each plot.
        df_features = load_features(lang_code, [feature])

        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        gender_mask = df_features['gender'].str.startswith(gender, na=False)
        df_gender = df_features.loc[gender_mask, [feature]].copy()
        df_gender['language'] = lang_code

        combined_data.append(df_gender)

    if not combined_data:
        # pd.concat raises on an empty list, so stop early.
        print(f"No data available for {feature} ({gender}); skipping violin plot.")
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    plt.figure(figsize=(15, 6))
    sns.violinplot(data=combined_df, x='language', y=feature)

    plt.title(f'{feature.capitalize()} Violin Plot for {gender.capitalize()} across Languages', fontsize=16)
    plt.xlabel('Language')
    plt.ylabel(feature.capitalize())

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'violinplot' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'violinplot_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_violinplot_for_feature(feature: str):
    """Produce the cross-language violin plot of ``feature`` for both genders."""
    for gender in ('male', 'female'):
        print(f"Processing {feature} for {gender.capitalize()}...")
        plot_violinplot_per_language(feature, gender)


In [None]:
# Violin plots for the scalar features.
analyze_violinplot_for_feature('pitch_mean')
analyze_violinplot_for_feature('hnr_mean')
analyze_violinplot_for_feature('spectral_centroid_mean')
analyze_violinplot_for_feature('spectral_bandwidth_mean')
analyze_violinplot_for_feature('spectral_flatness_mean')
analyze_violinplot_for_feature('zcr_mean')

# Violin plots for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    analyze_violinplot_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    analyze_violinplot_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    analyze_violinplot_for_feature(f'chroma_mean_{i}')

## **7. Feature Statistics**

In [None]:
def compute_statistics_per_language(feature: str, gender: str):
    """Compute descriptive statistics of ``feature`` per language for one gender.

    Only rows whose gender label starts with ``gender`` are used. Languages
    without a feature file are skipped.

    Args:
        feature: Name of the (possibly expanded) feature column.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.

    Returns:
        DataFrame with one row per loaded language and the columns
        ``language``, ``gender``, ``mean``, ``min``, ``max``, ``std``, ``var``,
        ``median``, ``skew`` and ``kurtosis``. The columns are present even
        when no language could be loaded.
    """
    stat_columns = ['language', 'gender', 'mean', 'min', 'max', 'std', 'var', 'median', 'skew', 'kurtosis']
    stats_data = []

    for lang_code in LANGUAGE_CODES:
        # Load only the requested feature instead of parsing and scaling
        # every column of the file for each statistic table.
        df_features = load_features(lang_code, [feature])

        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        gender_mask = df_features['gender'].str.startswith(gender, na=False)
        values = df_features.loc[gender_mask, feature]

        stats_data.append({
            'language': lang_code,
            'gender': gender,
            'mean': values.mean(),
            'min': values.min(),
            'max': values.max(),
            'std': values.std(),
            'var': values.var(),
            'median': values.median(),
            'skew': values.skew(),
            'kurtosis': values.kurt(),
        })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    stats_df = pd.DataFrame(stats_data, columns=stat_columns)

    return stats_df


def plot_bar_for_statistic(stats_df, statistic):
    """Draw a grouped bar chart of one statistic per language, split by gender.

    When ``stats_df`` has a ``feature`` column holding a single feature name,
    that name is included in the title and in the file name, so the plots of
    different features do not overwrite each other. When ``SAVE_PLOTS`` is set
    the figure is written below ``PLOTS_DIR / 'statistics_barplots'``.

    Args:
        stats_df: Frame with at least the columns ``language``, ``gender``
            and ``statistic``.
        statistic: Name of the statistic column to plot.
    """
    # Recover the feature name, if the frame carries exactly one.
    feature = None
    if 'feature' in stats_df.columns:
        feature_names = stats_df['feature'].dropna().unique()
        if len(feature_names) == 1:
            feature = str(feature_names[0])

    plt.figure(figsize=(14, 6))

    sns.barplot(data=stats_df, x='language', y=statistic, hue='gender', palette='viridis')

    if feature:
        plt.title(f"{statistic.capitalize()} of {feature} across Languages", fontsize=16)
    else:
        plt.title(f"{statistic.capitalize()} across Languages", fontsize=16)
    plt.xlabel("Language")
    plt.ylabel(f"{statistic.capitalize()} Value")

    # Apply the layout before saving so the saved file matches what is shown.
    plt.tight_layout()

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'statistics_barplots' / statistic
        save_path.mkdir(exist_ok=True, parents=True)
        file_stem = f'{feature}_{statistic}' if feature else statistic
        plt.savefig(save_path / f'{file_stem}_across_languages.png')

    plt.show()


def visualize_statistics_for_feature(combined_stats_df, feature):
    """Plot every descriptive statistic in ``combined_stats_df`` as a bar chart.

    ``feature`` is only used in the progress message.
    """
    for stat in ('mean', 'max', 'min', 'std', 'var', 'median', 'skew', 'kurtosis'):
        print(f"Plotting {stat} for {feature}...")
        plot_bar_for_statistic(combined_stats_df, statistic=stat)


def analyze_statistics_for_feature(feature: str):
    """Compute the per-language statistics of ``feature`` for both genders and plot them."""
    per_gender_stats = []

    for gender in ('male', 'female'):
        print(f"Computing statistics for {feature} for {gender.capitalize()}...")

        gender_stats = compute_statistics_per_language(feature, gender)
        # Tag the rows with the feature they describe.
        gender_stats['feature'] = feature
        per_gender_stats.append(gender_stats)

    visualize_statistics_for_feature(
        pd.concat(per_gender_stats, ignore_index=True),
        feature,
    )


In [None]:
# Descriptive statistics for the scalar features.
analyze_statistics_for_feature('pitch_mean')
analyze_statistics_for_feature('hnr_mean')
analyze_statistics_for_feature('spectral_centroid_mean')
analyze_statistics_for_feature('spectral_bandwidth_mean')
analyze_statistics_for_feature('spectral_flatness_mean')
analyze_statistics_for_feature('zcr_mean')

# Descriptive statistics for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    analyze_statistics_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    analyze_statistics_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    analyze_statistics_for_feature(f'chroma_mean_{i}')

## **8. Hypothesis and Statistical Tests Overview**
We aim to perform statistical tests to determine if there are significant differences in various features between languages. We will conduct normality tests and then use appropriate tests (ANOVA or Kruskal-Wallis) based on normality results.

### **8.1 Normality Testing**

In [None]:
from scipy import stats

def test_normality_per_language(feature: str, alpha: float = 0.05) -> None:
    """Test ``feature`` for normality in every language/gender group and report.

    Groups with up to 5000 observations use the Shapiro-Wilk test; larger
    groups use a Kolmogorov-Smirnov test against a normal distribution with
    the group's own mean and standard deviation. The null hypothesis of both
    tests is that the data are normal, so a group fails when its p-value is
    below ``alpha``. Groups with fewer than three observations are skipped.

    Args:
        feature: Name of the (possibly expanded) feature column to test.
        alpha: Significance level.
    """
    all_meet_normality = True

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code, [feature])
        if df_features is None:
            continue

        for gender in ['male', 'female']:
            # na=False keeps rows with a missing gender from breaking the mask.
            gender_mask = df_features['gender'].str.startswith(gender, na=False)
            values = df_features.loc[gender_mask, feature].dropna()

            if len(values) < 3:
                # Shapiro-Wilk needs at least three observations.
                print(f"Skipping {feature} for {gender} in {lang_code}: fewer than 3 observations")
                continue

            if len(values) > 5000:
                # Compare with a normal fitted to this group; a plain 'norm'
                # would test against N(0, 1), which a gender subset does not
                # follow even when it is normal. Estimating the parameters
                # from the data makes the p-value approximate (conservative).
                _, p_value = stats.kstest(values, 'norm', args=(values.mean(), values.std()))
            else:
                _, p_value = stats.shapiro(values)

            # Small p-value => reject the null hypothesis of normality.
            if p_value < alpha:
                all_meet_normality = False
                print(f"Warning: {feature} for {gender} in {lang_code} does not meet normality (p-value = {p_value:.4f})")

    if all_meet_normality:
        print(f"All groups for {feature} meet normality based on the selected alpha level of {alpha}")




def test_normality_for_feature(feature: str):
    """Announce the feature and run the per-language normality checks for it."""
    header = f"Testing normality for feature: {feature}"
    print(header)

    test_normality_per_language(feature)

In [None]:
# Normality checks for the scalar features.
test_normality_for_feature('pitch_mean')
test_normality_for_feature('hnr_mean')
test_normality_for_feature('spectral_centroid_mean')
test_normality_for_feature('spectral_bandwidth_mean')
test_normality_for_feature('spectral_flatness_mean')
test_normality_for_feature('zcr_mean')

# Normality checks for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    test_normality_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    test_normality_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    test_normality_for_feature(f'chroma_mean_{i}')

### **8.2 Statistical Tests Across Languages**

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy import stats

def prepare_data_for_statistical_tests(feature: str, gender: str) -> pd.DataFrame:
    """Collect the values of ``feature`` for one gender across all languages.

    Only rows whose gender label starts with ``gender`` are used. Languages
    without a feature file are skipped.

    Args:
        feature: Name of the (possibly expanded) feature column.
        gender: Gender label prefix, e.g. ``'male'`` or ``'female'``.

    Returns:
        DataFrame with the columns ``language`` and ``feature``; empty (but
        with both columns) when no language could be loaded.
    """
    combined_data = []
    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code, [feature])
        if df_features is None:
            continue

        # na=False keeps rows with a missing gender from breaking the mask.
        gender_mask = df_features['gender'].str.startswith(gender, na=False)
        df_gender = df_features.loc[gender_mask, [feature]].copy()
        df_gender.insert(0, 'language', lang_code)
        combined_data.append(df_gender)

    if not combined_data:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=['language', feature])

    return pd.concat(combined_data, ignore_index=True)


def check_normality(series: pd.Series, alpha: float = 0.05) -> bool:
    """Return True when ``series`` is compatible with a normal distribution.

    Missing values are ignored. Samples of up to 5000 observations use the
    Shapiro-Wilk test; larger samples use a Kolmogorov-Smirnov test against a
    normal distribution with the sample's own mean and standard deviation.
    The null hypothesis of both tests is normality, so the sample passes when
    the p-value is not below ``alpha``.

    Args:
        series: Sample to test.
        alpha: Significance level.

    Returns:
        True if normality is not rejected; False if it is rejected or if
        fewer than three observations are available.
    """
    values = series.dropna()

    if len(values) < 3:
        # Shapiro-Wilk needs at least three observations.
        return False

    if len(values) > 5000:
        # A plain 'norm' would test against N(0, 1) regardless of the
        # sample's location and scale. Estimating the parameters from the
        # data makes the p-value approximate (conservative).
        _, p_value = stats.kstest(values, 'norm', args=(values.mean(), values.std()))
    else:
        _, p_value = stats.shapiro(values)

    return bool(p_value >= alpha)


def _group_is_normal(values: pd.Series, alpha: float) -> bool:
    """Return True when normality is not rejected for ``values`` at level ``alpha``."""
    if len(values) < 3:
        # Too few observations to test; do not assume normality.
        return False

    if len(values) > 5000:
        # KS test against a normal fitted to the group (approximate p-value).
        _, p_value = stats.kstest(values, 'norm', args=(values.mean(), values.std()))
    else:
        _, p_value = stats.shapiro(values)

    return bool(p_value >= alpha)


def statistical_test_for_feature(feature: str):
    """Test whether ``feature`` differs between languages, separately per gender.

    For each gender the values are grouped by language (missing values and
    languages without data are left out). If normality is not rejected in any
    group, a one-way ANOVA is run, followed by Tukey's HSD when the ANOVA is
    significant; otherwise the Kruskal-Wallis test is used. All results are
    printed.

    Args:
        feature: Name of the (possibly expanded) feature column to test.
    """
    alpha = 0.05
    genders = ['male', 'female']

    for gender in genders:
        print(f"\nPerforming statistical tests for feature: {feature} for {gender.capitalize()}")

        combined_df = prepare_data_for_statistical_tests(feature, gender)
        # Drop missing values once so every test below sees the same data.
        combined_df = combined_df.dropna(subset=[feature])

        # Group by the languages actually present; iterating LANGUAGE_CODES
        # would create empty groups for languages without a feature file.
        groups = [values for _, values in combined_df.groupby('language', sort=False)[feature]]

        if len(groups) < 2:
            print(f"Not enough language groups with data for {feature} ({gender.capitalize()}); skipping tests.")
            continue

        # ANOVA assumes normality within each group, so every group is checked.
        if all(_group_is_normal(values, alpha) for values in groups):
            stat, p = stats.f_oneway(*groups)
            print(f"ANOVA test result for {feature} ({gender.capitalize()}): F={stat}, p={p}")

            if p < alpha:
                print("Significant differences detected. Performing post-hoc analysis...")
                tukey = pairwise_tukeyhsd(endog=combined_df[feature], groups=combined_df['language'], alpha=alpha)
                print(tukey)
        else:
            print(f"{feature} ({gender.capitalize()}) is not normally distributed. Performing Kruskal-Wallis test.")
            stat, p = stats.kruskal(*groups)
            print(f"Kruskal-Wallis test result for {feature} ({gender.capitalize()}): H={stat}, p={p}")

            if p < alpha:
                print("Significant differences detected in Kruskal-Wallis test. Post-hoc analysis may be required.")

In [None]:
# Cross-language significance tests for the scalar features.
statistical_test_for_feature('pitch_mean')
statistical_test_for_feature('hnr_mean')
statistical_test_for_feature('spectral_centroid_mean')
statistical_test_for_feature('spectral_bandwidth_mean')
statistical_test_for_feature('spectral_flatness_mean')
statistical_test_for_feature('zcr_mean')

# Cross-language significance tests for the expanded list-valued features.
# NOTE(review): the ranges below cover indices 0-10 and 0-4 only — confirm they
# match the number of coefficients actually stored in the feature files.
for i in range(0, 11):
    statistical_test_for_feature(f'mfcc_mean_{i}')

for i in range(0, 5):
    statistical_test_for_feature(f'spectral_contrast_mean_{i}')

for i in range(0, 11):
    statistical_test_for_feature(f'chroma_mean_{i}')