# **Data Exploration and Analysis**

This notebook conducts a comprehensive data exploration of frequency-related characteristics across different languages. For each language, we focus on the differences between genders, using statistical summaries and visualizations.

## **1. Setup and Imports**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from pathlib import Path
from sklearn.preprocessing import StandardScaler


# Show every column of a DataFrame and allow wide output lines when printing.
pd.options.display.max_columns = None
pd.options.display.width = 1000

## **2. Constants**

In [None]:
# Input directory (one `<language>_features.csv` per language is read from it)
# and output directory for saved figures; the latter is created on demand.
FEATURES_DIR = Path('../data') / 'features'
PLOTS_DIR = Path('../data') / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Behaviour switches for the whole notebook.
SKIP_TEENS = False  # when True, records whose age label contains 'teens' are dropped on load
SAVE_PLOTS = True   # when True, every figure is also written below PLOTS_DIR

# Bar colours used for the per-gender record counts.
COLORS = dict(male='#4682B4', female='#F3A5B1')

# Language codes to analyse, in plotting order.
LANGUAGE_CODES = [
    'es', 'sw', 'be', 'ba', 'ru',
    'lv', 'pl', 'tr', 'ckb', 'ja',
    'zh-TW', 'uk', 'zh-HK', 'nl', 'cy',
]

## **3. Data Normalization Function**

In [None]:
def normalize_features(df: pd.DataFrame, feature_columns) -> pd.DataFrame:
    """Return a copy of *df* with the given columns standardized.

    The selected columns are transformed with scikit-learn's
    ``StandardScaler`` fitted on the same data; all other columns are
    carried over unchanged.

    Args:
        df: Source data. It is not modified.
        feature_columns: Labels of the columns to standardize.

    Returns:
        A new DataFrame with the standardized columns.
    """
    normalized = df.copy()
    # A list is required for column selection; a tuple would be read as one key.
    columns = list(feature_columns)

    # StandardScaler raises on an input with no rows or no columns, so there
    # is nothing to scale in either case.
    if normalized.empty or not columns:
        return normalized

    scaler = StandardScaler()
    normalized[columns] = scaler.fit_transform(normalized[columns])

    return normalized

## **4. Data Loading and Filtering Functions**

In [None]:


def filter_out_age_group(df: pd.DataFrame, age_column='age', age_group='teens', lang_code=''):
    """Return the rows of *df* whose age label does not contain *age_group*.

    Matching is case-insensitive. Rows with a missing age label cannot match
    the group and are therefore kept. The number of dropped rows is printed.

    Args:
        df: Source data. It is not modified.
        age_column: Name of the column holding the age labels.
        age_group: Pattern identifying the age group to remove.
        lang_code: Language code, used only in the printed message.

    Returns:
        A new DataFrame without the rows of the given age group.
    """
    initial_rows = len(df)

    # na=False: without it a missing age yields NaN in the mask and the
    # negation below fails.
    in_age_group = df[age_column].str.contains(age_group, case=False, na=False)
    # Copy so that callers can add columns without chained-assignment warnings.
    df_filtered = df[~in_age_group].copy()

    print(f'Dropped {initial_rows - len(df_filtered)} records of {age_group} from the dataset for {lang_code}')

    return df_filtered


def load_features(language_code):
    """Load and normalize the feature table of one language.

    Reads ``<language_code>_features.csv`` from ``FEATURES_DIR``, standardizes
    the feature columns and, when ``SKIP_TEENS`` is set, removes the 'teens'
    age group afterwards.

    Args:
        language_code: Language code used as the CSV file name prefix.

    Returns:
        The prepared DataFrame, or ``None`` (after printing a hint) when the
        CSV file does not exist.
    """
    csv_path = FEATURES_DIR / f'{language_code}_features.csv'

    if csv_path.exists():
        columns_to_scale = [
            'pitch_mean',
            'hnr_mean',
            'spectral_centroid_mean',
            'spectral_bandwidth_mean',
            'spectral_flatness_mean',
            'zcr_mean',
        ]
        features = normalize_features(pd.read_csv(csv_path), columns_to_scale)

        if not SKIP_TEENS:
            return features

        return filter_out_age_group(features, lang_code=language_code)

    print(f'File {csv_path.name} does not exist. Please run the feature extraction script first.')
    return None


def split_by_gender(df: pd.DataFrame, gender_column='gender'):
    """Split *df* into its male and female records.

    A record belongs to a group when its gender label starts with 'male' or
    'female' respectively. Records with any other or a missing label are in
    neither result.

    Args:
        df: Source data.
        gender_column: Name of the column holding the gender labels.

    Returns:
        A ``(df_male, df_female)`` tuple.
    """
    gender = df[gender_column]

    # na=False: a missing label would otherwise put NaN into the mask, which
    # boolean indexing rejects.
    df_male = df[gender.str.startswith('male', na=False)]
    df_female = df[gender.str.startswith('female', na=False)]

    return df_male, df_female


def filter_gender_male_female(df: pd.DataFrame, gender_column='gender'):
    """Return the rows of *df* whose gender label starts with 'male' or 'female'.

    Rows with a missing gender label are dropped.

    Args:
        df: Source data.
        gender_column: Name of the column holding the gender labels.

    Returns:
        The matching rows of *df*.
    """
    # na=False: a missing label would otherwise put NaN into the mask, which
    # boolean indexing rejects.
    is_male_or_female = df[gender_column].str.startswith(('male', 'female'), na=False)

    return df[is_male_or_female]

## **5. Basic Data Analysis**

In this step we load the data and perform some basic data analysis, like:
- counting the number of males and females
- calculating male to female ratio
- counting the number of teens (age < 20) and adults (age >= 20); records with a missing age label are counted as adults

In [None]:
# Build one summary row per language: record counts by gender and by age group.
summary_data = []


for lang_code in LANGUAGE_CODES:
    df_features = load_features(lang_code)

    # load_features returns None when the language's CSV file is missing.
    if df_features is None:
        continue

    num_rows = len(df_features)

    df_male, df_female = split_by_gender(df_features)

    male_count = len(df_male)
    female_count = len(df_female)

    # Male-to-female ratio; undefined (NaN) when there are no female records.
    gender_ratio = male_count / female_count if female_count else float('nan')

    # Age classification: 'teens' (13-19) vs. 'adults' (20+).
    # NOTE: every record whose age label does not contain 'teens' counts as an
    # adult, including records with a missing age.
    # The mask is kept local so the loaded frame is not modified.
    is_teen = df_features['age'].apply(lambda x: 'teens' in str(x).lower())
    teens_count = int(is_teen.sum())
    adults_count = num_rows - teens_count

    summary_data.append(
        [lang_code, num_rows, male_count, female_count, gender_ratio, teens_count, adults_count]
    )


summary_columns = ['Language', 'Total', 'Male', 'Female', 'Male/Female Ratio', 'Teens', 'Adults']

summary_df = pd.DataFrame(summary_data, columns=summary_columns)

summary_df


In [None]:
# Stacked bar chart: male counts at the bottom, female counts on top.
fig, ax = plt.subplots(figsize=(10, 6))

languages = summary_df['Language']
male_counts = summary_df['Male']

ax.bar(languages, male_counts, label='Male', color=COLORS['male'])
ax.bar(languages, summary_df['Female'], bottom=male_counts, label='Female', color=COLORS['female'])

ax.set_title('Number of Records by Language and Gender')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
ax.legend()

if SAVE_PLOTS:
    fig.savefig(PLOTS_DIR / 'record_count_by_gender.png')

plt.show()

## **6. Data Exploration**

This section covers in-depth data exploration using KDE plots, boxplots, heatmaps, and violin plots.

### **6.1 KDE Plots**

In [None]:
import math

def plot_kde_per_language(feature: str, gender: str):
    """Draw one KDE subplot per language for a feature and gender.

    Languages whose feature file is missing are skipped. The figure is shown
    and, when ``SAVE_PLOTS`` is set, saved as SVG below
    ``PLOTS_DIR / 'kde' / '<feature>_<gender>'``.

    Args:
        feature: Name of the feature column to plot.
        gender: Gender label prefix selecting the records ('male' or 'female').
    """
    num_languages = len(LANGUAGE_CODES)
    num_cols = 3
    # At least one row, so the grid stays valid for an empty language list.
    num_rows = max(1, math.ceil(num_languages / num_cols))

    # squeeze=False keeps `axes` two-dimensional for every grid size.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, 4 * num_rows), sharey=True, squeeze=False
    )
    axes = axes.flatten()

    # Plots are packed into consecutive axes, so a skipped language leaves no
    # empty panel in the middle of the grid.
    used_axes = 0

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code)

        if df_features is None:
            continue

        # na=False: records with a missing gender label are excluded instead
        # of breaking the boolean mask.
        df_gender = df_features[df_features['gender'].str.startswith(gender, na=False)]

        ax = axes[used_axes]
        sns.kdeplot(data=df_gender, x=feature, fill=False, bw_adjust=1.5, ax=ax)
        ax.set_title(f'{feature.capitalize()} KDE in {lang_code}', fontsize=10)
        used_axes += 1

    # Remove every axis that received no plot.
    for unused_ax in axes[used_axes:]:
        fig.delaxes(unused_ax)

    plt.suptitle(f'{feature.capitalize()} KDE Plot for {gender.capitalize()} across Languages', fontsize=16, y=1.02)
    plt.tight_layout()

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'kde' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        # The suptitle sits above the figure area (y=1.02); a tight bounding
        # box keeps it in the saved file.
        plt.savefig(save_path / f'kde_{feature}_{gender}_all_languages_subplot.svg', bbox_inches='tight')

    plt.show()


def plot_combined_kde(feature: str, gender: str):
    """Draw the KDE curves of all languages for a feature and gender in one plot.

    Languages whose feature file is missing are skipped. The figure is shown
    and, when ``SAVE_PLOTS`` is set, saved as SVG below
    ``PLOTS_DIR / 'kde' / '<feature>_<gender>'``.

    Args:
        feature: Name of the feature column to plot.
        gender: Gender label prefix selecting the records ('male' or 'female').
    """
    frames = []

    for lang in LANGUAGE_CODES:
        df_features = load_features(lang)

        # A missing feature file yields None; skip it instead of failing.
        if df_features is None:
            continue

        frames.append(df_features[[feature, 'gender']].assign(language=lang))

    if not frames:
        print(f'No feature data available; skipping combined KDE plot of {feature} for {gender}.')
        return

    combined_df = pd.concat(frames, ignore_index=True)

    # Prefix match instead of substring match: 'male' is a substring of
    # 'female', so a substring test would put female records into the male
    # plot. Matching stays case-insensitive; missing labels are excluded.
    gender_mask = combined_df['gender'].str.lower().str.startswith(gender.lower(), na=False)

    plt.figure(figsize=(12, 8))

    sns.kdeplot(data=combined_df[gender_mask],
                x=feature, hue='language', fill=False, bw_adjust=1.5)

    plt.title(f'{feature} KDE Plot for {gender.capitalize()} (All Languages)')

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'kde' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        # Save into the directory created above, next to the per-language plot.
        plt.savefig(save_path / f'kde_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_kde_for_feature(feature: str):
    """Produce the per-language and the combined KDE plots for both genders.

    Args:
        feature: Name of the feature column to plot.
    """
    for gender in ('male', 'female'):
        print(f"Processing {feature} for {gender.capitalize()}...")

        # Per-language grid first, then all languages in a single plot.
        for draw_plot in (plot_kde_per_language, plot_combined_kde):
            draw_plot(feature, gender)

In [None]:
# Run the KDE analysis for every extracted feature.
for feature_name in (
    'pitch_mean',
    'hnr_mean',
    'spectral_centroid_mean',
    'spectral_bandwidth_mean',
    'spectral_flatness_mean',
    'zcr_mean',
):
    analyze_kde_for_feature(feature_name)

### **6.2 Boxplots**

In [None]:
def plot_boxplot_per_language(feature: str, gender: str):
    """Draw one boxplot per language for a feature and gender.

    Languages whose feature file is missing are skipped. The figure is shown
    and, when ``SAVE_PLOTS`` is set, saved as SVG below
    ``PLOTS_DIR / 'boxplot' / '<feature>_<gender>'``.

    Args:
        feature: Name of the feature column to plot.
        gender: Gender label prefix selecting the records ('male' or 'female').
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code)

        if df_features is None:
            continue

        # na=False: records with a missing gender label are excluded instead
        # of breaking the boolean mask.
        is_gender = df_features['gender'].str.startswith(gender, na=False)

        combined_data.append(df_features.loc[is_gender, [feature]].assign(language=lang_code))

    # pd.concat raises on an empty list, i.e. when no feature file was found.
    if not combined_data:
        print(f'No feature data available; skipping boxplot of {feature} for {gender}.')
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    plt.figure(figsize=(15, 6))
    sns.boxplot(data=combined_df, x='language', y=feature)

    plt.title(f'{feature.capitalize()} Boxplot for {gender.capitalize()} across Languages', fontsize=16)
    plt.xlabel('Language')
    plt.ylabel(feature.capitalize())

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'boxplot' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'boxplot_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_boxplot_for_feature(feature: str):
    """Produce the cross-language boxplot of a feature for both genders.

    Args:
        feature: Name of the feature column to plot.
    """
    for current_gender in ('male', 'female'):
        print(f"Processing {feature} for {current_gender.capitalize()}...")

        plot_boxplot_per_language(feature, current_gender)

In [None]:
# Run the boxplot analysis for every extracted feature.
for feature_name in (
    'pitch_mean',
    'hnr_mean',
    'spectral_centroid_mean',
    'spectral_bandwidth_mean',
    'spectral_flatness_mean',
    'zcr_mean',
):
    analyze_boxplot_for_feature(feature_name)

### **6.3 Heatmaps**

In [None]:
def plot_heatmap_per_language(feature: str, gender: str):
    """Draw a heatmap of the per-language mean of a feature for one gender.

    Languages whose feature file is missing are skipped. The figure is shown
    and, when ``SAVE_PLOTS`` is set, saved as SVG below
    ``PLOTS_DIR / 'heatmap' / '<feature>_<gender>'``.

    Args:
        feature: Name of the feature column to average.
        gender: Gender label prefix selecting the records ('male' or 'female').
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code)

        if df_features is None:
            continue

        # na=False: records with a missing gender label are excluded instead
        # of breaking the boolean mask.
        is_gender = df_features['gender'].str.startswith(gender, na=False)

        combined_data.append(df_features.loc[is_gender, [feature]].assign(language=lang_code))

    # pd.concat raises on an empty list, i.e. when no feature file was found.
    if not combined_data:
        print(f'No feature data available; skipping heatmap of {feature} for {gender}.')
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    # One row per language, a single column holding the feature mean.
    pivot_df = combined_df.pivot_table(index='language', values=feature, aggfunc='mean')

    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_df, annot=True, cmap='coolwarm', cbar_kws={'label': feature.capitalize()})

    plt.title(f'{feature.capitalize()} Heatmap for {gender.capitalize()} across Languages', fontsize=16)
    # Languages are the rows of the pivot table, so they run along the y axis.
    plt.xlabel('Average Value')
    plt.ylabel('Language')

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'heatmap' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'heatmap_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_heatmap_for_feature(feature: str):
    """Produce the cross-language heatmap of a feature for both genders.

    Args:
        feature: Name of the feature column to plot.
    """
    for current_gender in ('male', 'female'):
        print(f"Processing {feature} for {current_gender.capitalize()}...")

        plot_heatmap_per_language(feature, current_gender)

In [None]:
# Run the heatmap analysis for every extracted feature.
for feature_name in (
    'pitch_mean',
    'hnr_mean',
    'spectral_centroid_mean',
    'spectral_bandwidth_mean',
    'spectral_flatness_mean',
    'zcr_mean',
):
    analyze_heatmap_for_feature(feature_name)

### **6.4 Violin Plots**

In [None]:
def plot_violinplot_per_language(feature: str, gender: str):
    """Draw one violin plot per language for a feature and gender.

    Languages whose feature file is missing are skipped. The figure is shown
    and, when ``SAVE_PLOTS`` is set, saved as SVG below
    ``PLOTS_DIR / 'violinplot' / '<feature>_<gender>'``.

    Args:
        feature: Name of the feature column to plot.
        gender: Gender label prefix selecting the records ('male' or 'female').
    """
    combined_data = []

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code)

        if df_features is None:
            continue

        # na=False: records with a missing gender label are excluded instead
        # of breaking the boolean mask.
        is_gender = df_features['gender'].str.startswith(gender, na=False)

        combined_data.append(df_features.loc[is_gender, [feature]].assign(language=lang_code))

    # pd.concat raises on an empty list, i.e. when no feature file was found.
    if not combined_data:
        print(f'No feature data available; skipping violin plot of {feature} for {gender}.')
        return

    combined_df = pd.concat(combined_data, ignore_index=True)

    plt.figure(figsize=(15, 6))
    sns.violinplot(data=combined_df, x='language', y=feature)

    plt.title(f'{feature.capitalize()} Violin Plot for {gender.capitalize()} across Languages', fontsize=16)
    plt.xlabel('Language')
    plt.ylabel(feature.capitalize())

    if SAVE_PLOTS:
        save_path = PLOTS_DIR / 'violinplot' / f'{feature}_{gender}'
        save_path.mkdir(exist_ok=True, parents=True)
        plt.savefig(save_path / f'violinplot_{feature}_{gender}_all_languages.svg')

    plt.show()


def analyze_violinplot_for_feature(feature: str):
    """Produce the cross-language violin plot of a feature for both genders.

    Args:
        feature: Name of the feature column to plot.
    """
    for current_gender in ('male', 'female'):
        print(f"Processing {feature} for {current_gender.capitalize()}...")

        plot_violinplot_per_language(feature, current_gender)


In [None]:
# Run the violin plot analysis for every extracted feature.
for feature_name in (
    'pitch_mean',
    'hnr_mean',
    'spectral_centroid_mean',
    'spectral_bandwidth_mean',
    'spectral_flatness_mean',
    'zcr_mean',
):
    analyze_violinplot_for_feature(feature_name)

## **7. Feature Statistics**

In [None]:
def compute_statistics_per_language(feature: str, gender: str):
    """Compute descriptive statistics of a feature per language for one gender.

    Languages whose feature file is missing are skipped.

    Args:
        feature: Name of the feature column to summarize.
        gender: Gender label prefix selecting the records ('male' or 'female').

    Returns:
        A DataFrame with one row per language and the columns ``language``,
        ``gender``, ``mean``, ``min``, ``max``, ``std``, ``var``, ``median``,
        ``skew`` and ``kurtosis``.
    """
    stats_data = []

    for lang_code in LANGUAGE_CODES:
        df_features = load_features(lang_code)

        if df_features is None:
            continue

        # na=False: records with a missing gender label are excluded instead
        # of breaking the boolean mask.
        is_gender = df_features['gender'].str.startswith(gender, na=False)
        # Select the feature values once; no copy is needed for read-only use.
        values = df_features.loc[is_gender, feature]

        stats_data.append({
            'language': lang_code,
            'gender': gender,
            'mean': values.mean(),
            'min': values.min(),
            'max': values.max(),
            'std': values.std(),
            'var': values.var(),
            'median': values.median(),
            'skew': values.skew(),
            'kurtosis': values.kurt(),
        })

    stats_df = pd.DataFrame(stats_data)

    return stats_df

def analyze_statistics_for_feature(feature: str):
    """Compute and print the per-language statistics of a feature for both genders.

    Args:
        feature: Name of the feature column to summarize.

    Returns:
        A DataFrame with the male rows followed by the female rows.
    """
    per_gender_frames = []

    for current_gender in ('male', 'female'):
        print(f"Computing statistics for {feature} for {current_gender.capitalize()}...")

        gender_stats = compute_statistics_per_language(feature, current_gender)
        print(f"\nStatistics for {current_gender.capitalize()}:\n", gender_stats)

        per_gender_frames.append(gender_stats)

    return pd.concat(per_gender_frames, ignore_index=True)

In [None]:
# Compute and print the per-gender statistics tables for every extracted feature.
# NOTE(review): the returned DataFrames are not assigned to anything; in a
# notebook, presumably only the final call's return value is rendered as the
# cell output — confirm that this is intended.
analyze_statistics_for_feature('pitch_mean')
analyze_statistics_for_feature('hnr_mean')
analyze_statistics_for_feature('spectral_centroid_mean')
analyze_statistics_for_feature('spectral_bandwidth_mean')
analyze_statistics_for_feature('spectral_flatness_mean')
analyze_statistics_for_feature('zcr_mean')