In [19]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [20]:
female_formal_1 = pl.read_parquet('data/female_formal_1.parquet')
female_formal_2 = pl.read_parquet('data/female_formal_2.parquet')

female_informal_1 = pl.read_parquet('data/female_informal_1.parquet')
female_informal_2 = pl.read_parquet('data/female_informal_2.parquet')

male_formal_1 = pl.read_parquet('data/male_formal_1.parquet')
male_formal_2 = pl.read_parquet('data/male_formal_2.parquet')

male_informal_1 = pl.read_parquet('data/male_informal_1.parquet')
male_informal_2 = pl.read_parquet('data/male_informal_2.parquet')

In [21]:
n_au = 27 # Nr of action units

# Reference ids of the action units
au_ref = [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22,
          23, 24, 25, 26, 27, 32, 38, 39]

def extract_au(df, threshold=0.9):

    features = df.select(
        [   
            'time',
            'frame',
            'face_prob',
            pl.lit([list(range(n_au))]).alias("face_au"),
            pl.col("face_aus").alias("value").list.take(list(range(n_au)))
        ]
    ).filter((pl.col("value").is_not_null()) & 
             (pl.col("face_prob") > threshold) &
             (pl.col("time") <= 50)
             ).explode(["face_au", "value"])

    features = features.to_pandas().drop(columns=['face_prob'])
    features['face_au'] = features['face_au'].apply(lambda x: au_ref[x])
    return features
    

# female
female_formal_1_au = extract_au(female_formal_1)
female_formal_2_au = extract_au(female_formal_2)

female_informal_1_au = extract_au(female_informal_1)
female_informal_2_au = extract_au(female_informal_2)

female_formal_au = pd.concat([female_formal_1_au, female_formal_2_au], keys=['Female Formal 1', ' Female Formal 2'])
female_informal_au = pd.concat([female_informal_1_au, female_informal_2_au], keys=['Female Inormal 1', ' Female Inormal 2'])

female = pd.concat([female_formal_au, female_informal_au])

# male
male_formal_1_au = extract_au(male_formal_1)
male_formal_2_au = extract_au(male_formal_2)

male_informal_1_au = extract_au(male_informal_1)
male_informal_2_au = extract_au(male_informal_2)

male_formal_au = pd.concat([male_formal_1_au, male_formal_2_au], keys=['Male Formal 1', ' Male Formal 2'])
male_informal_au = pd.concat([male_informal_1_au, male_informal_2_au], keys=['Male Informal 1', ' Male Informal 2'])

male = pd.concat([male_formal_au, male_informal_au])

# formal
formal = pd.concat([male_formal_au, female_formal_au])

# informal
informal = pd.concat([male_informal_au, female_informal_au])

# print out the length of male, female, formal and informal
print(f"Male: {male.shape[0]}")
print(f"Female: {female.shape[0]}")
print(f"Formal: {formal.shape[0]}")
print(f"Informal: {informal.shape[0]}")

Male: 25434
Female: 24921
Formal: 25677
Informal: 24678



Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a l

In [22]:
# Define a dictionary for category colors
category_colors = {'Formal': px.colors.qualitative.Plotly[2], 'Informal': px.colors.qualitative.Plotly[3], 
                   'Male': px.colors.qualitative.Plotly[0], 'Female': px.colors.qualitative.Plotly[1]}

# comparing the mean activation of the action units between male and female speakers

def mean_activation(df):
    mean = df.groupby('face_au')['value'].mean().reset_index()

    # calculate the confidence interval
    std = df.groupby('face_au')['value'].std().reset_index()
    mean['std'] = std['value']
    mean['upper'] = mean['value'] + mean['std']
    mean['lower'] = mean['value'] - mean['std']
    return mean

formal_mean = mean_activation(formal)
informal_mean = mean_activation(informal)


# Plotting the mean activation of the action units and the confidence interval for female speakers

# Map numerical IDs to action unit labels
au_labels = [format(au_id) for au_id in au_ref]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=au_labels,
    y=formal_mean['value'],
    error_y=dict(type='data', array=formal_mean['std'], visible=True),
    name='Formal',
    marker_color=category_colors['Formal']
))


fig.add_trace(go.Bar(
    x=au_labels,
    y=informal_mean['value'],
    error_y=dict(type='data', array=informal_mean['std'], visible=True),
    name='Informal',
    marker_color=category_colors['Informal']
))

# Customize the layout
fig.update_layout(
    title='Mean Activation of Action Units',
    xaxis=dict(title='Action Units', dtick=1),
    yaxis=dict(title='Mean Activation'),
    barmode='group',
    showlegend=True
)

# Show the plot
fig.show()

In [23]:
# Comparing the formal and informal speech in terms of the mean activation of the action units

def mean_activation(df):
    mean = df.groupby('time')['value'].mean().reset_index()
    return mean

formal_mean = mean_activation(formal)
informal_mean = mean_activation(informal)

# Plotting the mean activation of the action units for formal and informal speech
fig = go.Figure()

fig.add_trace(go.Scatter(x=formal_mean['time'], y=formal_mean['value'], mode='lines', name='Formal', line=dict(color=category_colors['Formal'])))
fig.add_trace(go.Scatter(x=informal_mean['time'], y=informal_mean['value'], mode='lines', name='Informal', line=dict(color=category_colors['Informal'])))

# Adding traces for the dashed lines representing the averages
fig.add_trace(go.Scatter(x=formal_mean['time'], y=[formal_mean['value'].mean()] * len(formal_mean), 
                         mode='lines', name='Formal Average', line=dict(dash='dash', color=category_colors['Formal'])))
fig.add_trace(go.Scatter(x=informal_mean['time'], y=[informal_mean['value'].mean()] * len(informal_mean), 
                         mode='lines', name='Informal Average', line=dict(dash='dash', color = category_colors['Informal'])))

fig.update_layout(title='Mean Activation of Action Units over Time for Formal and Informal Speakers',
                    xaxis_title='Time (s)',
                    yaxis_title='Mean Activation')

fig.show()

# run the T-test
from scipy.stats import ttest_ind

tstatistic, pvalue = ttest_ind(formal_mean['value'], informal_mean['value'], alternative='two-sided', nan_policy='omit')
print(f'T-statistic: {tstatistic}, P-value: {pvalue.round(5)}')

T-statistic: 16.7897650059143, P-value: 0.0


In [24]:
# comparing the mean activation of the action units between male and female speakers

male_mean = mean_activation(male)
female_mean = mean_activation(female)

fig = go.Figure()

fig.add_trace(go.Scatter(x=male_mean['time'], y=male_mean['value'], mode='lines', name='Male', line=dict(color=category_colors['Male'])))
fig.add_trace(go.Scatter(x=female_mean['time'], y=female_mean['value'], mode='lines', name='Informal', line=dict(color=category_colors['Female'])))

# Adding traces for the dashed lines representing the averages
fig.add_trace(go.Scatter(x=male_mean['time'], y=[male_mean['value'].mean()] * len(male_mean), 
                         mode='lines', name='Male Average', line=dict(dash='dash', color=category_colors['Male'])))
fig.add_trace(go.Scatter(x=female_mean['time'], y=[female_mean['value'].mean()] * len(female_mean), 
                         mode='lines', name='Female Average', line=dict(dash='dash', color = category_colors['Female'])))

fig.update_layout(title='Mean Activation of Action Units over Time for Male and Female Speakers',
                    xaxis_title='Time (s)',
                    yaxis_title='Mean Activation')

fig.show()

# run the T-test
tstatistic, pvalue = ttest_ind(male_mean['value'], female_mean['value'], alternative='two-sided', nan_policy='omit')
print(f'T-statistic: {tstatistic}, P-value: {pvalue.round(5)}')

T-statistic: -3.221721682981537, P-value: 0.00133


In [25]:
# Reference table linking emotions to action units
emotion_au_mapping = {
    'Happiness': [6, 12],
    'Sadness': [1, 4, 15],
    'Surprise': [1, 2, 5, 26],
    'Anger': [4, 5, 7, 23],
}

# Function to calculate mean activation for each emotion over time and normalize the data
def mean_activation_by_emotion_over_time(df, emotion_au_mapping):
    emotion_means_over_time = {}
    for emotion, aus in emotion_au_mapping.items():
        relevant_aus = df[df['face_au'].isin(aus)]
        emotion_means_over_time[emotion] = relevant_aus.groupby(['time', 'face_au'])['value'].mean().reset_index()

    # Normalize the data using min-max scaling for each action unit
    for emotion, mean_df in emotion_means_over_time.items():
        min_values = mean_df.groupby('face_au')['value'].min()
        max_values = mean_df.groupby('face_au')['value'].max()

        mean_df['value_normalized'] = mean_df.apply(
            lambda row: (row['value'] - min_values[row['face_au']]) / (max_values[row['face_au']] - min_values[row['face_au']]),
            axis=1
        )

    return emotion_means_over_time

# Calculate mean activation for formal and informal speakers over time and normalize the data
formal_emotion_means_over_time = mean_activation_by_emotion_over_time(female_formal_2_au, emotion_au_mapping)
informal_emotion_means_over_time = mean_activation_by_emotion_over_time(female_informal_2_au, emotion_au_mapping)

def plot_emotion_means_over_time(emotion_means_over_time, title):
    fig = go.Figure()

    for emotion, mean_df in emotion_means_over_time.items():
        for au in mean_df['face_au'].unique():
            fig.add_trace(go.Scatter(
                x=mean_df[mean_df['face_au'] == au]['time'],
                y=mean_df[mean_df['face_au'] == au]['value_normalized'],
                mode='lines+markers',
                name=f'{emotion} - AU{au}'
            ))

    fig.update_layout(
        title=f'Mean Activation of Action (Normalized) - {title}',
        xaxis=dict(title='Time'),
        yaxis=dict(title='Normalized Mean Activation'),
        showlegend=True
    )

    fig.show()

# Plot the mean activation of action units for different emotions over time for formal speakers
plot_emotion_means_over_time(formal_emotion_means_over_time, 'Female Formal 2')

In [26]:
# function to calculate the mean activation of action units for different emotions over time
def average_happiness(emotion_means_over_time):
    # only extract the happiness emotion
    emotion_means = {}
    for emotion, mean_df in emotion_means_over_time.items():
        emotion_means[emotion] = mean_df.groupby('time')['value_normalized'].mean().reset_index()

    return emotion_means['Happiness']

# Calculate the mean activation of action units for different emotions over time for formal speakers
formal_happiness = average_happiness(formal_emotion_means_over_time)
informal_happiness = average_happiness(informal_emotion_means_over_time)


# Plot the mean activation of action units for different emotions for formal speakers
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=formal_happiness['time'],
    y=formal_happiness['value_normalized'],
    mode='lines+markers',
))

fig.update_layout(
    title=f'Mean Activation of Happiness (Normalized)',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Normalized Mean Activation'),
    showlegend=False
)

fig.show()

In [27]:
# Function to calculate mean activation for each emotion, normalize the data, and count peaks
def count_peaks_by_emotion(df, threshold):
    
    df['is_peak'] = df['value_normalized'] > threshold

    # Count the peaks 
    peak_count = 0
    is_peak = False

    for index, row in df.iterrows():
        if row['is_peak'] and not is_peak:
            peak_count += 1
            is_peak = True
        elif not row['is_peak']:
            is_peak = False

    # Check if the last row is a peak
    if df.iloc[-1]['is_peak']:
        peak_count += 1

    return peak_count

# Count the number of happiness peaks for each video and compare the count between formal and informal speakers
videos = {'formal': [male_formal_1_au, male_formal_2_au, female_formal_1_au, female_formal_2_au],
          'informal': [male_informal_1_au, male_informal_2_au, female_informal_1_au, female_informal_2_au] }

videos = {'male': [male_formal_1_au, male_formal_2_au, male_informal_1_au, male_informal_2_au],
          'female': [female_formal_1_au, female_formal_2_au,female_informal_1_au, female_informal_2_au ] }

def count_peaks_by_video(videos, threshold=0.8):
    peak_counts = []
    for category, video_list in videos.items():
        peak_count = 0
        for video in video_list:
            happiness = average_happiness(mean_activation_by_emotion_over_time(video, emotion_au_mapping))
            peak_count = peak_count + count_peaks_by_emotion(happiness, threshold)
        peak_counts.append({category: peak_count})
    return peak_counts

# Count the number of peaks for each video
peak_counts = count_peaks_by_video(videos)

# plot the number of peaks for each category
fig = go.Figure()

fig.add_trace(go.Bar(
    x=list(peak_counts[0].keys()),
    y=list(peak_counts[0].values()),
    text=list(peak_counts[0].values()),
    textposition='auto',
))

fig.add_trace(go.Bar(
    x=list(peak_counts[1].keys()),
    y=list(peak_counts[1].values()),
    text=list(peak_counts[1].values()),
    textposition='auto',
))

fig.update_layout(
    title='Number of Peaks between Male and Female Speakers',
    yaxis_title='Number of Peaks',
    width=800,
    height=400,
    showlegend=False
)

fig.show()

In [28]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


# concatenate the data
df_all = pd.concat([formal, informal])

def pivot_au(df):
    pivoted = df.pivot_table(index='frame', columns='face_au', values='value')
    return pivoted

pivoted = pivot_au(df_all)

# Apply PCA to reduce the dimensionality
pca = PCA(n_components=2)
pivoted_pca = pca.fit_transform(pivoted)

# Apply k-means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(pivoted_pca)
clusters = kmeans.predict(pivoted_pca)

# evaluate the clustering
silhouette = silhouette_score(pivoted_pca, clusters)
print(f'Silhouette score: {silhouette}')

# Plot the clusters and the labels of informal and formal speech
# with plotly go
fig = go.Figure()

fig.add_trace(go.Scatter(x=pivoted_pca[clusters==0, 0], y=pivoted_pca[clusters==0, 1], mode='markers', name='Cluster 1'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==1, 0], y=pivoted_pca[clusters==1, 1], mode='markers', name='Cluster 2'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==2, 0], y=pivoted_pca[clusters==2, 1], mode='markers', name='Cluster 3'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==3, 0], y=pivoted_pca[clusters==3, 1], mode='markers', name='Cluster 4'))

fig.update_layout(title='Clusters of Formal and Informal Speech',
                    xaxis_title='PC1',
                    yaxis_title='PC2')

fig.show()

Silhouette score: 0.36806698236227525






In [30]:
# get the real labels
df_all['real_label'] = df_all.index.get_level_values(0)
real_labels = df_all['real_label']