In [57]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [58]:
female_formal_1 = pl.read_parquet('data/female_formal_1.parquet')
female_formal_2 = pl.read_parquet('data/female_formal_2.parquet')

female_informal_1 = pl.read_parquet('data/female_informal_1.parquet')
female_informal_2 = pl.read_parquet('data/female_informal_2.parquet')

male_formal_1 = pl.read_parquet('data/male_formal_1.parquet')
male_formal_2 = pl.read_parquet('data/male_formal_2.parquet')

male_informal_1 = pl.read_parquet('data/male_informal_1.parquet')
male_informal_2 = pl.read_parquet('data/male_informal_2.parquet')

In [59]:
n_au = 27 # Nr of action units

# Reference ids of the action units
au_ref = [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22,
          23, 24, 25, 26, 27, 32, 38, 39]

def extract_au(df, threshold=0.9):

    features = df.select(
        [   
            'time',
            'frame',
            'face_prob',
            pl.lit([list(range(n_au))]).alias("face_au"),
            pl.col("face_aus").alias("value").list.take(list(range(n_au)))
        ]
    ).filter((pl.col("value").is_not_null()) & 
             (pl.col("face_prob") > threshold) &
             (pl.col("time") <= 50)
             ).explode(["face_au", "value"])

    return features.to_pandas().drop(columns=['face_prob'])

# female
female_formal_1_au = extract_au(female_formal_1)
female_formal_2_au = extract_au(female_formal_2)

female_informal_1_au = extract_au(female_informal_1)
female_informal_2_au = extract_au(female_informal_2)

female_formal_au = pd.concat([female_formal_1_au, female_formal_2_au], keys=['Female Formal 1', ' Female Formal 2'])
female_informal_au = pd.concat([female_informal_1_au, female_informal_2_au], keys=['Female Inormal 1', ' Female Inormal 2'])

female = pd.concat([female_formal_au, female_informal_au])

# male
male_formal_1_au = extract_au(male_formal_1)
male_formal_2_au = extract_au(male_formal_2)

male_informal_1_au = extract_au(male_informal_1)
male_informal_2_au = extract_au(male_informal_2)

male_formal_au = pd.concat([male_formal_1_au, male_formal_2_au], keys=['Male Formal 1', ' Male Formal 2'])
male_informal_au = pd.concat([male_informal_1_au, male_informal_2_au], keys=['Male Informal 1', ' Male Informal 2'])

male = pd.concat([male_formal_au, male_informal_au])

# formal
formal = pd.concat([male_formal_au, female_formal_au])

# informal
informal = pd.concat([male_informal_au, female_informal_au])


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a literal of type List. To retain the old behavior, pass a Series instead, e.g. `Series(sequence)`.


Behavior for `lit` will change for sequence inputs. The result will change to be a l

In [60]:
# Comparing the formal and informal speech in terms of the mean activation of the action units

def mean_activation(df):
    mean = df.groupby('time')['value'].mean().reset_index()
    return mean

formal_mean = mean_activation(formal)
informal_mean = mean_activation(informal)

# Plotting the mean activation of the action units for formal and informal speech
fig = go.Figure()

fig.add_trace(go.Scatter(x=formal_mean['time'], y=formal_mean['value'], mode='lines', name='Formal'))
fig.add_trace(go.Scatter(x=informal_mean['time'], y=informal_mean['value'], mode='lines', name='Informal'))

fig.update_layout(title='Mean Activation of Action Units for Formal and Informal Speech',
                    xaxis_title='Time (s)',
                    yaxis_title='Mean Activation')

fig.show()

In [61]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


# concatenate the data
df_all = pd.concat([formal, informal])

def pivot_au(df):
    pivoted = df.pivot_table(index='frame', columns='face_au', values='value')
    return pivoted

pivoted = pivot_au(df_all)

# Apply PCA to reduce the dimensionality
pca = PCA(n_components=2)
pivoted_pca = pca.fit_transform(pivoted)

# Apply k-means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pivoted_pca)
clusters = kmeans.predict(pivoted_pca)

# evaluate the clustering
silhouette = silhouette_score(pivoted_pca, clusters)
print(f'Silhouette score: {silhouette}')

# Plot the clusters and the labels of informal and formal speech
# with plotly go
fig = go.Figure()

fig.add_trace(go.Scatter(x=pivoted_pca[clusters==0, 0], y=pivoted_pca[clusters==0, 1], mode='markers', name='Cluster 1'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==1, 0], y=pivoted_pca[clusters==1, 1], mode='markers', name='Cluster 2'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==2, 0], y=pivoted_pca[clusters==2, 1], mode='markers', name='Cluster 3'))
fig.add_trace(go.Scatter(x=pivoted_pca[clusters==3, 0], y=pivoted_pca[clusters==3, 1], mode='markers', name='Cluster 4'))

fig.update_layout(title='Clusters of Formal and Informal Speech',
                    xaxis_title='PC1',
                    yaxis_title='PC2')

fig.show()

Silhouette score: 0.36760463694468337




