In [1]:
import plotly.graph_objects as go
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Raw feature tables, two parquet files for each gender/register pairing.
# Paths are relative to the notebook's working directory.

# Female speakers
female_formal_1, female_formal_2 = (
    pl.read_parquet('data/female_formal_1.parquet'),
    pl.read_parquet('data/female_formal_2.parquet'),
)
female_informal_1, female_informal_2 = (
    pl.read_parquet('data/female_informal_1.parquet'),
    pl.read_parquet('data/female_informal_2.parquet'),
)

# Male speakers
male_formal_1, male_formal_2 = (
    pl.read_parquet('data/male_formal_1.parquet'),
    pl.read_parquet('data/male_formal_2.parquet'),
)
male_informal_1, male_informal_2 = (
    pl.read_parquet('data/male_informal_1.parquet'),
    pl.read_parquet('data/male_informal_2.parquet'),
)

In [3]:

def extract_sentimental(df, threshold=0.9, max_time=50):
    """Return the confident sentiment spans from the early part of a recording.

    Parameters
    ----------
    df : polars.DataFrame
        Must contain the columns ``time``, ``span_start``, ``span_end``,
        ``span_text``, ``span_confidence``, ``span_sent_pos``,
        ``span_sent_neg`` and ``span_sent_neu``.
    threshold : float, default 0.9
        Rows are kept only when ``span_confidence`` is strictly greater
        than this value.
    max_time : int or float, default 50
        Rows are kept only when ``time`` is less than or equal to this value.
        Previously hard-coded as 50; the default preserves that behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the rows passing both filters.
    """
    sentiment_columns = [
        'time',
        'span_start',
        'span_end',
        'span_text',
        'span_confidence',
        'span_sent_pos',
        'span_sent_neg',
        'span_sent_neu',
    ]

    is_confident = pl.col("span_confidence") > threshold
    in_time_window = pl.col("time") <= max_time

    features = df.select(sentiment_columns).filter(is_confident & in_time_window)

    # Downstream cells use pandas (concat / groupby), so convert before returning.
    return features.to_pandas()

# Confident, early-window sentiment spans for each of the eight recordings.
female_formal_1_sent, female_formal_2_sent = (
    extract_sentimental(female_formal_1),
    extract_sentimental(female_formal_2),
)
female_informal_1_sent, female_informal_2_sent = (
    extract_sentimental(female_informal_1),
    extract_sentimental(female_informal_2),
)
male_formal_1_sent, male_formal_2_sent = (
    extract_sentimental(male_formal_1),
    extract_sentimental(male_formal_2),
)
male_informal_1_sent, male_informal_2_sent = (
    extract_sentimental(male_informal_1),
    extract_sentimental(male_informal_2),
)

# Stack the two recordings of each gender/register pairing; the mapping keys
# become the outer index level, labelling the source recording of every row.
# NOTE(review): the `_au` suffix on the female frames differs from the `_sent`
# suffix used for the male frames -- names kept as-is in case other cells use them.
female_formal_au = pd.concat({
    'Female Formal 1': female_formal_1_sent,
    'Female Formal 2': female_formal_2_sent,
})
female_informal_au = pd.concat({
    'Female Informal 1': female_informal_1_sent,
    'Female Informal 2': female_informal_2_sent,
})
male_formal_sent = pd.concat({
    'Male Formal 1': male_formal_1_sent,
    'Male Formal 2': male_formal_2_sent,
})
male_informal_sent = pd.concat({
    'Male Informal 1': male_informal_1_sent,
    'Male Informal 2': male_informal_2_sent,
})

# Group by gender (both registers together) ...
female = pd.concat((female_formal_au, female_informal_au))
male = pd.concat((male_formal_sent, male_informal_sent))

# ... and by register (both genders together).
formal = pd.concat((male_formal_sent, female_formal_au))
informal = pd.concat((male_informal_sent, female_informal_au))

# Report the number of rows in each grouping.
print(f"Male: {male.shape[0]}")
print(f"Female: {female.shape[0]}")
print(f"Formal: {formal.shape[0]}")
print(f"Informal: {informal.shape[0]}")



Male: 464
Female: 517
Formal: 497
Informal: 484


In [11]:
# Mean sentiment for formal and informal videos
def mean_sentiment(df):
    """Average the three sentiment scores over all rows sharing a ``time`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``time``, ``span_sent_pos``, ``span_sent_neg``
        and ``span_sent_neu``; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``time`` with the mean of each score column;
        ``time`` is a regular column and the index is a fresh default index.
    """
    score_columns = ['span_sent_pos', 'span_sent_neg', 'span_sent_neu']
    scores_by_time = df.groupby('time')[score_columns]
    return scores_by_time.mean().reset_index()

formal_mean = mean_sentiment(formal)
informal_mean = mean_sentiment(informal)

# Get compound sentiment: positive minus negative plus neutral, limited to [-1, 1].
# NOTE(review): the neutral score is added rather than ignored -- confirm this
# is the intended definition of the compound score.
for mean_df in (formal_mean, informal_mean):
    compound = mean_df['span_sent_pos'] - mean_df['span_sent_neg'] + mean_df['span_sent_neu']
    # Vectorised clip replaces the per-row min/max lambda; NaN values stay NaN.
    mean_df['compound_sentiment'] = compound.clip(lower=-1, upper=1)

In [12]:
import plotly.graph_objects as go
import numpy as np

def plot_sentiment(df1, df2, title, labels=('Formal', 'Informal')):
    """Plot two compound-sentiment series over time with their mean levels.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each must provide ``time`` and ``compound_sentiment`` columns.
    title : str
        Figure title.
    labels : tuple of (str, str), default ('Formal', 'Informal')
        Legend names for ``df1`` and ``df2`` respectively. The dashed mean
        lines are named ``'<label> Mean'``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two items.
    """
    first_label, second_label = labels
    # First two colours of plotly's default colourway. Setting them on the
    # traces as well keeps every series the same colour as its mean line.
    first_color, second_color = '#636EFA', '#EF553B'

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df1['time'], y=df1['compound_sentiment'], mode='lines',
                             name=first_label, line=dict(color=first_color)))
    fig.add_trace(go.Scatter(x=df2['time'], y=df2['compound_sentiment'], mode='lines',
                             name=second_label, line=dict(color=second_color)))

    # nanmean so that time steps with a missing score do not turn the mean into NaN.
    fig.add_hline(y=np.nanmean(df1['compound_sentiment']), line_dash="dash",
                  name=f'{first_label} Mean', line_color=first_color)
    fig.add_hline(y=np.nanmean(df2['compound_sentiment']), line_dash="dash",
                  name=f'{second_label} Mean', line_color=second_color)

    fig.update_layout(
        xaxis=dict(title='Time (s)', showgrid=True),
        yaxis=dict(title='Sentiment Score', showgrid=True),
        title=title,
    )

    fig.show()


# Plot mean sentiment distributions for formal and informal speakers
plot_sentiment(formal_mean, informal_mean, 'Compound Sentiment Formal vs Informal',
               labels=('Formal', 'Informal'))

In [13]:
# Mean sentiment for male and female videos
male_mean = mean_sentiment(male)
female_mean = mean_sentiment(female)

# Get compound sentiment: positive minus negative plus neutral, limited to [-1, 1].
# NOTE(review): the neutral score is added rather than ignored -- confirm this
# is the intended definition of the compound score.
for mean_df in (male_mean, female_mean):
    compound = mean_df['span_sent_pos'] - mean_df['span_sent_neg'] + mean_df['span_sent_neu']
    # Vectorised clip replaces the per-row min/max lambda; NaN values stay NaN.
    mean_df['compound_sentiment'] = compound.clip(lower=-1, upper=1)


In [16]:
# NOTE: this cell redefines plot_sentiment, replacing the definition from the
# formal/informal cell for the rest of the session. Legend labels are now a
# parameter so a single definition can serve both comparisons.
def plot_sentiment(df1, df2, title, labels=('Male', 'Female')):
    """Plot two compound-sentiment series over time with their mean levels.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each must provide ``time`` and ``compound_sentiment`` columns.
    title : str
        Figure title.
    labels : tuple of (str, str), default ('Male', 'Female')
        Legend names for ``df1`` and ``df2`` respectively. The dashed mean
        lines are named ``'<label> Mean'``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two items.
    """
    first_label, second_label = labels
    # First two colours of plotly's default colourway. Setting them on the
    # traces as well keeps every series the same colour as its mean line.
    first_color, second_color = '#636EFA', '#EF553B'

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df1['time'], y=df1['compound_sentiment'], mode='lines',
                             name=first_label, line=dict(color=first_color)))
    fig.add_trace(go.Scatter(x=df2['time'], y=df2['compound_sentiment'], mode='lines',
                             name=second_label, line=dict(color=second_color)))

    # nanmean so that time steps with a missing score do not turn the mean into NaN.
    fig.add_hline(y=np.nanmean(df1['compound_sentiment']), line_dash="dash",
                  name=f'{first_label} Mean', line_color=first_color)
    fig.add_hline(y=np.nanmean(df2['compound_sentiment']), line_dash="dash",
                  name=f'{second_label} Mean', line_color=second_color)

    fig.update_layout(
        xaxis=dict(title='Time (s)', showgrid=True),
        yaxis=dict(title='Sentiment Score', showgrid=True),
        title=title,
    )

    fig.show()


# Plot mean sentiment distributions for male and female speakers
plot_sentiment(male_mean, female_mean, 'Compound Sentiment Male vs Female',
               labels=('Male', 'Female'))