In [37]:
import pandas as pd
from scipy.stats import ttest_ind

In [38]:
# Load the meeting dataset (collaboration metrics joined with survey scores).
meeting_dataset = pd.read_csv('data/dataset_collaboration_with_survey_scores.csv')

In [39]:
# Restrict the analysis to project 4.
meeting_dataset = meeting_dataset.loc[meeting_dataset['project'] == 4]

# Drop every row in which speaker 5 takes part, either as the speaker or as
# the next speaker.
# NOTE(review): the original comment said this removal was meant for
# meeting_number 12 only, but no meeting_number condition is applied here, so
# speaker 5 is removed from all meetings. Confirm which is intended.
meeting_dataset = meeting_dataset[
    (meeting_dataset['speaker_id'] != 5)
    & (meeting_dataset['next_speaker_id'] != 5)
]

meeting_dataset

Unnamed: 0,id,project,meeting_number,speaker_number,speech_frequency,total_words,duration,normalized_speech_frequency,speaker_id,next_speaker_id,...,interaction_equality_index,degree_centrality,indegree_centrality,outdegree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality,pagerank,overall_collaboration_score,individual_collaboration_score
80,4_1_SPEAKER_00,4,1,0,1294,12023,119,10.873950,0,0,...,0.330335,116,58,58,0.250000,0.125000,0.573573,0.237750,3.0,4.0
81,4_1_SPEAKER_00,4,1,0,1294,12023,119,10.873950,0,1,...,0.330335,116,58,58,0.250000,0.125000,0.573573,0.237750,3.0,3.0
82,4_1_SPEAKER_00,4,1,0,1294,12023,119,10.873950,0,2,...,0.330335,116,58,58,0.250000,0.125000,0.573573,0.237750,3.0,3.0
83,4_1_SPEAKER_00,4,1,0,1294,12023,119,10.873950,0,3,...,0.330335,116,58,58,0.250000,0.125000,0.573573,0.237750,3.0,6.0
84,4_1_SPEAKER_00,4,1,0,1294,12023,119,10.873950,0,4,...,0.330335,116,58,58,0.250000,0.125000,0.573573,0.237750,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,4_13_SPEAKER_02,4,13,2,402,17930,162,2.481481,2,3,...,0.376036,66,33,33,0.791667,0.121212,0.093852,0.075995,,-1.0
406,4_13_SPEAKER_03,4,13,3,10120,17930,162,62.469136,3,0,...,0.376036,487,243,244,0.000000,0.051948,0.674642,0.370750,,-1.0
407,4_13_SPEAKER_03,4,13,3,10120,17930,162,62.469136,3,1,...,0.376036,487,243,244,0.000000,0.051948,0.674642,0.370750,,-1.0
408,4_13_SPEAKER_03,4,13,3,10120,17930,162,62.469136,3,2,...,0.376036,487,243,244,0.000000,0.051948,0.674642,0.370750,,-1.0


In [40]:
# Load the chat dataset from its CSV export.
chat_dataset = pd.read_csv(filepath_or_buffer='data/kakao_data.csv')

In [41]:
# Display the loaded chat dataset for a quick visual check.
chat_dataset

Unnamed: 0,id,project,meeting_number,speaker_number,speech_frequency,total_words,duration,normalized_speech_frequency,speaker_id,next_speaker_id,...,weighted_network_density,gini_coefficient,interaction_equality_index,degree_centrality,indegree_centrality,outdegree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality,pagerank
0,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,0,...,0.4,0.200000,0.500000,4,2,2,0.666667,0.666667,0.402992,0.183218
1,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,1,...,0.4,0.200000,0.500000,4,2,2,0.666667,0.666667,0.402992,0.183218
2,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,2,...,0.4,0.200000,0.500000,4,2,2,0.666667,0.666667,0.402992,0.183218
3,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,3,...,0.4,0.200000,0.500000,4,2,2,0.666667,0.666667,0.402992,0.183218
4,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,4,...,0.4,0.200000,0.500000,4,2,2,0.666667,0.666667,0.402992,0.183218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,4_11_SPEAKER_04,4,11,4,960,5474,48,20.000000,4,0,...,60.0,0.278667,0.471454,910,455,455,0.000000,0.017621,0.600885,0.275658
205,4_11_SPEAKER_04,4,11,4,960,5474,48,20.000000,4,1,...,60.0,0.278667,0.471454,910,455,455,0.000000,0.017621,0.600885,0.275658
206,4_11_SPEAKER_04,4,11,4,960,5474,48,20.000000,4,2,...,60.0,0.278667,0.471454,910,455,455,0.000000,0.017621,0.600885,0.275658
207,4_11_SPEAKER_04,4,11,4,960,5474,48,20.000000,4,3,...,60.0,0.278667,0.471454,910,455,455,0.000000,0.017621,0.600885,0.275658


In [42]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    """Aggregate speech and interaction metrics to one row per meeting.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Must contain the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``meeting_number``, ``normalized_speech_frequency`` (sum over
        speakers of each speaker's per-meeting mean) and ``count`` (sum of
        ``count`` over the meeting's rows, excluding rows where
        ``speaker_id == next_speaker_id``).
    """
    # A speaker's frequency is repeated on several rows of the same meeting;
    # averaging per (meeting, speaker) collapses the repetition to one value.
    per_speaker_frequency = (
        meetings
        .groupby(['meeting_number', 'speaker_id'])['normalized_speech_frequency']
        .mean()
        .reset_index()
    )
    speech_per_meeting = (
        per_speaker_frequency
        .groupby('meeting_number')
        .agg({'normalized_speech_frequency': 'sum'})
        .reset_index()
    )

    # Total interaction count per meeting, self interactions still included.
    counts_per_meeting = (
        meetings
        .groupby('meeting_number')
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Interaction count per meeting coming from rows where a speaker is
    # followed by themselves.
    is_self_interaction = meetings['speaker_id'] == meetings['next_speaker_id']
    self_counts_per_meeting = (
        meetings[is_self_interaction]
        .groupby('meeting_number')['count']
        .sum()
        .reset_index()
    )

    # Left merge keeps meetings that have no self-interaction rows; their
    # missing self count is treated as zero before subtracting.
    merged_counts = counts_per_meeting.merge(
        self_counts_per_meeting,
        on='meeting_number',
        how='left',
        suffixes=('', '_self'),
    )
    merged_counts['count'] = (
        merged_counts['count'] - merged_counts['count_self'].fillna(0)
    )
    interactions_per_meeting = merged_counts.drop(columns=['count_self'])

    return speech_per_meeting.merge(interactions_per_meeting, on='meeting_number')


# Per-meeting team metrics for the meeting data and for the chat data.
meeting = calculate_team_meeting_metrics(meeting_dataset)
chat = calculate_team_meeting_metrics(chat_dataset)
meeting, chat

(    meeting_number  normalized_speech_frequency  count
 0                1                   101.033613    233
 1                2                    89.889764    244
 2                3                    85.061404    383
 3                4                    84.150000    213
 4                5                    90.027027    254
 5                6                    95.816092    155
 6                7                    84.575472    226
 7                8                    93.610294    716
 8                9                    93.788889   1296
 9               10                    95.884146    677
 10              11                   109.711656    913
 11              12                    80.148148    270
 12              13                    80.000000    267,
    meeting_number  normalized_speech_frequency  count
 0               1                     0.916667      4
 1               2                     5.777778     42
 2               3                     2.375000   

In [43]:
def perform_ttest(group1, group2,
                  variables=('normalized_speech_frequency', 'count')):
    """Run an independent two-sample t-test per variable between two groups.

    Each test is run with ``equal_var=False`` (Welch's t-test), so the two
    groups are not assumed to share a variance.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame (or any mapping of column -> samples)
        Both must provide every name listed in ``variables``.
    variables : iterable of str, optional
        Names of the columns to compare. Defaults to the two metrics used
        throughout this notebook.

    Returns
    -------
    dict
        Maps each variable name to the result object returned by
        ``scipy.stats.ttest_ind``, in the order given by ``variables``.
    """
    return {
        var: ttest_ind(group1[var], group2[var], equal_var=False)
        for var in variables
    }


def _group_summary_row(variable, label, values, statistic, pvalue):
    """Build one summary-table row describing ``values`` for a single group."""
    return {
        'Variable': variable,
        'Group': label,
        'Mean': values.mean(),
        'Std': values.std(),
        # Per-group sample size minus one. NOTE(review): this is not the
        # degrees of freedom of the Welch test itself; the scipy result shown
        # in this notebook exposes that separately as ``.df``.
        'df': len(values) - 1,
        't-statistic': statistic,
        'p-value': pvalue,
    }


def dataframe_generator(ttest_results, group1, group2,
                        variables=('normalized_speech_frequency', 'count'),
                        group_labels=('Meeting', 'Chat')):
    """Build a summary table of descriptive statistics and t-test results.

    For every variable two rows are emitted: one for ``group1`` carrying the
    t-statistic and p-value, and one for ``group2`` where those two cells are
    left as empty strings so each test result appears only once.

    Parameters
    ----------
    ttest_results : mapping
        Maps each variable name to an object with ``statistic`` and ``pvalue``
        attributes (e.g. the output of ``perform_ttest``).
    group1, group2 : pandas.DataFrame
        The two groups; each must contain every column in ``variables``.
    variables : iterable of str, optional
        Variables to report, in output order.
    group_labels : pair of str, optional
        Labels written to the ``Group`` column for ``group1`` and ``group2``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable``, ``Group``, ``Mean``, ``Std``, ``df``,
        ``t-statistic`` and ``p-value``.
    """
    first_label, second_label = group_labels
    rows = []
    for var in variables:
        result = ttest_results[var]
        rows.append(_group_summary_row(
            var, first_label, group1[var], result.statistic, result.pvalue))
        rows.append(_group_summary_row(
            var, second_label, group2[var], '', ''))

    # Passing the columns explicitly keeps the schema stable even when
    # ``variables`` is empty.
    return pd.DataFrame(rows, columns=[
        'Variable', 'Group', 'Mean', 'Std', 'df', 't-statistic', 'p-value'])

In [44]:
# Performing t-tests for team metrics
team_ttest_results = perform_ttest(meeting, chat)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=5.456186171649123, pvalue=0.00047222008200934834, df=8.594266332339354),
 'count': TtestResult(statistic=2.6383446022660633, pvalue=0.015971105401018697, df=19.50791384944123)}

In [45]:

# Summary table (descriptives + test results) for the team-level comparison.
dataframe_generator(
    ttest_results=team_ttest_results,
    group1=meeting,
    group2=chat,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,Meeting,91.053577,8.54795,12,5.456186,0.000472
1,normalized_speech_frequency,Chat,22.526389,37.00126,8,,
2,count,Meeting,449.769231,346.897957,12,2.638345,0.015971
3,count,Chat,141.444444,199.001326,8,,


In [46]:
def calculate_individual_metrics(meetings, meeting_count=None):
    """Compute per-speaker speech and interaction metrics averaged over meetings.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Must contain the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.
    meeting_count : int or float, optional
        Divisor used to turn per-speaker totals into per-meeting averages.
        When omitted, the number of distinct ``meeting_number`` values in
        ``meetings`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per speaker with the columns ``speaker_id``,
        ``normalized_speech_frequency`` and ``count``. ``count`` excludes rows
        where ``speaker_id == next_speaker_id``. Both metrics are totals
        divided by ``meeting_count``.

    Raises
    ------
    ValueError
        If ``meeting_count`` (given or derived) is not positive, since the
        division would otherwise silently yield inf/NaN values.
    """
    if meeting_count is None:
        meeting_count = meetings['meeting_number'].nunique()
    if meeting_count <= 0:
        raise ValueError(
            f'meeting_count must be a positive number, got {meeting_count!r}')

    # A speaker's frequency is repeated on several rows of the same meeting;
    # averaging per (meeting, speaker) collapses the repetition to one value.
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Total speech frequency per speaker, averaged over the meetings.
    individual_metrics = unique_speech_frequencies.groupby('speaker_id').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()
    individual_metrics['normalized_speech_frequency'] /= meeting_count

    # Total interaction count per speaker, self interactions still included.
    interaction_metrics = meetings.groupby('speaker_id').agg({
        'count': 'sum'
    }).reset_index()

    # Interaction count per speaker coming from rows where the speaker is
    # followed by themselves.
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'speaker_id')['count'].sum().reset_index()

    # Left merge keeps speakers without self-interaction rows; their missing
    # self count is treated as zero before subtracting.
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='speaker_id', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics = interaction_metrics.drop(columns=['count_self'])
    interaction_metrics['count'] /= meeting_count

    # One row per speaker carrying both averaged metrics.
    return individual_metrics.merge(interaction_metrics, on='speaker_id')


# Average over the number of distinct meetings present in each dataset instead
# of hard-coded counts, so the divisor stays correct if the data or the
# filtering above changes.
meeting_individual_metrics = calculate_individual_metrics(
    meeting_dataset, meeting_dataset['meeting_number'].nunique())

chat_individual_metrics = calculate_individual_metrics(
    chat_dataset, chat_dataset['meeting_number'].nunique())
meeting_individual_metrics, chat_individual_metrics

(   speaker_id  normalized_speech_frequency       count
 0           0                    13.681547  123.153846
 1           1                     6.837533   72.153846
 2           2                     3.092354   29.000000
 3           3                    62.445154  170.384615
 4           4                     4.996990   55.076923,
    speaker_id  normalized_speech_frequency      count
 0           0                     3.030401  23.000000
 1           1                     2.476698  24.888889
 2           2                     0.690123  11.333333
 3           3                    12.646605  46.777778
 4           4                     3.682562  35.444444)

In [47]:
# t-tests on the per-speaker metrics: meeting data vs. chat data
individual_ttest_results = perform_ttest(
    group1=meeting_individual_metrics,
    group2=chat_individual_metrics,
)
individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=1.202620216166271, pvalue=0.2914084150235773, df=4.279542527940037),
 'count': TtestResult(statistic=2.369974734936637, pvalue=0.07031852136513916, df=4.447926333926117)}

In [48]:
# Summary table (descriptives + test results) for the per-speaker comparison.
dataframe_generator(
    ttest_results=individual_ttest_results,
    group1=meeting_individual_metrics,
    group2=chat_individual_metrics,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,Meeting,18.210715,25.048531,4,1.20262,0.291408
1,normalized_speech_frequency,Chat,4.505278,4.685186,4,,
2,count,Meeting,89.953846,56.612631,4,2.369975,0.070319
3,count,Chat,28.288889,13.417006,4,,
