In [10]:
# Adjusting and running the full analysis as described
# Importing necessary libraries
import pandas as pd
from scipy.stats import ttest_ind

file_path = 'data/dataset_collaboration_with_survey_scores.csv'
data = pd.read_csv(file_path)

data_filtered = data[(data['project'] == 4)]
data_filtered = data_filtered[~((data_filtered['speaker_id'] == 5) | (
    data_filtered['next_speaker_id'] == 5))]

before_feedback = data_filtered[data_filtered['meeting_number'] < 10]
after_feedback = data_filtered[data_filtered['meeting_number'] >= 10]

In [11]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per meeting
    meeting_metrics = unique_speech_frequencies.groupby('meeting_number').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()

    # Summing interaction counts per meeting
    interaction_metrics = meetings.groupby('meeting_number').agg({
        'count': 'sum',
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'meeting_number')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='meeting_number', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)

    # Combining the metrics
    combined_metrics = meeting_metrics.merge(
        interaction_metrics, on='meeting_number')

    return combined_metrics


before_feedback_metrics = calculate_team_meeting_metrics(before_feedback)
after_feedback_metrics = calculate_team_meeting_metrics(after_feedback)
before_feedback_metrics, after_feedback_metrics

(   meeting_number  normalized_speech_frequency  count
 0               1                  6062.016807    233
 1               2                  5393.385827    244
 2               3                  5103.684211    383
 3               4                  5049.000000    213
 4               5                  5401.621622    254
 5               6                  5748.965517    155
 6               7                  5074.528302    226
 7               8                  5616.617647    716
 8               9                  5627.333333   1296,
    meeting_number  normalized_speech_frequency  count
 0              10                  5753.048780    677
 1              11                  6582.699387    913
 2              12                  4808.888889    270)

In [12]:
# Function to perform t-tests
def perform_ttest(group1, group2):
    ttest_results = {}
    ttest_results['normalized_speech_frequency'] = ttest_ind(
        group1['normalized_speech_frequency'], group2['normalized_speech_frequency'], equal_var=False)
    ttest_results['count'] = ttest_ind(
        group1['count'], group2['count'], equal_var=False)
    return ttest_results


def dataframe_generator(ttest_results, group1, group2):
    variables = ['normalized_speech_frequency', 'count']
    rows = []
    for var in variables:
        row_meeting = {
            'Variable': var,
            'Group': 'before_feedback',
            'Mean': group1[var].mean(),
            'Std': group1[var].std(),
            'df': len(group1[var]) - 1,
            't-statistic': ttest_results[var].statistic,
            'p-value': ttest_results[var].pvalue
        }
        row_chat = {
            'Variable': var,
            'Group': 'after_feedback',
            'Mean': group2[var].mean(),
            'Std': group2[var].std(),
            'df': len(group2[var]) - 1,
            't-statistic': '',
            'p-value': ''
        }
        rows.append(row_meeting)
        rows.append(row_chat)

    detailed_df = pd.DataFrame(rows)
    return detailed_df


team_ttest_results = perform_ttest(
    before_feedback_metrics, after_feedback_metrics)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-0.49865938637089396, pvalue=0.663369423899092, df=2.204740216915547),
 'count': TtestResult(statistic=-0.9192314831223931, pvalue=0.41092409977527106, df=3.9243956996826674)}

In [13]:
dataframe_generator(team_ttest_results,
                    before_feedback_metrics, after_feedback_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,5453.017029,344.661398,8,-0.498659,0.663369
1,normalized_speech_frequency,after_feedback,5714.879019,887.521052,2,,
2,count,before_feedback,413.333333,370.838914,8,-0.919231,0.410924
3,count,after_feedback,620.0,325.267582,2,,


In [14]:
# Function to calculate individual metrics adjusted for meeting count
def calculate_individual_metrics(meetings, meeting_count):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per speaker
    individual_metrics = unique_speech_frequencies.groupby('speaker_id').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()
    individual_metrics['normalized_speech_frequency'] /= meeting_count

    # Summing interaction counts per speaker
    interaction_metrics = meetings.groupby('speaker_id').agg({
        'count': 'sum'
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'speaker_id')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='speaker_id', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)
    interaction_metrics['count'] /= meeting_count

    # Combining the metrics
    combined_metrics = individual_metrics.merge(
        interaction_metrics, on='speaker_id')

    return combined_metrics


before_feedback_individual_metrics = calculate_individual_metrics(
    before_feedback, 9)
after_feedback_individual_metrics = calculate_individual_metrics(
    after_feedback, max(data_filtered['meeting_number'])-9)
before_feedback_individual_metrics, after_feedback_individual_metrics

(   speaker_id  normalized_speech_frequency       count
 0           0                   741.300116  101.444444
 1           1                   465.070739   74.777778
 2           2                   206.058777   26.111111
 3           3                  3717.658807  157.555556
 4           4                   322.928591   53.444444,
    speaker_id  normalized_speech_frequency       count
 0           0                  1107.869756  201.666667
 1           1                   306.990688   71.333333
 2           2                   136.206054   37.333333
 3           3                  3833.380920  231.333333
 4           4                   330.431600   78.333333)

In [15]:
individual_ttest_results = perform_ttest(
    before_feedback_individual_metrics, after_feedback_individual_metrics)


individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-0.054603980943287306, pvalue=0.9577956242002408, df=7.983918801249476),
 'count': TtestResult(statistic=-0.9242582614819932, pvalue=0.3887732909427246, df=6.419446384095277)}

In [16]:
dataframe_generator(individual_ttest_results,
                    before_feedback_individual_metrics, after_feedback_individual_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,1090.603406,1482.098834,4,-0.054604,0.957796
1,normalized_speech_frequency,after_feedback,1142.975804,1550.177139,4,,
2,count,before_feedback,82.666667,50.188779,4,-0.924258,0.388773
3,count,after_feedback,124.0,86.491168,4,,


In [17]:
import numpy as np
results = []

for speaker_id in range(5):
    before_feedback_speaker = before_feedback[before_feedback['speaker_id']
                                              == speaker_id]['normalized_speech_frequency']
    after_feedback_speaker = after_feedback[after_feedback['speaker_id']
                                            == speaker_id]['normalized_speech_frequency']

    # t-test 독립표본 검정 수행
    t_statistic, p_value = ttest_ind(
        before_feedback_speaker, after_feedback_speaker)

    # 평균과 표준편차 계산
    mean_before = np.mean(before_feedback_speaker)
    std_before = np.std(before_feedback_speaker)
    mean_after = np.mean(after_feedback_speaker)
    std_after = np.std(after_feedback_speaker)

    # 자유도 계산
    df = len(before_feedback_speaker) + len(after_feedback_speaker) - 2

    # 결과 저장
    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Before",
        "Mean": mean_before,
        "SD": std_before,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": "",
        "Condition": "After",
        "Mean": mean_after,
        "SD": std_after,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 데이터프레임 생성
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0.0,Before,741.300116,390.206339,-3.113605,58.0,0.002869
1,,After,1107.869756,382.306764,,,
2,1.0,Before,465.070739,165.603709,2.946298,58.0,0.004624
3,,After,306.990688,207.246193,,,
4,2.0,Before,206.058777,105.183232,2.518181,58.0,0.014578
5,,After,136.206054,16.77805,,,
6,3.0,Before,3717.658807,661.302247,-0.643354,58.0,0.52253
7,,After,3833.38092,308.953041,,,
8,4.0,Before,322.928591,160.657195,-0.155473,58.0,0.876988
9,,After,330.4316,154.520991,,,


In [18]:
grouped_before = before_feedback.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()
grouped_after = after_feedback.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()
grouped_before

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,1,652.436975
1,0,2,894.80315
2,0,3,1063.684211
3,0,4,148.5
4,0,5,719.459459
5,0,6,675.172414
6,0,7,99.056604
7,0,8,1090.588235
8,0,9,1328.0
9,1,1,358.487395


In [19]:
grouped_after

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,10,1033.170732
1,0,11,1608.957055
2,0,12,681.481481
3,1,10,594.146341
4,1,11,214.233129
5,1,12,112.592593
6,2,10,145.609756
7,2,11,112.638037
8,2,12,150.37037
9,3,10,3504.878049


In [20]:

# 결과를 저장할 리스트 초기화
results = []

# 각 speaker_id에 대해 루프 실행
for speaker_id in range(5):
    before_feedback_speaker = grouped_before[grouped_before['speaker_id']
                                             == speaker_id]['normalized_speech_frequency']
    after_feedback_speaker = grouped_after[grouped_after['speaker_id']
                                           == speaker_id]['normalized_speech_frequency']
    # t-test 수행
    t_statistic, p_value = ttest_ind(
        before_feedback_speaker, after_feedback_speaker)

    mean_before = np.mean(before_feedback_speaker)
    std_before = np.std(before_feedback_speaker)
    mean_after = np.mean(after_feedback_speaker)
    std_after = np.std(after_feedback_speaker)

    df = len(before_feedback_speaker) + len(after_feedback_speaker) - 2

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Before",
        "Mean": mean_before,
        "SD": std_before,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "After",
        "Mean": mean_after,
        "SD": std_after,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0,Before,741.300116,390.206339,-1.292854,10.0,0.225132
1,0,After,1107.869756,382.306764,,,
2,1,Before,465.070739,165.603709,1.223383,10.0,0.24923
3,1,After,306.990688,207.246193,,,
4,2,Before,206.058777,105.183232,1.045618,10.0,0.320354
5,2,After,136.206054,16.77805,,,
6,3,Before,3717.658807,661.302247,-0.267138,10.0,0.794791
7,3,After,3833.38092,308.953041,,,
8,4,Before,322.928591,160.657195,-0.064557,10.0,0.949799
9,4,After,330.4316,154.520991,,,


In [21]:
results = []

for speaker_id in range(5):
    before_feedback_speaker = before_feedback[before_feedback['speaker_id']
                                              == speaker_id]['count']
    after_feedback_speaker = after_feedback[after_feedback['speaker_id']
                                            == speaker_id]['count']

    # t-test 독립표본 검정 수행
    t_statistic, p_value = ttest_ind(
        before_feedback_speaker, after_feedback_speaker)

    # 평균과 표준편차 계산
    mean_before = np.mean(before_feedback_speaker)
    std_before = np.std(before_feedback_speaker)
    mean_after = np.mean(after_feedback_speaker)
    std_after = np.std(after_feedback_speaker)

    # 자유도 계산
    df = len(before_feedback_speaker) + len(after_feedback_speaker) - 2

    # 결과 저장
    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Before",
        "Mean": mean_before,
        "SD": std_before,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": "",
        "Condition": "After",
        "Mean": mean_after,
        "SD": std_after,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 데이터프레임 생성
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0.0,Before,21.111111,38.570698,-1.464939,58.0,0.148337
1,,After,40.866667,58.718386,,,
2,1.0,Before,15.488889,25.079095,0.14484,58.0,0.88534
3,,After,14.466667,16.728685,,,
4,2.0,Before,5.466667,7.03515,-0.924616,58.0,0.358997
5,,After,7.466667,7.419494,,,
6,3.0,Before,92.377778,254.02767,0.609424,58.0,0.544623
7,,After,51.4,55.531132,,,
8,4.0,Before,12.688889,22.417079,-0.471603,58.0,0.638979
9,,After,15.8,19.633984,,,
