In [56]:
# Adjusting and running the full analysis as described
# Importing necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

file_path = 'data/dataset_collaboration_with_survey_scores.csv'
data = pd.read_csv(file_path)

data_filtered = data[(data['project'] == 4)]
data_filtered = data_filtered[~((data_filtered['speaker_id'] == 5) | (
    data_filtered['next_speaker_id'] == 5))]

formal_speech = data_filtered[data_filtered['meeting_number'] < 9]
informal_speech = data_filtered[data_filtered['meeting_number'] >= 9]

In [57]:
num_meeting = informal_speech['speaker_id'].nunique()

In [58]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per meeting
    meeting_metrics = unique_speech_frequencies.groupby('meeting_number').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()

    # Summing interaction counts per meeting
    interaction_metrics = meetings.groupby('meeting_number').agg({
        'count': 'sum',
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'meeting_number')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='meeting_number', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)

    # Combining the metrics
    combined_metrics = meeting_metrics.merge(
        interaction_metrics, on='meeting_number')

    return combined_metrics


formal_speech_metrics = calculate_team_meeting_metrics(formal_speech)
informal_speech_metrics = calculate_team_meeting_metrics(informal_speech)
formal_speech_metrics, informal_speech_metrics

(   meeting_number  normalized_speech_frequency  count
 0               1                  6062.016807    233
 1               2                  5393.385827    244
 2               3                  5103.684211    383
 3               4                  5049.000000    213
 4               5                  5401.621622    254
 5               6                  5748.965517    155
 6               7                  5074.528302    226
 7               8                  5616.617647    716,
    meeting_number  normalized_speech_frequency  count
 0               9                  5627.333333   1296
 1              10                  5753.048780    677
 2              11                  6582.699387    913
 3              12                  4808.888889    270)

In [59]:
# Function to perform t-tests
def perform_ttest(group1, group2):
    ttest_results = {}
    ttest_results['normalized_speech_frequency'] = ttest_ind(
        group1['normalized_speech_frequency'], group2['normalized_speech_frequency'], equal_var=False)
    ttest_results['count'] = ttest_ind(
        group1['count'], group2['count'], equal_var=False)
    return ttest_results


def dataframe_generator(ttest_results, group1, group2):
    variables = ['normalized_speech_frequency', 'count']
    rows = []
    for var in variables:
        row_meeting = {
            'Variable': var,
            'Group': 'before_feedback',
            'Mean': group1[var].mean(),
            'Std': group1[var].std(),
            'df': len(group1[var]) - 1,
            't-statistic': ttest_results[var].statistic,
            'p-value': ttest_results[var].pvalue
        }
        row_chat = {
            'Variable': var,
            'Group': 'after_feedback',
            'Mean': group2[var].mean(),
            'Std': group2[var].std(),
            'df': len(group2[var]) - 1,
            't-statistic': '',
            'p-value': ''
        }
        rows.append(row_meeting)
        rows.append(row_chat)

    detailed_df = pd.DataFrame(rows)
    return detailed_df


team_ttest_results = perform_ttest(
    formal_speech_metrics, informal_speech_metrics)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-0.6801478780766557, pvalue=0.5359105541701332, df=3.766338073610715),
 'count': TtestResult(statistic=-2.169372138987271, pvalue=0.10496125977578503, df=3.5299325693813333)}

In [60]:
dataframe_generator(team_ttest_results,
                    formal_speech_metrics, informal_speech_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,5431.227491,361.77092,7,-0.680148,0.535911
1,normalized_speech_frequency,after_feedback,5692.992597,725.978747,3,,
2,count,before_feedback,303.0,178.759216,7,-2.169372,0.104961
3,count,after_feedback,789.0,429.856565,3,,


In [61]:
# Function to calculate individual metrics adjusted for meeting count
def calculate_individual_metrics(meetings, meeting_count):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per speaker
    individual_metrics = unique_speech_frequencies.groupby('speaker_id').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()
    individual_metrics['normalized_speech_frequency'] /= meeting_count

    # Summing interaction counts per speaker
    interaction_metrics = meetings.groupby('speaker_id').agg({
        'count': 'sum'
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'speaker_id')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='speaker_id', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)
    interaction_metrics['count'] /= meeting_count

    # Combining the metrics
    combined_metrics = individual_metrics.merge(
        interaction_metrics, on='speaker_id')

    return combined_metrics


formal_speech_individual_metrics = calculate_individual_metrics(
    formal_speech, 8)
informal_speech_individual_metrics = calculate_individual_metrics(
    informal_speech, max(data_filtered['meeting_number'])-8)
formal_speech_individual_metrics, informal_speech_individual_metrics

(   speaker_id  normalized_speech_frequency    count
 0           0                   667.962631   68.125
 1           1                   455.204581   52.375
 2           2                   224.732790   25.875
 3           3                  3801.366158  125.125
 4           4                   281.961331   31.500,
    speaker_id  normalized_speech_frequency   count
 0           0                  1162.902317  243.25
 1           1                   366.243016  117.00
 2           2                   116.321207   35.00
 3           3                  3637.035690  277.75
 4           4                   410.490367  116.00)

In [62]:
individual_ttest_results = perform_ttest(
    formal_speech_individual_metrics, informal_speech_individual_metrics)


individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-0.055574334175484005, pvalue=0.9570473767766241, df=7.978580055137914),
 'count': TtestResult(statistic=-2.015361461031069, pvalue=0.09744377790873894, df=5.231684306071025)}

In [63]:
dataframe_generator(individual_ttest_results,
                    formal_speech_individual_metrics, informal_speech_individual_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,1086.245498,1527.590469,4,-0.055574,0.957047
1,normalized_speech_frequency,after_feedback,1138.598519,1450.388257,4,,
2,count,before_feedback,60.6,39.813275,4,-2.015361,0.097444
3,count,after_feedback,157.8,100.226525,4,,


In [64]:
grouped_formal = formal_speech.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()
grouped_informal = informal_speech.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()

In [65]:
grouped_formal

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,1,652.436975
1,0,2,894.80315
2,0,3,1063.684211
3,0,4,148.5
4,0,5,719.459459
5,0,6,675.172414
6,0,7,99.056604
7,0,8,1090.588235
8,1,1,358.487395
9,1,2,499.370079


In [66]:
grouped_informal

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,9,1328.0
1,0,10,1033.170732
2,0,11,1608.957055
3,0,12,681.481481
4,1,9,544.0
5,1,10,594.146341
6,1,11,214.233129
7,1,12,112.592593
8,2,9,56.666667
9,2,10,145.609756


In [67]:
# 결과를 저장할 리스트 초기화
results = []

# 각 speaker_id에 대해 루프 실행
for speaker_id in range(num_meeting):
    formal_speech_speaker = grouped_formal[grouped_formal['speaker_id']
                                             == speaker_id]['normalized_speech_frequency']
    informal_speech_speaker = grouped_informal[grouped_informal['speaker_id']
                                           == speaker_id]['normalized_speech_frequency']
    # t-test 수행
    t_statistic, p_value = ttest_ind(
        formal_speech_speaker, informal_speech_speaker)

    mean_formal = np.mean(formal_speech_speaker)
    std_foraml = np.std(formal_speech_speaker)
    mean_informal = np.mean(informal_speech_speaker)
    std_informal = np.std(informal_speech_speaker)

    df = len(formal_speech_speaker) + len(informal_speech_speaker) - 2

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Formal",
        "Mean": mean_formal,
        "SD": std_foraml,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Informal",
        "Mean": mean_informal,
        "SD": std_informal,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0,Formal,667.962631,350.554032,-2.116749,10.0,0.060353
1,0,Informal,1162.902317,344.535332,,,
2,1,Formal,455.204581,173.137496,0.71676,10.0,0.489924
3,1,Informal,366.243016,206.750462,,,
4,2,Formal,224.73279,96.477885,1.978657,10.0,0.076042
5,2,Informal,116.321207,37.381127,,,
6,3,Formal,3801.366158,654.922483,0.415045,10.0,0.686873
7,3,Informal,3637.03569,432.716229,,,
8,4,Formal,281.961331,118.034863,-1.301662,10.0,0.222218
9,4,Informal,410.490367,192.706437,,,


In [70]:
grouped_formal = formal_speech.groupby(['speaker_id', 'meeting_number'])[
    'count'].sum().reset_index()
grouped_informal = informal_speech.groupby(['speaker_id', 'meeting_number'])[
    'count'].sum().reset_index()

In [71]:
results = []

for speaker_id in range(num_meeting):
    formal_speech_speaker = grouped_formal[grouped_formal['speaker_id']
                                             == speaker_id]['count']
    informal_speech_speaker = grouped_informal[grouped_informal['speaker_id']
                                           == speaker_id]['count']

    # t-test 독립표본 검정 수행
    t_statistic, p_value = ttest_ind(
        formal_speech_speaker, informal_speech_speaker)

    # 평균과 표준편차 계산
    mean_formal = np.mean(formal_speech_speaker)
    std_formal = np.std(formal_speech_speaker)
    mean_informal = np.mean(informal_speech_speaker)
    std_informal = np.std(informal_speech_speaker)

    # 자유도 계산
    df = len(formal_speech_speaker) + len(informal_speech_speaker) - 2

    # 결과 저장
    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Formal",
        "Mean": mean_formal,
        "SD": std_formal,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": "",
        "Condition": "Informal",
        "Mean": mean_informal,
        "SD": std_informal,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 데이터프레임 생성
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0.0,Formal,71.875,63.732718,-3.116773,10.0,0.010934
1,,Informal,247.0,113.681573,,,
2,1.0,Formal,55.375,34.535987,-1.66199,10.0,0.127496
3,,Informal,117.75,83.694011,,,
4,2.0,Formal,27.25,11.765947,-1.078302,10.0,0.306222
5,,Informal,35.0,8.215838,,,
6,3.0,Formal,465.125,517.72059,0.569617,10.0,0.581507
7,,Informal,301.75,111.138146,,,
8,4.0,Formal,42.375,32.88593,-2.18039,10.0,0.05421
9,,Informal,117.25,75.489652,,,
