In [5]:
# Adjusting and running the full analysis as described
# Importing necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

file_path = 'data/dataset_collaboration_with_survey_scores.csv'
data = pd.read_csv(file_path)

data_filtered = data[(data['project'] == 4)]
data_filtered = data_filtered[~((data_filtered['speaker_id'] == 5) | (
    data_filtered['next_speaker_id'] == 5))]

# Splitting the data into online (meetings 1-7, 10) and offline (meetings 8, 9, 11)
online_meetings = data_filtered[data_filtered['meeting_number'].isin([
                                                                     1, 2, 3, 4, 5, 6, 7, 10])]
offline_meetings = data_filtered[data_filtered['meeting_number'].isin([
                                                                      8, 9, 11])]

In [6]:
num_meeting = offline_meetings['speaker_id'].nunique()

In [7]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per meeting
    meeting_metrics = unique_speech_frequencies.groupby('meeting_number').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()

    # Summing interaction counts per meeting
    interaction_metrics = meetings.groupby('meeting_number').agg({
        'count': 'sum',
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'meeting_number')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='meeting_number', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)

    # Combining the metrics
    combined_metrics = meeting_metrics.merge(
        interaction_metrics, on='meeting_number')

    return combined_metrics


online_meetings_metrics = calculate_team_meeting_metrics(online_meetings)
offline_meetings_metrics = calculate_team_meeting_metrics(offline_meetings)
online_meetings_metrics, offline_meetings_metrics

(   meeting_number  normalized_speech_frequency  count
 0               1                  6062.016807    233
 1               2                  5393.385827    244
 2               3                  5103.684211    383
 3               4                  5049.000000    213
 4               5                  5401.621622    254
 5               6                  5748.965517    155
 6               7                  5074.528302    226
 7              10                  5753.048780    677,
    meeting_number  normalized_speech_frequency  count
 0               8                  5616.617647    716
 1               9                  5627.333333   1296
 2              11                  6582.699387    913)

In [8]:
# Function to perform t-tests
def perform_ttest(group1, group2):
    ttest_results = {}
    ttest_results['normalized_speech_frequency'] = ttest_ind(
        group1['normalized_speech_frequency'], group2['normalized_speech_frequency'], equal_var=False)
    ttest_results['count'] = ttest_ind(
        group1['count'], group2['count'], equal_var=False)
    return ttest_results


def dataframe_generator(ttest_results, group1, group2):
    variables = ['normalized_speech_frequency', 'count']
    rows = []
    for var in variables:
        row_meeting = {
            'Variable': var,
            'Group': 'before_feedback',
            'Mean': group1[var].mean(),
            'Std': group1[var].std(),
            'df': len(group1[var]) - 1,
            't-statistic': ttest_results[var].statistic,
            'p-value': ttest_results[var].pvalue
        }
        row_chat = {
            'Variable': var,
            'Group': 'after_feedback',
            'Mean': group2[var].mean(),
            'Std': group2[var].std(),
            'df': len(group2[var]) - 1,
            't-statistic': '',
            'p-value': ''
        }
        rows.append(row_meeting)
        rows.append(row_chat)

    detailed_df = pd.DataFrame(rows)
    return detailed_df


team_ttest_results = perform_ttest(
    online_meetings_metrics, offline_meetings_metrics)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-1.425168225212563, pvalue=0.25808778677388095, df=2.7204207895569787),
 'count': TtestResult(statistic=-3.7582594126540707, pvalue=0.0450068408900011, df=2.4931279261188077)}

In [9]:
dataframe_generator(team_ttest_results,
                    online_meetings_metrics, offline_meetings_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,5448.281383,374.741843,7,-1.425168,0.258088
1,normalized_speech_frequency,after_feedback,5942.216789,554.700076,2,,
2,count,before_feedback,298.125,165.960785,7,-3.758259,0.045007
3,count,after_feedback,975.0,294.928805,2,,


In [13]:
# Function to calculate individual metrics adjusted for meeting count
def calculate_individual_metrics(meetings, meeting_count):
    # Removing duplicates by averaging values for each speaker per meeting
    unique_speech_frequencies = meetings.groupby(['meeting_number', 'speaker_id'])[
        'normalized_speech_frequency'].mean().reset_index()

    # Summing normalized speech frequencies per speaker
    individual_metrics = unique_speech_frequencies.groupby('speaker_id').agg({
        'normalized_speech_frequency': 'sum'
    }).reset_index()
    individual_metrics['normalized_speech_frequency'] /= meeting_count

    # Summing interaction counts per speaker
    interaction_metrics = meetings.groupby('speaker_id').agg({
        'count': 'sum'
    }).reset_index()

    # Removing self interactions
    self_interactions = meetings[meetings['speaker_id']
                                 == meetings['next_speaker_id']]
    total_self_interactions = self_interactions.groupby(
        'speaker_id')['count'].sum().reset_index()
    interaction_metrics = interaction_metrics.merge(
        total_self_interactions, on='speaker_id', how='left', suffixes=('', '_self'))
    interaction_metrics['count'] = interaction_metrics['count'] - \
        interaction_metrics['count_self'].fillna(0)
    interaction_metrics.drop(columns=['count_self'], inplace=True)
    interaction_metrics['count'] /= meeting_count

    # Combining the metrics
    combined_metrics = individual_metrics.merge(
        interaction_metrics, on='speaker_id')

    return combined_metrics


online_meetings_individual_metrics = calculate_individual_metrics(
    online_meetings, 8)
offline_meetings_individual_metrics = calculate_individual_metrics(
    offline_meetings, max(data_filtered['meeting_number'])-8)
online_meetings_individual_metrics, offline_meetings_individual_metrics

(   speaker_id  normalized_speech_frequency    count
 0           0                   660.785443   64.000
 1           1                   466.660374   48.875
 2           2                   216.573716   23.250
 3           3                  3796.700179  125.500
 4           4                   307.561672   36.500,
    speaker_id  normalized_speech_frequency   count
 0           0                  1006.886323  230.25
 1           1                   315.183282  117.00
 2           2                    95.046764   32.50
 3           3                  2709.330612  251.00
 4           4                   330.215611  100.50)

In [14]:
individual_ttest_results = perform_ttest(
    online_meetings_individual_metrics, offline_meetings_individual_metrics)


individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=0.23812109357362748, pvalue=0.8184365051496384, df=7.184296018080158),
 'count': TtestResult(statistic=-1.9310079869419285, pvalue=0.10664166856560967, df=5.443283088222623)}

In [15]:
dataframe_generator(individual_ttest_results,
                    online_meetings_individual_metrics, offline_meetings_individual_metrics)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,1089.656277,1522.669195,4,0.238121,0.818437
1,normalized_speech_frequency,after_feedback,891.332518,1072.303697,4,,
2,count,before_feedback,59.625,39.787325,4,-1.931008,0.106642
3,count,after_feedback,146.25,92.081825,4,,


In [19]:
grouped_online = online_meetings.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()
grouped_offline = offline_meetings.groupby(['speaker_id', 'meeting_number'])[
    'normalized_speech_frequency'].mean().reset_index()

In [20]:
grouped_online

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,1,652.436975
1,0,2,894.80315
2,0,3,1063.684211
3,0,4,148.5
4,0,5,719.459459
5,0,6,675.172414
6,0,7,99.056604
7,0,10,1033.170732
8,1,1,358.487395
9,1,2,499.370079


In [21]:
grouped_offline

Unnamed: 0,speaker_id,meeting_number,normalized_speech_frequency
0,0,8,1090.588235
1,0,9,1328.0
2,0,11,1608.957055
3,1,8,502.5
4,1,9,544.0
5,1,11,214.233129
6,2,8,210.882353
7,2,9,56.666667
8,2,11,112.638037
9,3,8,3542.205882


In [22]:
# 결과를 저장할 리스트 초기화
results = []

# 각 speaker_id에 대해 루프 실행
for speaker_id in range(num_meeting):
    online_meetings_speaker = grouped_online[grouped_online['speaker_id']
                                             == speaker_id]['normalized_speech_frequency']
    offline_meetings_speaker = grouped_offline[grouped_offline['speaker_id']
                                           == speaker_id]['normalized_speech_frequency']
    # t-test 수행
    t_statistic, p_value = ttest_ind(
        online_meetings_speaker, offline_meetings_speaker)

    mean_online = np.mean(online_meetings_speaker)
    std_online = np.std(online_meetings_speaker)
    mean_offline = np.mean(offline_meetings_speaker)
    std_offline = np.std(offline_meetings_speaker)

    df = len(online_meetings_speaker) + len(offline_meetings_speaker) - 2

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Online",
        "Mean": mean_online,
        "SD": std_online,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Offline",
        "Mean": mean_offline,
        "SD": std_offline,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0,Online,660.785443,342.318838,-2.917559,9.0,0.0171
1,0,Offline,1342.515097,211.871934,,,
2,1,Online,466.660374,178.826317,0.363399,9.0,0.724696
3,1,Offline,420.244376,146.653874,,,
4,2,Online,216.573716,99.999946,1.311244,9.0,0.222241
5,2,Offline,126.729019,63.741856,,,
6,3,Online,3796.700179,656.882282,0.399448,9.0,0.698876
7,3,Offline,3612.440816,492.050027,,,
8,4,Online,307.561672,133.903067,-1.259123,9.0,0.239661
9,4,Offline,440.287481,157.850228,,,


In [24]:
grouped_online = online_meetings.groupby(['speaker_id', 'meeting_number'])[
    'count'].sum().reset_index()
grouped_offline = offline_meetings.groupby(['speaker_id', 'meeting_number'])[
    'count'].sum().reset_index()

In [25]:
results = []

for speaker_id in range(num_meeting):
    online_meetings_speaker = grouped_online[grouped_online['speaker_id']
                                             == speaker_id]['count']
    offline_meetings_speaker = grouped_offline[grouped_offline['speaker_id']
                                           == speaker_id]['count']

    # t-test 독립표본 검정 수행
    t_statistic, p_value = ttest_ind(
        online_meetings_speaker, offline_meetings_speaker)

    # 평균과 표준편차 계산
    mean_online = np.mean(online_meetings_speaker)
    std_online = np.std(online_meetings_speaker)
    mean_offline = np.mean(offline_meetings_speaker)
    std_offline = np.std(offline_meetings_speaker)

    # 자유도 계산
    df = len(online_meetings_speaker) + len(offline_meetings_speaker) - 2

    # 결과 저장
    results.append({
        "Speaker ID": speaker_id,
        "Condition": "Online",
        "Mean": mean_online,
        "SD": std_online,
        "t-Statistic": t_statistic,
        "df": df,
        "p-Value": p_value
    })

    results.append({
        "Speaker ID": "",
        "Condition": "Offline",
        "Mean": mean_offline,
        "SD": std_offline,
        "t-Statistic": "",
        "df": "",
        "p-Value": ""
    })

# 데이터프레임 생성
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Speaker ID,Condition,Mean,SD,t-Statistic,df,p-Value
0,0.0,Online,67.75,53.63243,-5.859394,9.0,0.000241
1,,Offline,311.666667,60.598863,,,
2,1.0,Online,52.125,27.369863,-3.105685,9.0,0.012606
3,,Offline,156.0,72.970314,,,
4,2.0,Online,24.625,7.175261,-2.982666,9.0,0.015383
5,,Offline,43.333333,10.964589,,,
6,3.0,Online,466.875,517.084722,0.342927,9.0,0.739527
7,,Offline,353.333333,67.36138,,,
8,4.0,Online,47.5,37.070878,-2.354661,9.0,0.042976
9,,Offline,135.333333,73.775937,,,
