In [1]:
# Adjusting and running the full analysis as described
# Importing necessary libraries
import pandas as pd
from scipy.stats import ttest_ind

# Loading the dataset
file_path = 'data/dataset_collaboration_with_survey_scores.csv'
data = pd.read_csv(file_path)

# Analysis parameters, named once here instead of being repeated as bare
# literals in the filters below.
PROJECT_ID = 4               # value of the 'project' column to analyse
FEEDBACK_START_MEETING = 10  # first meeting_number placed in the "after" group

# Filtering the data for the specific project and meetings
data_filtered = data[data['project'] == PROJECT_ID]

# Split the project's rows into the two periods that are compared later.
before_feedback = data_filtered[
    data_filtered['meeting_number'] < FEEDBACK_START_MEETING]
after_feedback = data_filtered[
    data_filtered['meeting_number'] >= FEEDBACK_START_MEETING]

In [2]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    """Build one row of team-level metrics per meeting.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Rows with the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``meeting_number``, ``normalized_speech_frequency`` and
        ``count``. The speech frequency is the sum over speakers of each
        speaker's mean value within the meeting; ``count`` is the meeting's
        total ``count`` minus the rows where a speaker is followed by
        themselves.
    """
    # A speaker can appear on several rows of the same meeting; average them
    # so each (meeting, speaker) pair contributes a single value.
    speech_per_speaker = (
        meetings
        .groupby(['meeting_number', 'speaker_id'])['normalized_speech_frequency']
        .mean()
        .reset_index()
    )
    speech_per_meeting = (
        speech_per_speaker
        .groupby('meeting_number')['normalized_speech_frequency']
        .sum()
        .reset_index()
    )

    # Total interaction count per meeting, self-interactions still included.
    total_counts = meetings.groupby('meeting_number')['count'].sum().reset_index()

    # Per-meeting total of rows where the speaker is also the next speaker.
    is_self_interaction = meetings['speaker_id'] == meetings['next_speaker_id']
    self_counts = (
        meetings[is_self_interaction]
        .groupby('meeting_number')['count']
        .sum()
        .reset_index()
        .rename(columns={'count': 'count_self'})
    )

    # Left merge keeps meetings without any self-interaction; their missing
    # self count is treated as zero before subtracting.
    interaction_counts = (
        total_counts
        .merge(self_counts, on='meeting_number', how='left')
        .assign(count=lambda frame: frame['count'] - frame['count_self'].fillna(0))
        .drop(columns=['count_self'])
    )

    return speech_per_meeting.merge(interaction_counts, on='meeting_number')


# Per-meeting team metrics for each period; the trailing tuple displays both.
before_feedback_metrics = calculate_team_meeting_metrics(
    meetings=before_feedback)
after_feedback_metrics = calculate_team_meeting_metrics(
    meetings=after_feedback)
before_feedback_metrics, after_feedback_metrics

(   meeting_number  normalized_speech_frequency  count
 0               1                  6062.016807    233
 1               2                  5393.385827    244
 2               3                  5103.684211    383
 3               4                  5049.000000    213
 4               5                  5401.621622    254
 5               6                  5748.965517    155
 6               7                  5074.528302    226
 7               8                  5616.617647    716
 8               9                  5627.333333   1296,
    meeting_number  normalized_speech_frequency  count
 0              10                  5753.048780    677
 1              11                  6582.699387    913
 2              12                  6640.740741    613)

In [3]:
# Function to perform t-tests
def perform_ttest(group1, group2,
                  variables=('normalized_speech_frequency', 'count')):
    """Compare two groups column by column with an unequal-variance t-test.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Frames that each contain every column named in ``variables``.
    variables : iterable of str, optional
        Columns to test. Defaults to the two metrics used in this notebook,
        so existing two-argument calls behave as before.

    Returns
    -------
    dict
        Maps each column name to the result of
        ``scipy.stats.ttest_ind(..., equal_var=False)`` for that column, in
        the order given by ``variables``.
    """
    return {
        var: ttest_ind(group1[var], group2[var], equal_var=False)
        for var in variables
    }


def dataframe_generator(ttest_results, group1, group2,
                        variables=('normalized_speech_frequency', 'count')):
    """Tabulate descriptive statistics and t-test results for two groups.

    Parameters
    ----------
    ttest_results : dict
        Maps each variable name to an object exposing ``statistic`` and
        ``pvalue`` (for example the output of ``perform_ttest``).
    group1, group2 : pandas.DataFrame
        Metric frames labelled ``before_feedback`` and ``after_feedback``
        respectively in the output.
    variables : iterable of str, optional
        Columns to report. Defaults to the two metrics used in this
        notebook, so existing three-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Two rows per variable (one per group) with the columns ``Variable``,
        ``Group``, ``Mean``, ``Std``, ``df``, ``t-statistic`` and
        ``p-value``. ``df`` is that group's own ``n - 1``; it is not the
        degrees of freedom of the t-test itself.
    """
    rows = []
    for var in variables:
        test = ttest_results[var]
        # One test covers both groups, so its statistic and p-value are
        # written on the first row only; the second row keeps empty strings.
        reported = (
            ('before_feedback', group1, test.statistic, test.pvalue),
            ('after_feedback', group2, '', ''),
        )
        for label, group, statistic, pvalue in reported:
            values = group[var]
            rows.append({
                'Variable': var,
                'Group': label,
                'Mean': values.mean(),
                'Std': values.std(),
                'df': len(values) - 1,
                't-statistic': statistic,
                'p-value': pvalue,
            })

    return pd.DataFrame(rows)


# Team-level comparison: per-meeting metrics before vs. after.
team_ttest_results = perform_ttest(
    group1=before_feedback_metrics,
    group2=after_feedback_metrics,
)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-2.8246992744316333, pvalue=0.07588783940455864, df=2.6765626220057124),
 'count': TtestResult(statistic=-2.0894459464989907, pvalue=0.06719149160185099, df=8.729655284837744)}

In [4]:
# Summary table for the team-level comparison.
dataframe_generator(
    ttest_results=team_ttest_results,
    group1=before_feedback_metrics,
    group2=after_feedback_metrics,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,5453.017029,344.661398,8,-2.824699,0.075888
1,normalized_speech_frequency,after_feedback,6325.496303,496.602783,2,,
2,count,before_feedback,413.333333,370.838914,8,-2.089446,0.067191
3,count,after_feedback,734.333333,158.004219,2,,


In [5]:
# Function to calculate individual metrics adjusted for meeting count
def calculate_individual_metrics(meetings, meeting_count):
    """Build one row of per-meeting-average metrics for each speaker.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Rows with the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.
    meeting_count : int or float
        Divisor applied to both per-speaker totals.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker_id``, ``normalized_speech_frequency`` and
        ``count``. The speech frequency is the speaker's per-meeting mean
        values summed over meetings; ``count`` is the speaker's total
        ``count`` minus the rows where they are followed by themselves.
        Both are divided by ``meeting_count``.
    """
    # A speaker can appear on several rows of the same meeting; average them
    # so each (meeting, speaker) pair contributes a single value.
    speech_per_meeting = (
        meetings
        .groupby(['meeting_number', 'speaker_id'])['normalized_speech_frequency']
        .mean()
        .reset_index()
    )
    speech_per_speaker = (
        speech_per_meeting
        .groupby('speaker_id')['normalized_speech_frequency']
        .sum()
        .reset_index()
    )
    speech_per_speaker['normalized_speech_frequency'] = (
        speech_per_speaker['normalized_speech_frequency'] / meeting_count
    )

    # Total interaction count per speaker, self-interactions still included.
    total_counts = meetings.groupby('speaker_id')['count'].sum().reset_index()

    # Per-speaker total of rows where the speaker is also the next speaker.
    is_self_interaction = meetings['speaker_id'] == meetings['next_speaker_id']
    self_counts = (
        meetings[is_self_interaction]
        .groupby('speaker_id')['count']
        .sum()
        .reset_index()
        .rename(columns={'count': 'count_self'})
    )

    # Left merge keeps speakers without any self-interaction; their missing
    # self count is treated as zero before subtracting and averaging.
    interaction_counts = (
        total_counts
        .merge(self_counts, on='speaker_id', how='left')
        .assign(count=lambda frame: (
            frame['count'] - frame['count_self'].fillna(0)) / meeting_count)
        .drop(columns=['count_self'])
    )

    return speech_per_speaker.merge(interaction_counts, on='speaker_id')


# Average over the number of distinct meetings actually present in each
# period, taken from the data itself. This avoids repeating the split
# threshold as a literal and does not assume meeting numbers are contiguous.
before_feedback_individual_metrics = calculate_individual_metrics(
    before_feedback, before_feedback['meeting_number'].nunique())
after_feedback_individual_metrics = calculate_individual_metrics(
    after_feedback, after_feedback['meeting_number'].nunique())
before_feedback_individual_metrics, after_feedback_individual_metrics

(   speaker_id  normalized_speech_frequency       count
 0           0                   741.300116  101.444444
 1           1                   465.070739   74.777778
 2           2                   206.058777   26.111111
 3           3                  3717.658807  157.555556
 4           4                   322.928591   53.444444,
    speaker_id  normalized_speech_frequency       count
 0           0                  1107.869756  208.666667
 1           1                   306.990688   72.666667
 2           2                   136.206054   38.333333
 3           3                  3833.380920  278.000000
 4           4                   330.431600   79.666667
 5           5                   610.617284   57.000000)

In [6]:
# Speaker-level comparison: per-speaker averages before vs. after.
individual_ttest_results = perform_ttest(
    group1=before_feedback_individual_metrics,
    group2=after_feedback_individual_metrics,
)
individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=0.04149390678518426, pvalue=0.9678669605492061, df=8.440886699594564),
 'count': TtestResult(statistic=-0.8708927773011449, pvalue=0.4100736547594944, df=7.720556688412689)}

In [7]:
# Summary table for the speaker-level comparison.
dataframe_generator(
    ttest_results=individual_ttest_results,
    group1=before_feedback_individual_metrics,
    group2=after_feedback_individual_metrics,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,before_feedback,1090.603406,1482.098834,4,0.041494,0.967867
1,normalized_speech_frequency,after_feedback,1054.249384,1403.450603,5,,
2,count,before_feedback,82.666667,50.188779,4,-0.870893,0.410074
3,count,after_feedback,122.388889,97.259656,5,,
