In [48]:
# Adjusting and running the full analysis as described
# Importing necessary libraries
import pandas as pd
from scipy.stats import ttest_ind

# Loading the dataset
file_path = 'data/dataset_collaboration_with_survey_scores.csv'
data = pd.read_csv(file_path)

# Project and meeting numbers that enter the comparison, grouped by format.
# Declared once so the combined filter and the two splits cannot drift apart.
PROJECT_ID = 4
ONLINE_MEETING_NUMBERS = [1, 2, 3, 4, 5, 6, 7]
OFFLINE_MEETING_NUMBERS = [8, 9, 11]

# Filtering the data for the specific project and meetings
data_filtered = data[
    (data['project'] == PROJECT_ID)
    & data['meeting_number'].isin(
        ONLINE_MEETING_NUMBERS + OFFLINE_MEETING_NUMBERS)
]

# Splitting the data into online (meetings 1-7) and offline (meetings 8, 9, 11)
online_meetings = data_filtered[
    data_filtered['meeting_number'].isin(ONLINE_MEETING_NUMBERS)]
offline_meetings = data_filtered[
    data_filtered['meeting_number'].isin(OFFLINE_MEETING_NUMBERS)]

In [49]:
# Function to calculate team metrics adjusted for duplicated data
def calculate_team_meeting_metrics(meetings):
    """Aggregate speech and interaction metrics to one row per meeting.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Must contain the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per meeting with the columns ``meeting_number``,
        ``normalized_speech_frequency`` (sum over speakers of each speaker's
        mean value within the meeting) and ``count`` (sum of ``count`` over
        rows whose ``speaker_id`` differs from ``next_speaker_id``).
    """
    # A speaker's frequency can appear on several rows of the same meeting;
    # averaging per (meeting, speaker) collapses those repeats to one value.
    per_speaker = (
        meetings
        .groupby(['meeting_number', 'speaker_id'])['normalized_speech_frequency']
        .mean()
        .reset_index()
    )
    speech_per_meeting = (
        per_speaker
        .groupby('meeting_number')['normalized_speech_frequency']
        .sum()
        .reset_index()
    )

    # Zero out self-interactions before summing instead of subtracting them
    # after a left merge: the result is numerically the same, but `count`
    # keeps its dtype even when some meeting has no self-interaction rows.
    is_self = meetings['speaker_id'] == meetings['next_speaker_id']
    interactions_per_meeting = (
        meetings
        .assign(count=meetings['count'].mask(is_self, 0))
        .groupby('meeting_number')['count']
        .sum()
        .reset_index()
    )

    return speech_per_meeting.merge(interactions_per_meeting, on='meeting_number')


# Per-meeting team metrics, computed for the online group first, then offline
online_team_metrics, offline_team_metrics = (
    calculate_team_meeting_metrics(meeting_group)
    for meeting_group in (online_meetings, offline_meetings)
)
online_team_metrics, offline_team_metrics

(   meeting_number  normalized_speech_frequency  count
 0               1                   101.033613    233
 1               2                    89.889764    244
 2               3                    85.061404    383
 3               4                    84.150000    213
 4               5                    90.027027    254
 5               6                    95.816092    155
 6               7                    84.575472    226,
    meeting_number  normalized_speech_frequency  count
 0               8                    93.610294    716
 1               9                    93.788889   1296
 2              11                   109.711656    913)

In [50]:
# Function to perform t-tests
def perform_ttest(group1, group2):
    """Compare two groups on speech frequency and interaction count.

    For each of the columns ``normalized_speech_frequency`` and ``count`` an
    independent two-sample t-test is run with ``equal_var=False`` (i.e. the
    group variances are not assumed to be equal).

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Frames that both contain the two metric columns.

    Returns
    -------
    dict
        Maps each column name to the result object returned by ``ttest_ind``.
    """
    metric_columns = ('normalized_speech_frequency', 'count')
    return {
        column: ttest_ind(group1[column], group2[column], equal_var=False)
        for column in metric_columns
    }


def _summary_row(variable, label, values, statistic, pvalue):
    """Build one summary-table row describing `values` for a single group."""
    return {
        'Variable': variable,
        'Group': label,
        'Mean': values.mean(),
        'Std': values.std(),
        # Per-group degrees of freedom (n - 1); this is not the test's own df.
        'df': len(values) - 1,
        't-statistic': statistic,
        'p-value': pvalue,
    }


def dataframe_generator(ttest_results, group1, group2,
                        variables=('normalized_speech_frequency', 'count'),
                        group_labels=('Online', 'Offline')):
    """Build a descriptive summary table for a two-group comparison.

    Parameters
    ----------
    ttest_results : dict
        Maps each variable name to an object exposing ``statistic`` and
        ``pvalue`` (e.g. the values returned by ``perform_ttest``).
    group1, group2 : pandas.DataFrame
        Frames containing a column for every name in ``variables``.
    variables : sequence of str, optional
        Columns to summarise. Defaults to the two metrics used in this
        notebook.
    group_labels : tuple of (str, str), optional
        Labels written to the ``Group`` column for ``group1`` and ``group2``.

    Returns
    -------
    pandas.DataFrame
        Two rows per variable (first group, then second group) with the
        columns ``Variable``, ``Group``, ``Mean``, ``Std``, ``df``,
        ``t-statistic`` and ``p-value``.
    """
    first_label, second_label = group_labels
    rows = []
    for var in variables:
        result = ttest_results[var]
        # The test statistics are shown once, on the first group's row; the
        # second row carries empty strings so those cells render blank.
        rows.append(_summary_row(
            var, first_label, group1[var], result.statistic, result.pvalue))
        rows.append(_summary_row(var, second_label, group2[var], '', ''))

    return pd.DataFrame(rows)


# Online vs. offline comparison of the per-meeting team metrics
team_ttest_results = perform_ttest(
    group1=online_team_metrics,
    group2=offline_team_metrics,
)
team_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-1.5297691602424754, pvalue=0.22795626906929248, df=2.857637349377727),
 'count': TtestResult(statistic=-4.243206216860626, pvalue=0.04725971358250978, df=2.0951543531545225)}

In [51]:
# Summary table (mean, std, df, test statistics) for the team-level comparison
dataframe_generator(
    ttest_results=team_ttest_results,
    group1=online_team_metrics,
    group2=offline_team_metrics,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,Online,90.079053,6.371479,6,-1.529769,0.227956
1,normalized_speech_frequency,Offline,99.036946,9.245001,2,,
2,count,Online,244.0,69.219458,6,-4.243206,0.04726
3,count,Offline,975.0,294.928805,2,,


In [52]:
# Function to calculate individual metrics adjusted for meeting count
def calculate_individual_metrics(meetings, meeting_count=None):
    """Compute per-speaker metrics averaged over a number of meetings.

    Parameters
    ----------
    meetings : pandas.DataFrame
        Must contain the columns ``meeting_number``, ``speaker_id``,
        ``next_speaker_id``, ``normalized_speech_frequency`` and ``count``.
        The frame is not modified.
    meeting_count : int, optional
        Divisor applied to every speaker's totals. When omitted, the number
        of distinct ``meeting_number`` values present in ``meetings`` is used;
        pass it explicitly if meetings without any rows should still count.

    Returns
    -------
    pandas.DataFrame
        One row per speaker with the columns ``speaker_id``,
        ``normalized_speech_frequency`` (sum of the speaker's per-meeting
        means divided by ``meeting_count``) and ``count`` (sum of ``count``
        over rows whose ``speaker_id`` differs from ``next_speaker_id``,
        divided by ``meeting_count``).
    """
    if meeting_count is None:
        meeting_count = meetings['meeting_number'].nunique()

    # A speaker's frequency can appear on several rows of the same meeting;
    # averaging per (meeting, speaker) collapses those repeats to one value.
    per_speaker_meeting = (
        meetings
        .groupby(['meeting_number', 'speaker_id'])['normalized_speech_frequency']
        .mean()
        .reset_index()
    )
    speech_per_speaker = (
        per_speaker_meeting
        .groupby('speaker_id')['normalized_speech_frequency']
        .sum()
        .div(meeting_count)
        .reset_index()
    )

    # Zero out self-interactions before summing; this equals "total minus
    # self-interactions" without the merge / fillna / in-place drop steps.
    is_self = meetings['speaker_id'] == meetings['next_speaker_id']
    interactions_per_speaker = (
        meetings
        .assign(count=meetings['count'].mask(is_self, 0))
        .groupby('speaker_id')['count']
        .sum()
        .div(meeting_count)
        .reset_index()
    )

    return speech_per_speaker.merge(interactions_per_speaker, on='speaker_id')


# Per-speaker averages; the divisors are the number of meetings in each group:
# 7 online meetings (1-7) and 3 offline meetings (8, 9, 11).
online_individual_metrics, offline_individual_metrics = (
    calculate_individual_metrics(meeting_group, n_meetings)
    for meeting_group, n_meetings in ((online_meetings, 7), (offline_meetings, 3))
)
online_individual_metrics, offline_individual_metrics

(   speaker_id  normalized_speech_frequency       count
 0           0                    10.126459   45.857143
 1           1                     7.474135   40.571429
 2           2                     3.778524   22.000000
 3           3                    63.973151  107.000000
 4           4                     4.726784   28.571429,
    speaker_id  normalized_speech_frequency       count
 0           0                    22.375252  307.000000
 1           1                     7.004073  156.000000
 2           2                     2.112150   43.333333
 3           3                    60.207347  334.666667
 4           4                     7.338125  134.000000)

In [53]:
# Online vs. offline comparison of the per-speaker averages.
# NOTE(review): the same speaker_ids appear in both groups, so a paired test
# may be more appropriate than an independent-samples test - confirm.
individual_ttest_results = perform_ttest(
    group1=online_individual_metrics,
    group2=offline_individual_metrics,
)
individual_ttest_results

{'normalized_speech_frequency': TtestResult(statistic=-0.11404132556857623, pvalue=0.9120326485485214, df=7.949487726880425),
 'count': TtestResult(statistic=-2.5667576193095156, pvalue=0.054283857620414454, df=4.605627527034095)}

In [54]:
# Summary table (mean, std, df, test statistics) for the speaker-level comparison
dataframe_generator(
    ttest_results=individual_ttest_results,
    group1=online_individual_metrics,
    group2=offline_individual_metrics,
)

Unnamed: 0,Variable,Group,Mean,Std,df,t-statistic,p-value
0,normalized_speech_frequency,Online,18.015811,25.810582,4,-0.114041,0.912033
1,normalized_speech_frequency,Offline,19.807389,23.828971,4,,
2,count,Online,48.8,33.878625,4,-2.566758,0.054284
3,count,Offline,195.0,122.775767,4,,
