In [1]:
import pandas as pd
import re
from datetime import datetime, timedelta
from networkx import betweenness_centrality, closeness_centrality, eigenvector_centrality
from collections import defaultdict

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import re
# Path of the exported KakaoTalk transcript, relative to the notebook's
# working directory.
file_path = 'transcriptions/kakao/kakaotalk_conversation.txt'

# Read the whole transcript into memory as a single UTF-8 string; the
# context manager closes the file handle as soon as the read finishes.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()


def extract_data(data):
    """Parse transcript text into ``[date, speaker, text, word_count]`` rows.

    A message is any stretch of text matching
    ``<YYYY. M. D>. <오전|오후> <H:MM>, Speaker SPEAKER_NN : <text>``.
    Text that does not match the pattern is ignored, and because ``.``
    does not match a newline the captured text stops at the end of the line.

    Args:
        data: Full transcript as a single string.

    Returns:
        A list of 4-element lists: the date as ``'YYYY-MM-DD'``, the speaker
        label without the ``'Speaker '`` prefix, the message text, and the
        number of whitespace-separated tokens in that text. The time of day
        is matched but not kept.
    """
    message_re = re.compile(
        r'(\d{4}\. \d{1,2}\. \d{1,2})\. (오전|오후) \d{1,2}:\d{2}, (Speaker SPEAKER_\d{2}) : (.+)')

    rows = []
    for raw_date, _meridiem, raw_speaker, message in message_re.findall(data):
        # Normalise '2024. 5. 21' to the ISO form '2024-05-21'.
        iso_date = datetime.strptime(
            raw_date, '%Y. %m. %d').strftime('%Y-%m-%d')
        rows.append([
            iso_date,
            raw_speaker.replace('Speaker ', ''),
            message,
            len(message.split()),
        ])
    return rows


# Parse the transcript, load the message rows into a DataFrame, and convert
# the 'YYYY-MM-DD' date strings to pandas datetimes in the same step.
extracted_data = extract_data(data)
df = pd.DataFrame(
    extracted_data,
    columns=['Date', 'Speaker', 'Text', 'word_count'],
).assign(Date=lambda frame: pd.to_datetime(frame['Date']))


def assign_meeting_number(date):
    """Map a ``'YYYY-MM-DD'`` date string to its meeting number.

    Dates are compared as strings, which orders correctly only for
    zero-padded ISO dates.

    Args:
        date: Date formatted as ``'YYYY-MM-DD'``.

    Returns:
        The meeting number (1-17) whose date window contains ``date``, or
        ``None`` when ``date`` is earlier than the first window.
    """
    # (first day, last day, meeting number); both bounds are inclusive.
    meeting_windows = (
        ('2024-05-21', '2024-05-24', 1),
        ('2024-05-25', '2024-05-27', 2),
        ('2024-05-28', '2024-05-28', 3),
        ('2024-05-29', '2024-05-29', 4),
        ('2024-05-30', '2024-05-30', 5),
        ('2024-05-31', '2024-05-31', 6),
        ('2024-06-01', '2024-06-10', 7),
        ('2024-06-11', '2024-06-12', 8),
        ('2024-06-13', '2024-06-14', 9),
        ('2024-06-15', '2024-06-17', 10),
        ('2024-06-18', '2024-06-19', 11),
        ('2024-06-20', '2024-06-21', 12),
        # This window was previously written as `'2024-06-22' == date <= ...`,
        # so 2024-06-23 matched no branch and returned None.
        ('2024-06-22', '2024-06-23', 13),
        ('2024-06-24', '2024-06-24', 14),
        ('2024-06-25', '2024-06-26', 15),
        ('2024-06-27', '2024-06-28', 16),
    )
    for first_day, last_day, number in meeting_windows:
        if first_day <= date <= last_day:
            return number

    # The last meeting is open-ended: everything from 2024-06-29 onwards.
    if '2024-06-29' <= date:
        return 17
    return None


# Tag every message with the meeting whose date window it falls into.
# assign_meeting_number compares 'YYYY-MM-DD' strings, so format the
# datetimes first and then map each string to its meeting number.
df['meeting_number'] = (
    df['Date'].dt.strftime('%Y-%m-%d').apply(assign_meeting_number)
)

print(df)

           Date     Speaker  \
0    2024-05-21  SPEAKER_03   
1    2024-05-21  SPEAKER_03   
2    2024-05-21  SPEAKER_01   
3    2024-05-21  SPEAKER_00   
4    2024-05-21  SPEAKER_04   
...         ...         ...   
4744 2024-06-29  SPEAKER_03   
4745 2024-06-29  SPEAKER_03   
4746 2024-06-29  SPEAKER_03   
4747 2024-06-29  SPEAKER_03   
4748 2024-06-29  SPEAKER_01   

                                                   Text  word_count  \
0                                             여러분 반가워요!           2   
1                                   준영님 노션에 번호 올려주세요 ㅋㅋ           5   
2                                              안녕하세요 !            2   
3                                                반갑습니다~           1   
4                                          아 넵! 올렸습니다!            3   
...                                                 ...         ...   
4744                                                 사진           1   
4745                                                 사진    

In [3]:


def create_dataset(df, project_number):
    """Build one row of speech statistics per (meeting, speaker).

    Args:
        df: Message-level frame with the columns ``meeting_number``,
            ``Speaker`` (labels of the form ``SPEAKER_NN``), ``word_count``
            and ``Date`` (datetimes). NOTE: the ``meeting_number`` column
            is cast to ``int`` in place on the caller's frame, as the
            original implementation did.
        project_number: Identifier stored in the ``project`` column and
            used as the prefix of each row ``id``.

    Returns:
        A DataFrame with the columns ``id``, ``project``,
        ``meeting_number``, ``speaker_number``, ``speech_frequency`` (total
        words by that speaker in the meeting), ``total_words`` (total words
        in the meeting) and ``duration`` (number of calendar days spanned
        by the meeting's messages, inclusive, times 24 — presumably hours).

    Raises:
        Whatever ``astype('int')`` raises when ``meeting_number`` holds
        values that cannot be converted to int (e.g. NaN).
    """
    # Cast BEFORE aggregating. The cast used to run after the group-bys, so
    # a float meeting_number column produced float keys and ids such as
    # '4_1.0_SPEAKER_00'.
    df['meeting_number'] = df['meeting_number'].astype('int')

    dataset = []
    speaker_word_counts = df.groupby(['meeting_number', 'Speaker'])[
        'word_count'].sum().to_dict()
    total_words = df.groupby('meeting_number')['word_count'].sum().to_dict()
    meeting_dates = df.groupby('meeting_number')['Date'].agg([
        'min', 'max']).to_dict('index')

    for (meeting_number, speaker), word_count in speaker_word_counts.items():
        start_date = meeting_dates[meeting_number]['min']
        end_date = meeting_dates[meeting_number]['max']
        # Inclusive day span: a meeting whose messages all fall on one day
        # gets a duration of 24.
        duration = ((end_date - start_date).days + 1) * 24

        dataset.append({
            'id': f'{project_number}_{meeting_number}_{speaker}',
            'project': project_number,
            'meeting_number': meeting_number,
            'speaker_number': int(speaker.split('_')[1]),
            'speech_frequency': word_count,
            'total_words': total_words[meeting_number],
            'duration': duration
        })
    return pd.DataFrame(dataset)


# Per-speaker speech statistics for project 4, ordered by meeting and then
# by speaker so the table reads chronologically.
dataset_project4 = create_dataset(df, 4).sort_values(
    by=['meeting_number', 'speaker_number'])

dataset_project4

Unnamed: 0,id,project,meeting_number,speaker_number,speech_frequency,total_words,duration
0,4_1_SPEAKER_00,4,1,0,1,22,24
1,4_1_SPEAKER_01,4,1,1,2,22,24
2,4_1_SPEAKER_02,4,1,2,2,22,24
3,4_1_SPEAKER_03,4,1,3,7,22,24
4,4_1_SPEAKER_04,4,1,4,10,22,24
...,...,...,...,...,...,...,...
67,4_16_SPEAKER_04,4,16,4,531,3640,48
68,4_17_SPEAKER_01,4,17,1,4,200,24
69,4_17_SPEAKER_02,4,17,2,3,200,24
70,4_17_SPEAKER_03,4,17,3,182,200,24


In [4]:
# Words spoken per unit of meeting duration (speech_frequency / duration).
dataset_project4['normalized_speech_frequency'] = (
    dataset_project4['speech_frequency'].div(dataset_project4['duration'])
)

In [5]:


def compute_interaction_frequency(df, project_number):
    """Count speaker-to-next-speaker transitions within each meeting.

    Within a meeting, every message contributes one transition from its
    speaker to the speaker of the following message. The final message of
    a meeting has no successor and is counted as a transition from its
    speaker to the same speaker.
    NOTE(review): that last-message rule adds one self-transition per
    meeting; confirm it is intended.

    Args:
        df: Message-level frame with ``meeting_number`` and ``Speaker``
            (labels of the form ``SPEAKER_NN``), in conversation order.
        project_number: Value stored in the ``project`` column.

    Returns:
        A DataFrame with one row per observed (meeting, speaker, next
        speaker) combination and the columns ``project``,
        ``meeting_number``, ``speaker_id``, ``next_speaker_id``, ``count``.
        Rows are grouped by meeting, then by speaker in order of first
        appearance.
    """
    def speaker_index(label):
        # 'SPEAKER_03' -> 3
        return int(label.split('_')[1])

    interaction_records = []
    for meeting_number in df['meeting_number'].unique():
        speakers = df.loc[
            df['meeting_number'] == meeting_number, 'Speaker'].tolist()
        # Successor of each message; the last message is paired with itself.
        followers = speakers[1:] + speakers[-1:]

        # Nested plain dicts keep first-seen ordering for both levels.
        transitions = {}
        for current, following in zip(speakers, followers):
            per_speaker = transitions.setdefault(current, {})
            per_speaker[following] = per_speaker.get(following, 0) + 1

        interaction_records.extend(
            {
                'project': project_number,
                'meeting_number': meeting_number,
                'speaker_id': speaker_index(current),
                'next_speaker_id': speaker_index(following),
                'count': count
            }
            for current, per_speaker in transitions.items()
            for following, count in per_speaker.items()
        )
    return pd.DataFrame(interaction_records)


# Observed speaker -> next-speaker transition counts for project 4.
interaction_frequency_df = compute_interaction_frequency(df, 4)
# Display only: the sorted view is not assigned back, so
# interaction_frequency_df keeps its original row order for later cells.
interaction_frequency_df.sort_values(
    by=['meeting_number', 'speaker_id', 'next_speaker_id'])

Unnamed: 0,project,meeting_number,speaker_id,next_speaker_id,count
3,4,1,0,4,1
2,4,1,1,0,1
6,4,1,2,2,1
1,4,1,3,1,1
0,4,1,3,3,1
...,...,...,...,...,...
285,4,17,3,1,2
284,4,17,3,2,1
282,4,17,3,3,18
283,4,17,3,4,4


In [6]:
def generate_all_pairs(interaction_records, dataset):
    """List the speaker pairs that have no observed interaction.

    For every (project, meeting) in ``dataset`` and every ordered pair of
    speakers in that meeting (self-pairs included), a row with ``count``
    0 is produced unless ``interaction_records`` already contains that
    (project, meeting, speaker, next speaker) combination.

    Args:
        interaction_records: Frame with the columns ``project``,
            ``meeting_number``, ``speaker_id`` and ``next_speaker_id``.
        dataset: Frame with the columns ``project``, ``meeting_number``
            and ``speaker_number``.

    Returns:
        A DataFrame with the columns ``project``, ``meeting_number``,
        ``speaker_id``, ``next_speaker_id`` and ``count`` (always 0); an
        empty DataFrame when every pair is already present.
    """
    # Collect the observed combinations once. The previous implementation
    # filtered the whole frame with a four-way boolean mask for every
    # single pair.
    observed = set(zip(
        interaction_records['project'],
        interaction_records['meeting_number'],
        interaction_records['speaker_id'],
        interaction_records['next_speaker_id'],
    ))

    all_pairs = []
    for (project, meeting), group in dataset.groupby(['project', 'meeting_number']):
        speakers = group['speaker_number'].unique()
        for speaker1 in speakers:
            for speaker2 in speakers:
                if (project, meeting, speaker1, speaker2) in observed:
                    continue
                all_pairs.append({
                    'project': project,
                    'meeting_number': meeting,
                    'speaker_id': speaker1,
                    'next_speaker_id': speaker2,
                    'count': 0
                })
    return pd.DataFrame(all_pairs)


# Zero-count rows for every speaker pair that never interacted.
all_pairs = generate_all_pairs(interaction_frequency_df, dataset_project4)

# Observed transitions plus the zero-count fillers, in a stable order.
interaction_records = (
    pd.concat([interaction_frequency_df, all_pairs], ignore_index=True)
    .sort_values(
        by=['project', 'meeting_number', 'speaker_id', 'next_speaker_id'])
    .reset_index(drop=True)
)

In [7]:
# Attach every outgoing pair row to its speaker's statistics row, so each
# speaker appears once per possible next speaker.
combined_dataset = dataset_project4.merge(
    interaction_records,
    how='left',
    left_on=['project', 'meeting_number', 'speaker_number'],
    right_on=['project', 'meeting_number', 'speaker_id'],
)
# A speaker without any pair row would get NaN from the left join; treat
# that as zero interactions.
combined_dataset['count'] = combined_dataset['count'].fillna(0).astype(int)

In [8]:
# Preview the merged table (speaker statistics joined with pair counts).
combined_dataset

Unnamed: 0,id,project,meeting_number,speaker_number,speech_frequency,total_words,duration,normalized_speech_frequency,speaker_id,next_speaker_id,count
0,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,0,0
1,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,1,0
2,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,2,0
3,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,3,0
4,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...
345,4_17_SPEAKER_03,4,17,3,182,200,24,7.583333,3,4,4
346,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,1,0
347,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,2,0
348,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,3,4


## Compute Network Density


In [9]:
import networkx as nx

# Hard-coded meeting numbers (4 and 5 are absent from the list).
# NOTE(review): `meeting` is not referenced by any other visible cell; a
# later cell defines `meeting_numbers` with the same values. Confirm whether
# this variable is still needed.
meeting = [1,  2,  3, 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]


def compute_density(G):
    """Return the unweighted density of a directed interaction graph.

    Density is the number of edges between two different nodes that carry
    a positive ``weight``, divided by ``n * (n - 1)``, the number of
    ordered node pairs. Self-loops are not counted.

    Args:
        G: Graph supporting ``len(G)`` and ``G.edges(data=True)`` whose
            edge data contains a ``'weight'`` entry.

    Returns:
        The density as a float, or ``0`` for graphs with fewer than two
        nodes.
    """
    node_count = len(G)
    if node_count < 2:
        return 0

    linked_pairs = 0
    for source, target, attrs in G.edges(data=True):
        if source != target and attrs['weight'] > 0:
            linked_pairs += 1

    # Ordered pairs of distinct nodes in a directed graph.
    return linked_pairs / (node_count * (node_count - 1))


def weighted_density(G):
    """Return the weighted density of a directed interaction graph.

    The value is the sum of all edge weights divided by
    ``n * (n - 1) * max_weight``, where ``max_weight`` is the largest edge
    weight in the graph. Self-loops are included both in the sum and in
    the maximum, while the pair count ``n * (n - 1)`` excludes them.
    NOTE(review): because of that mismatch the result can exceed 1;
    restricting both aggregates to ``u != v`` edges is the alternative.

    Args:
        G: Graph supporting ``len(G)`` and ``G.edges(data=True)`` whose
            edge data contains a ``'weight'`` entry.

    Returns:
        The weighted density, or ``0`` when the graph has no nodes, no
        edges, or only a single node.
    """
    num_nodes = len(G)
    if num_nodes == 0:
        return 0

    weights = [data['weight'] for _, _, data in G.edges(data=True)]
    # A graph with nodes but no edges used to crash in max() with
    # "max() arg is an empty sequence"; it simply has zero density.
    if not weights:
        return 0

    possible_edges = num_nodes * (num_nodes - 1)  # For directed graph
    if possible_edges == 0:
        return 0
    return sum(weights) / (possible_edges * max(weights))


def create_interaction_graphs(df):
    """Build one symmetrised interaction graph per meeting.

    Every row with a positive ``count`` adds that count to the edge from
    the speaker to the next speaker and to the reverse edge, so for two
    different speakers both directions end up with the sum of the counts
    observed in either direction. For a self-transition the second step
    re-creates the self-loop with ``weight=count`` rather than adding to
    it, so a self-loop is not doubled.

    Args:
        df: Frame with the columns ``meeting_number``, ``speaker_id``,
            ``next_speaker_id`` and ``count``.

    Returns:
        A list of ``nx.DiGraph`` objects, one per value of
        ``df['meeting_number'].unique()`` and in that order. Nodes are
        labelled ``SPEAKER_NN``; a meeting without positive counts yields
        an empty graph.
    """
    graphs = []
    for meeting_number in df['meeting_number'].unique():
        graph = nx.DiGraph()
        rows = df[df['meeting_number'] == meeting_number]
        for position in range(len(rows)):
            row = rows.iloc[position]
            # Node labels are rebuilt from the integer ids, zero-padded.
            source = f"SPEAKER_{int(row['speaker_id']):02d}"
            target = f"SPEAKER_{int(row['next_speaker_id']):02d}"
            weight = row['count']
            if not weight > 0:
                continue

            # Forward direction: accumulate onto an existing edge.
            if graph.has_edge(source, target):
                graph[source][target]['weight'] += weight
            else:
                graph.add_edge(source, target, weight=weight)

            # Reverse direction: a self-loop or a missing edge is (re)set to
            # this row's count, an existing reverse edge is incremented.
            if source == target or not graph.has_edge(target, source):
                graph.add_edge(target, source, weight=weight)
            else:
                graph[target][source]['weight'] += weight

        graphs.append(graph)
    return graphs


# One graph per meeting, in the order of
# interaction_frequency_df['meeting_number'].unique().
interaction_graphs = create_interaction_graphs(interaction_frequency_df)
densities = [compute_density(G) for G in interaction_graphs]
# NOTE(review): this rebinds the name `weighted_density` from the function to
# a list of results. Re-running this cell without re-running the function
# definition fails because a list is not callable. A later cell indexes
# `weighted_density[i]`, so renaming it requires updating that cell as well.
weighted_density = [weighted_density(G) for G in interaction_graphs]
# Only the last expression of a cell is displayed, so this bare `densities`
# line produces no output.
densities
interaction_graphs

[<networkx.classes.digraph.DiGraph at 0x208d6f308e0>,
 <networkx.classes.digraph.DiGraph at 0x208f75d6950>,
 <networkx.classes.digraph.DiGraph at 0x208f75d6470>,
 <networkx.classes.digraph.DiGraph at 0x208f75d6290>,
 <networkx.classes.digraph.DiGraph at 0x208f75d5ab0>,
 <networkx.classes.digraph.DiGraph at 0x208f75d6110>,
 <networkx.classes.digraph.DiGraph at 0x208f75f9fc0>,
 <networkx.classes.digraph.DiGraph at 0x208f75fb070>,
 <networkx.classes.digraph.DiGraph at 0x208f75f9a50>,
 <networkx.classes.digraph.DiGraph at 0x208f75f8f70>,
 <networkx.classes.digraph.DiGraph at 0x208f75fa200>,
 <networkx.classes.digraph.DiGraph at 0x208f75f9f60>,
 <networkx.classes.digraph.DiGraph at 0x208f75f9ba0>,
 <networkx.classes.digraph.DiGraph at 0x208f75f8f10>,
 <networkx.classes.digraph.DiGraph at 0x208f75f8a30>]

## Compute Centralities


In [10]:
# Define centrality measures function


def compute_centralities(G):
    """Compute normalised centrality scores for every node of ``G``.

    Each raw score (weighted degree, in-degree, out-degree, betweenness,
    closeness, eigenvector centrality and PageRank) is divided by
    ``n * (n - 1) * max_weight``, where ``max_weight`` is the largest
    weight among edges between two different nodes.

    NOTE(review): betweenness and closeness are given the ``weight``
    attribute as a path length, so a larger interaction count is treated
    as a longer distance — confirm this is the intended reading.

    Args:
        G: Directed graph whose edges carry a ``'weight'`` attribute.

    Returns:
        A dict mapping each measure name to a ``{node: score}`` dict. The
        inner dicts are empty for an empty graph. When the normaliser is
        zero (a single node, or a graph whose only edges are self-loops)
        every node gets ``0.0`` for every measure; this case previously
        raised ``ValueError`` from ``max()`` on an empty sequence.
    """
    measure_names = (
        'degree_centrality',
        'indegree_centrality',
        'outdegree_centrality',
        'betweenness_centrality',
        'closeness_centrality',
        'eigenvector_centrality',
        'pagerank',
    )
    if len(G) == 0:
        return {name: {} for name in measure_names}

    num_nodes = len(G)
    possible_edges = num_nodes * (num_nodes - 1)
    off_diagonal_weights = [
        data['weight'] for u, v, data in G.edges(data=True) if u != v]
    max_weight = max(off_diagonal_weights, default=0)
    normalizer = possible_edges * max_weight
    if normalizer == 0:
        # Nothing to normalise against: report zeros instead of crashing.
        return {name: {node: 0.0 for node in G.nodes()}
                for name in measure_names}

    try:
        eigenvector_cent = eigenvector_centrality(
            G, max_iter=2000, weight='weight')
    except nx.PowerIterationFailedConvergence:
        # Power iteration did not converge within max_iter; fall back to 0.
        eigenvector_cent = {node: 0 for node in G.nodes()}

    raw_scores = {
        'degree_centrality': dict(G.degree(weight='weight')),
        'indegree_centrality': dict(G.in_degree(weight='weight')),
        'outdegree_centrality': dict(G.out_degree(weight='weight')),
        'betweenness_centrality': betweenness_centrality(G, weight='weight'),
        'closeness_centrality': closeness_centrality(G, distance='weight'),
        'eigenvector_centrality': eigenvector_cent,
        'pagerank': nx.pagerank(G, weight='weight'),
    }
    centralities = {
        name: {node: score / normalizer for node, score in scores.items()}
        for name, scores in raw_scores.items()
    }
    return centralities


# One centrality dict per meeting graph, in the same order as
# interaction_graphs.
centralities = [compute_centralities(G) for G in interaction_graphs]
centralities

[{'degree_centrality': {'SPEAKER_03': 0.2,
   'SPEAKER_01': 0.2,
   'SPEAKER_00': 0.2,
   'SPEAKER_04': 0.3,
   'SPEAKER_02': 0.2},
  'indegree_centrality': {'SPEAKER_03': 0.1,
   'SPEAKER_01': 0.1,
   'SPEAKER_00': 0.1,
   'SPEAKER_04': 0.15,
   'SPEAKER_02': 0.1},
  'outdegree_centrality': {'SPEAKER_03': 0.1,
   'SPEAKER_01': 0.1,
   'SPEAKER_00': 0.1,
   'SPEAKER_04': 0.15,
   'SPEAKER_02': 0.1},
  'betweenness_centrality': {'SPEAKER_03': 0.0,
   'SPEAKER_01': 0.025,
   'SPEAKER_00': 0.03333333333333333,
   'SPEAKER_04': 0.025,
   'SPEAKER_02': 0.0},
  'closeness_centrality': {'SPEAKER_03': 0.02,
   'SPEAKER_01': 0.02857142857142857,
   'SPEAKER_00': 0.03333333333333333,
   'SPEAKER_04': 0.02857142857142857,
   'SPEAKER_02': 0.02},
  'eigenvector_centrality': {'SPEAKER_03': 0.009511362441813855,
   'SPEAKER_01': 0.012700876483011589,
   'SPEAKER_00': 0.020149595906435287,
   'SPEAKER_04': 0.03435561714858308,
   'SPEAKER_02': 0.025727554804354923},
  'pagerank': {'SPEAKER_03': 0.009

## Add Centralities and Network Density to Combined Dataset


In [11]:
centrality_columns = ['degree_centrality', 'indegree_centrality', 'outdegree_centrality',
                      'betweenness_centrality', 'closeness_centrality', 'eigenvector_centrality', 'pagerank']

# Initialise the new columns as floats. Starting from the integer 0 and then
# writing float scores into them relies on implicit upcasting, which newer
# pandas releases report as a FutureWarning (silenced at the top of this
# notebook).
for centrality_measure in centrality_columns:
    combined_dataset[centrality_measure] = 0.0
combined_dataset['network_density'] = 0.0
combined_dataset['weighted_network_density'] = 0.0


# Kept unchanged because later cells index this list.
meeting_numbers = [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

# The graphs (and therefore `centralities`, `densities` and the
# `weighted_density` list) were built by iterating
# interaction_frequency_df['meeting_number'].unique(), so take the meeting
# labels from that same sequence instead of trusting a hand-typed list.
graph_meeting_numbers = interaction_frequency_df['meeting_number'].unique().tolist()
assert len(graph_meeting_numbers) == len(centralities) == len(densities) == len(weighted_density), \
    'per-meeting results are out of sync with interaction_frequency_df'

# The loop variable is deliberately NOT called `centralities`: rebinding that
# name would replace the list with its last element.
for meeting_number, meeting_centralities, density, weighted_density_value in zip(
        graph_meeting_numbers, centralities, densities, weighted_density):
    in_meeting = (
        (combined_dataset['project'] == 4) &
        (combined_dataset['meeting_number'] == meeting_number)
    )
    for centrality_measure, centrality_values in meeting_centralities.items():
        for node, value in centrality_values.items():
            # Node labels look like 'SPEAKER_03'; match on the numeric part.
            is_speaker = combined_dataset['speaker_number'] == int(
                node.split('_')[1])
            combined_dataset.loc[in_meeting & is_speaker,
                                 centrality_measure] = value
    combined_dataset.loc[in_meeting, 'network_density'] = density
    combined_dataset.loc[in_meeting,
                         'weighted_network_density'] = weighted_density_value


combined_dataset

Unnamed: 0,id,project,meeting_number,speaker_number,speech_frequency,total_words,duration,normalized_speech_frequency,speaker_id,next_speaker_id,count,degree_centrality,indegree_centrality,outdegree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality,pagerank,network_density,weighted_network_density
0,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,0,0,0.200000,0.100000,0.100000,0.033333,0.033333,0.020150,0.009161,0.4,0.550000
1,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,1,0,0.200000,0.100000,0.100000,0.033333,0.033333,0.020150,0.009161,0.4,0.550000
2,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,2,0,0.200000,0.100000,0.100000,0.033333,0.033333,0.020150,0.009161,0.4,0.550000
3,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,3,0,0.200000,0.100000,0.100000,0.033333,0.033333,0.020150,0.009161,0.4,0.550000
4,4_1_SPEAKER_00,4,1,0,1,22,24,0.041667,0,4,1,0.200000,0.100000,0.100000,0.033333,0.033333,0.020150,0.009161,0.4,0.550000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,4_17_SPEAKER_03,4,17,3,182,200,24,7.583333,3,4,4,0.645833,0.322917,0.322917,0.010417,0.002404,0.009634,0.006465,0.5,0.212963
346,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,1,0,0.166667,0.083333,0.083333,0.000000,0.001078,0.003567,0.001809,0.5,0.212963
347,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,2,0,0.166667,0.083333,0.083333,0.000000,0.001078,0.003567,0.001809,0.5,0.212963
348,4_17_SPEAKER_04,4,17,4,11,200,24,0.458333,4,3,4,0.166667,0.083333,0.083333,0.000000,0.001078,0.003567,0.001809,0.5,0.212963


## Compute Gini Coefficient


In [12]:

import numpy as np


def gini_coefficient(x):
    """Return the Gini coefficient of a sequence of numbers.

    The input is copied into a float array, shifted so that its minimum is
    zero when it contains negative values, offset by ``1e-7`` so no entry
    is exactly zero, and sorted. The coefficient is then
    ``sum((2*i - n - 1) * x_i) / (n * sum(x))`` with ranks ``i`` running
    from 1 to ``n``.

    Args:
        x: Non-empty sequence of numbers.

    Returns:
        The Gini coefficient as a float.
    """
    values = np.array(x, dtype=np.float64)
    lowest = np.amin(values)
    if lowest < 0:
        # Shift so that the smallest value becomes zero.
        values = values - lowest
    # The tiny offset keeps the denominator away from zero.
    values = np.sort(values + 0.0000001)

    n = values.shape[0]
    ranks = np.arange(1, n + 1)
    numerator = np.sum((2 * ranks - n - 1) * values)
    return numerator / (n * np.sum(values))


def compute_gini(df):
    """Return one Gini coefficient per meeting.

    For every meeting the total ``count`` each speaker directs at other
    speakers is computed (rows where ``speaker_id`` equals
    ``next_speaker_id`` are left out) and the Gini coefficient of those
    totals is taken.

    Args:
        df: Frame with the columns ``meeting_number``, ``speaker_id``,
            ``next_speaker_id`` and ``count``.

    Returns:
        A list of coefficients, ordered like
        ``df['meeting_number'].unique()``.
    """
    def outgoing_totals(meeting_rows):
        # Rows that describe a transition to a different speaker.
        to_other = meeting_rows['speaker_id'] != meeting_rows['next_speaker_id']
        return [
            meeting_rows.loc[
                (meeting_rows['speaker_id'] == speaker) & to_other,
                'count'].sum()
            for speaker in meeting_rows['speaker_id'].unique()
        ]

    return [
        gini_coefficient(
            outgoing_totals(df[df['meeting_number'] == meeting_number]))
        for meeting_number in df['meeting_number'].unique()
    ]


gini_values = compute_gini(combined_dataset)
# Float column from the start: the coefficients written below are floats,
# and writing them into an integer column relies on implicit upcasting.
combined_dataset['gini_coefficient'] = 0.0

# compute_gini returns its values in the order of
# combined_dataset['meeting_number'].unique(); pair them with exactly that
# sequence rather than with a hand-maintained list of meeting numbers.
gini_meeting_numbers = combined_dataset['meeting_number'].unique()
assert len(gini_meeting_numbers) == len(gini_values)

for meeting_number, gini_value in zip(gini_meeting_numbers, gini_values):
    combined_dataset.loc[
        (combined_dataset['project'] == 4) &
        (combined_dataset['meeting_number'] == meeting_number),
        'gini_coefficient'] = gini_value

## Compute Interaction Equality Index


In [13]:
# Define Interaction Equality Index function
def interaction_equality_index(x):
    """Return ``1 - std(x) / mean(x)`` for a sequence of numbers.

    The standard deviation is NumPy's default (population) one. A value of
    1 means all entries are equal; larger spread relative to the mean
    lowers the index.

    Args:
        x: Sequence of numbers.

    Returns:
        The index as a float, or ``0`` when the mean of ``x`` is zero.
    """
    values = np.array(x, dtype=np.float64)
    average = np.mean(values)
    if average == 0:
        # The ratio is undefined without a mean; report 0 by convention.
        return 0
    coefficient_of_variation = np.std(values) / average
    return 1 - coefficient_of_variation

# Compute Interaction Equality Index for each meeting


def compute_equality_index(df):
    """Return one interaction equality index per meeting.

    For every meeting the total ``count`` each speaker directs at other
    speakers is computed (rows where ``speaker_number`` equals
    ``next_speaker_id`` are left out) and the equality index of those
    totals is taken.

    Args:
        df: Frame with the columns ``meeting_number``, ``speaker_number``,
            ``next_speaker_id`` and ``count``.

    Returns:
        A list of index values, ordered like
        ``df['meeting_number'].unique()``.
    """
    def outgoing_totals(meeting_rows):
        # Rows that describe a transition to a different speaker.
        to_other = (
            meeting_rows['speaker_number'] != meeting_rows['next_speaker_id'])
        return [
            meeting_rows.loc[
                (meeting_rows['speaker_number'] == speaker) & to_other,
                'count'].sum()
            for speaker in meeting_rows['speaker_number'].unique()
        ]

    return [
        interaction_equality_index(
            outgoing_totals(df[df['meeting_number'] == meeting_number]))
        for meeting_number in df['meeting_number'].unique()
    ]


equality_index = compute_equality_index(combined_dataset)
# Float column from the start: the index values written below are floats,
# and writing them into an integer column relies on implicit upcasting.
combined_dataset['interaction_equality_index'] = 0.0

# compute_equality_index returns its values in the order of
# combined_dataset['meeting_number'].unique(); pair them with exactly that
# sequence rather than with a hand-maintained list of meeting numbers.
equality_meeting_numbers = combined_dataset['meeting_number'].unique()
assert len(equality_meeting_numbers) == len(equality_index)

# The loop variable is deliberately NOT called `equality_index`: rebinding
# that name would replace the list with its last element.
for meeting_number, equality_index_value in zip(equality_meeting_numbers, equality_index):
    combined_dataset.loc[
        (combined_dataset['project'] == 4) &
        (combined_dataset['meeting_number'] == meeting_number),
        'interaction_equality_index'] = equality_index_value

## Save Updated Combined Dataset to CSV


In [14]:
# Reorder columns
import os


# Final column layout of the exported table.
columns_order = [
    # identifiers and per-speaker speech volume
    'id', 'project', 'meeting_number', 'speaker_number',
    'speech_frequency', 'total_words', 'duration',
    'normalized_speech_frequency',
    # directed speaker pair and its transition count
    'speaker_id', 'next_speaker_id', 'count',
    # per-meeting network measures
    'network_density', 'weighted_network_density',
    'gini_coefficient', 'interaction_equality_index',
    # per-speaker centralities
    'degree_centrality', 'indegree_centrality', 'outdegree_centrality',
    'betweenness_centrality', 'closeness_centrality',
    'eigenvector_centrality', 'pagerank',
]
combined_dataset = combined_dataset[columns_order]

# Write the result next to the notebook, creating the folder on first use.
os.makedirs('data', exist_ok=True)
combined_dataset.to_csv('data/kakao_data.csv', index=False)