In [2]:
import dominant_sets
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import random
import matplotlib.cm as cm
import os
from collections import defaultdict, Counter
import re
from sklearn.cluster import KMeans
import json

In [3]:
# Only consider children so far
children_ids = [
    "DS_STARFISH_2223_27", "DS_STARFISH_2223_28", "DS_STARFISH_2223_29",
    "DS_STARFISH_2223_30", "DS_STARFISH_2223_31", "DS_STARFISH_2223_32",
    "DS_STARFISH_2223_33", "DS_STARFISH_2223_41", "DS_STARFISH_2223_42",
    "DS_STARFISH_2223_43", "DS_STARFISH_2223_44", "DS_STARFISH_2223_45",
    "DS_STARFISH_2223_46"
]

teacher_ids = [
    "DS_STARFISH_2223_T1", "DS_STARFISH_2223_T4", "DS_STARFISH_2223_T3",
    "DS_STARFISH_2223_T10", "DS_STARFISH_2223_Lab1", "DS_STARFISH_2223_Lab2"
]

def filter_teachers(row):
    if row['SUBJECTID'] in children_ids:
        return True
    elif row['SUBJECTID'] in teacher_ids:
        return False
    return True

In [4]:
# # filter bad time
# def filter_badtime(df):
#     filtered_rows = df.copy()
#     filtered_rows = filtered_rows[filtered_rows['TIME'] > pd.Timestamp("2023/1/30  9:35:00")] #playground
#     filtered_rows = filtered_rows[filtered_rows['TIME'] > pd.Timestamp("2023/1/30  9:47:00")] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 10:20:00")) & (filtered_rows['TIME'] <= pd.Timestamp("2023/1/30 10:27:00")))] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 10:33:00")) & (filtered_rows['TIME'] <= pd.Timestamp("2023/1/30 10:37:00")))] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 11:30:00")) & (filtered_rows['TIME'] <= pd.Timestamp("2023/1/30 11:32:00")))] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 11:38:00")) & (filtered_rows['TIME'] <= pd.Timestamp("2023/1/30 11:41:00")))] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 11:45:00")) & (filtered_rows['TIME'] < pd.Timestamp("2023/1/30 12:02:00")))] #toilet
#     filtered_rows = filtered_rows[~((filtered_rows['TIME'] >= pd.Timestamp("2023/1/30 12:19:00")) & (filtered_rows['TIME'] <= pd.Timestamp("2023/1/30 12:26:00")))] #toilet
#     filtered_rows = filtered_rows[filtered_rows['TIME'] < pd.Timestamp("2023/1/30  12:27:00")] #toilet
    
#     return filtered_rows


# Filter bad time (updated to use only with time and without date)
def filter_badtime(df):
    # Extract the time component from the datetime
    df['TimeOnly'] = df['TIME'].dt.time
    
    # Define the time intervals to filter
    time_intervals = [
        (pd.Timestamp("2023/1/30 08:55:00").time(), pd.Timestamp("2023/1/30 09:47:00").time()),
        (pd.Timestamp("2023/1/30 10:20:00").time(), pd.Timestamp("2023/1/30 10:27:00").time()),
        (pd.Timestamp("2023/1/30 10:33:00").time(), pd.Timestamp("2023/1/30 10:37:00").time()),
        (pd.Timestamp("2023/1/30 11:30:00").time(), pd.Timestamp("2023/1/30 11:32:00").time()),
        (pd.Timestamp("2023/1/30 11:38:00").time(), pd.Timestamp("2023/1/30 11:41:00").time()),
        (pd.Timestamp("2023/1/30 11:45:00").time(), pd.Timestamp("2023/1/30 12:02:00").time()),
        (pd.Timestamp("2023/1/30 12:19:00").time(), pd.Timestamp("2023/1/30 12:26:00").time())
    ]
    
    filtered_rows = df.copy()
    
    filtered_rows = filtered_rows[filtered_rows['TimeOnly'] > pd.Timestamp("2023/1/30 08:55:00").time()]
    
    for start, end in time_intervals:
        filtered_rows = filtered_rows[~((filtered_rows['TimeOnly'] >= start) & (filtered_rows['TimeOnly'] <= end))]
    
    filtered_rows = filtered_rows[filtered_rows['TimeOnly'] < pd.Timestamp("2023/1/30 12:27:00").time()]
    
    return filtered_rows

In [17]:
# build the dict: coordinates of each child at each time
def build_coords_dict(filtered_rows, version, day, method):

    # Step 1: Create a MultiIndex DataFrame
    pivot_df = filtered_rows.pivot_table(
        index='TIME', 
        columns='SUBJECTID', 
        values=['KC_X', 'KC_Y', 'KC_O'],
        aggfunc='first'
    )

    # # Step 2: Fill missing values with large_value
    # pivot_df = pivot_df.fillna(large_value)

    # Step 3: Flatten the MultiIndex columns
    pivot_df.columns = [f'{subject}_{feature}' for feature, subject in pivot_df.columns]

    # Step 3: Sort the columns to get the desired order
    sorted_columns = sorted(pivot_df.columns, key=lambda x: (x.split('_')[0], x.split('_')[1]))
    pivot_df = pivot_df[sorted_columns]


    # Step 4: Reset index to make TIME a column again
    pivot_df.reset_index(inplace=True)

    csv_file = "fformations_each_version/coord/" + method + "_coordinates_" + str(day) + "_" + version + ".csv"
    pivot_df.to_csv(csv_file, index=False)

    # Display the first few rows of the DataFrame
    print("\nCoordinates of each child for each time stamp:")
    print(pivot_df.head())
    
    # transfer the dataframe to dict
    result_dict = {
    row['TIME']: {
        subject: [
            row.get(f'{subject}_KC_X', np.nan),
            row.get(f'{subject}_KC_Y', np.nan),
            row.get(f'{subject}_KC_O', np.nan)
        ]
        for subject in set(col.split('_')[0] for col in sorted_columns)
    }
    for _, row in pivot_df.iterrows()
    }
    
    # dict_file = "fformations_each_version/coord/" + method + "_coordinates_" + str(day) + "_" + version + ".json"
    
    # with open(dict_file, 'w') as f:
    #     json.dump(result_dict, f)
    return pivot_df, result_dict

In [6]:

def affinity_calculation(data_dict, sigma = 1.0, method = 'P', dataset = "starfish"):
    affinity_matrices = {}
    valid_members = {}

    for time, subjects in data_dict.items():
        if dataset == "starfish":
        # Filter out subjects with coordinates [NaN, NaN]
            valid_subjects = {sub_id: coords for sub_id, coords in subjects.items() if not (np.isnan(coords[0]) or np.isnan(coords[1]) or np.isnan(coords[2]))}
            subject_ids = list(valid_subjects.keys())
            coords = np.array(list(valid_subjects.values()))
            positions = coords[:,:2]
            orientations = coords[:,2]
            num_subjects = len(subject_ids)
            valid_members[time] = subject_ids
        elif dataset == "idiap":
            valid_members[time] = subjects
            positions = np.array(subjects)
            num_subjects = len(positions)
            
        else:
            print(f"Doesn't support {dataset}!")
            break
            
        affinity_matrix = np.zeros((num_subjects, num_subjects))
        
        if method == "P":
            for i in range(num_subjects):
                for j in range(num_subjects):
                    # dont set affinity[i][i] to 1, otherwise it would be dominent
                    if i != j:
                        d_ij = np.linalg.norm(positions[i] - positions[j])
                        affinity_matrix[i, j] = np.exp(-d_ij / (2 * sigma**2))
        elif method == "PO":
            for i in range(num_subjects):
                for j in range(num_subjects):
                    if i != j:
                        d_q = np.linalg.norm(positions[i] - positions[j])
                        vector_ij = positions[j] - positions[i]
                        alpha_ij = np.arctan2(vector_ij[1], vector_ij[0])
                        vector_ji = positions[i] - positions[j]
                        alpha_ji = np.arctan2(vector_ji[1], vector_ji[0])
                        
                        theta_i = np.radians(orientations[i])
                        theta_j = np.radians(orientations[j])
                    
                        # Calculate Aori_ij1 and Aori_ij2
                        Aori_ij1 = np.exp(-d_q / (2 * sigma**2)) if -np.pi/2 <= theta_i - alpha_ij <= np.pi/2 else 0
                        Aori_ij2 = np.exp(-d_q / (2 * sigma**2)) if -np.pi/2 <= theta_j - alpha_ji <= np.pi/2 else 0

                        # Final affinity matrix element
                        affinity_matrix[i, j] = min(Aori_ij1, Aori_ij2)
        affinity_matrices[time] = affinity_matrix
    
    return affinity_matrices, valid_members

In [7]:
def fformation_extraction(affinities, members, interval = 10):
    all_time_keys = list(affinities.keys())
    selected_indices = list(range(0, len(all_time_keys), interval))
    selected_time_keys = [all_time_keys[i] for i in selected_indices]

    fformations = {}

    for time_key in selected_time_keys:
        print(time_key)
        matrix = affinities[time_key]
        raw_formations = dominant_sets.dominant_set_extraction(matrix, len(members[time_key]))
        print("All members at the scene", members[time_key])
        
        fformations[time_key] = {}
        for i, fformation in enumerate(raw_formations):
            group_members = [members[time_key][j] for j, in_group in enumerate(fformation) if in_group]
            fformations[time_key][i] = group_members
            print(f"Group {i}: {group_members}")
        print("\n")
    
    return fformations

In [8]:
def encode_groups(fformations):
    global group_encoding
    global group_id_counter
    
    # encode groups formed over all days
    for time_key, groups in fformations.items():
        for group_id, members in groups.items():
            members_tuple = tuple(sorted(members))  # Use sorted tuple of members to ensure uniqueness
            if members_tuple not in group_encoding:
                group_encoding[members_tuple] = group_id_counter
                group_id_counter += 1

In [9]:
def find_dominant_groups(group_encoding, fformations):
    # fformations is a dictionary where the key is a time_key and the value is another dictionary 
    # with group ID as the key and list of members as the value.


    # Count Pairwise Member Appearances in Groups
    pairwise_count = defaultdict(lambda: defaultdict(int))

    for time_key, groups in fformations.items():
        for group_id, members in groups.items():
            members_tuple = tuple(sorted(members))
            encoded_group_id = group_encoding[members_tuple]
            for i in range(len(members)):
                for j in range(i + 1, len(members)):
                    pair = (members[i], members[j])
                    pairwise_count[pair][encoded_group_id] += 1

    # Step 3: Identify Dominant Group for Each Pair
    dominant_group_for_pair = {}

    for pair, group_counts in pairwise_count.items():
        dominant_group_id = max(group_counts, key=group_counts.get)
        dominant_group_for_pair[pair] = dominant_group_id

    # Output the dominant groups for each member pair
    for pair, group_id in dominant_group_for_pair.items():
        original_group = [k for k, v in group_encoding.items() if v == group_id][0]
        print(f"Member Pair {pair}: Dominant Group Encoding {group_id}, Members in Dominant Group: {original_group}")
    return dominant_group_for_pair
        

In [10]:

def add_group_info(dominant_group_for_pair, csv_file, date):
    # Load the CSV file into a DataFrame
    sc_df = pd.read_csv(csv_file)
    # date = date.strftime("%Y/%m/%d").replace('/0', '/')
    sc_df['Date'] = pd.to_datetime(sc_df['Date']).dt.date
    sc_df = sc_df[sc_df['Date']==date]

    # Assuming 'dominant_group_for_pair' is a dictionary where keys are tuples of (member pairs, date)
    # and values are the corresponding dominant group encodings

    # Define a function to find the dominant group for a pair and date
    def find_dominant_group(subject, partner):
        subject = str(subject)
        partner = str(partner)
        pair = (subject, partner) if (subject, partner) in dominant_group_for_pair else (partner, subject)
        return dominant_group_for_pair.get(pair, "No dominant group")

# Apply the function to each row in the DataFrame
    sc_df['Group'] = sc_df.apply(lambda row: find_dominant_group(row['Subject'], row['Partner']), axis=1)
    
    return sc_df

In [11]:
# Function to extract date from filename
def extract_date_from_filename(file_pattern, filename):
    match = file_pattern.search(filename)
    if match:
        date_str = match.group(1)
        return pd.to_datetime(date_str, format='%m%d%y').date()
    return None


In [12]:
def count_frequent_formations(fformations, top_k=None):
    formation_counter = Counter()
    
    # Aggregate all groups into the counter
    for time_key in fformations:
        for group_key in fformations[time_key]:
            formation = tuple(sorted(fformations[time_key][group_key]))  # Use sorted tuple as hashable key
            formation_counter[formation] += 1
    
    # Convert Counter to a dict and sort by frequency
    frequent_formations = dict(formation_counter)
    
    if top_k:
        # Get top_k frequent formations
        frequent_formations = dict(sorted(frequent_formations.items(), key=lambda item: item[1], reverse=True)[:top_k])
    else:
        # Sort by frequency if no top_k specified
        frequent_formations = dict(sorted(frequent_formations.items(), key=lambda item: item[1], reverse=True))
        
    print(f"top {top_k} most frequent groups")
    print(frequent_formations)
    
    return frequent_formations

F-Formation extraction: Main

In [13]:
version = "3"

if version == "3":
    directory_path = "D:\OneDrive - Delft University of Technology\Thesis\DEBBIE_STARFISH_2223\Version 3_ Kalman_Fixed_Data + Overlap Filter\Synched_Data_GR0_2to2_ANGLE45\COTALK"
    file_pattern = re.compile(r'DAYCOTALK_(\d{6})_COTALK0_22_DEN_072924_V21588968411.CSV') #3
    
elif version == "2":
    directory_path = "D:\OneDrive - Delft University of Technology\Thesis\DEBBIE_STARFISH_2223\Version 2_Kalman_Fixed_Data\Synched_Data_GR0_2to2_ANGLE45\COTALK"
    file_pattern = re.compile(r'DAYCOTALK_(\d{6})_COTALK0_22_DEN_072224_V22072780461.CSV') # 2
    
else:
    print("FILE NOT FOUND")
    
files = [f for f in os.listdir(directory_path) if file_pattern.match(f)]
files, len(files)

(['DAYCOTALK_013023_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_020123_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_031323_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_031523_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_041723_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_041923_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_061523_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_101922_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_102122_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_111422_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_111622_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_120522_COTALK0_22_DEN_072924_V21588968411.CSV',
  'DAYCOTALK_120722_COTALK0_22_DEN_072924_V21588968411.CSV'],
 13)

In [1]:
# Global group encoding and counter
group_encoding = {}  # To store unique encoding for each group across all days
group_id_counter = 0

# parameters for f-formations extractions
method = "PO"
sigma = 1.0
dataset = "starfish"
interval = 10 # 1s

ranked_fformations = []
all_fformations = {}

day = 0
for file in files:
    
    # read file
    file_path = os.path.join(directory_path, file)
    date = extract_date_from_filename(file_pattern, file)
    print(f"----------File {date} being processed")
    df = pd.read_csv(file_path).drop(columns=['VOCCHNCHF_LENAKF'])
    df['TIME'] = pd.to_datetime(df['TIME'])
    df['Date'] = date
    
    # covert angels to make them relative to +x axis
    df['KC_O'] = 90 - df['KC_O']
    df['KC_O'] = (df['KC_O'] + 180) % 360 - 180
    
    # process children ids
    df = df[df.apply(filter_teachers, axis=1)]
    df['SUBJECTID'] = df['SUBJECTID'].str.extract(r'(\d+)$')
    all_children = df['SUBJECTID'].unique()
    all_children_num = len(all_children)
    
    # filter bad time and entried out of the classroom range
    after_df = filter_badtime(df)
    filtered_rows = after_df[(after_df['KC_X'] <= 16) & (after_df['KC_Y'] <= 16)]
    
    # extract f-formations
    _, result_dict = build_coords_dict(filtered_rows, version= version, day= day, method=method)
    # affinities, members = affinity_calculation(result_dict, sigma = sigma, method = method, dataset = dataset)
    # fformations = fformation_extraction(affinities, members, interval = interval)
    # all_fformations[day] = fformations
    day += 1
    
    # select top k most frequent groups in a day, and return a ranked list
    top_k = None
    # means select all groups
    fre_result = count_frequent_formations(fformations)
    ranked_fformations.append(fre_result)
    
    # encode group ids
    encode_groups(fformations=fformations)
    
    # # identify pairwise dominant groups
    # might need to go out the loop 
    # dominant_group_for_pair = find_dominant_groups(group_encoding, fformations)
    # csv_file = 'final_social_contact_ratios_with_diagnosis_copy.csv'
    # partial_df = add_group_info(dominant_group_for_pair, csv_file, date)
    # all_fformations.append(partial_df)
    
    # Combine all results into a single DataFrame
# final_results_df = pd.concat(all_fformations, ignore_index=True)

# # Save or process final_results_df as needed
# save_file =  method + '_'+ 'grouped_final_social_contact_ratios_with_diagnosis_copy.csv'
# final_results_df.to_csv(save_file, index=False)
# print(final_results_df.head())

# Convert tuple keys to string keys
converted_group_encoding = {json.dumps(k): v for k, v in group_encoding.items()}

save_json = method + '_' + "Group_encoding.json"
# Save the converted dictionary to a JSON file
with open(save_json, 'w') as json_file:
    json.dump(converted_group_encoding, json_file, indent=4)

print("Dictionary successfully saved")
print(ranked_fformations)

SyntaxError: invalid syntax (2065477242.py, line 47)

Extend dydas to groups

Select the n most frequent group over each day

In [26]:
all_fformations

{1: {Timestamp('2023-01-30 09:47:00.100000'): {0: ['32', '29'],
   1: ['46', '28'],
   2: ['41', '33']},
  Timestamp('2023-01-30 09:47:01.100000'): {0: ['32', '33', '29'],
   1: ['46', '28']},
  Timestamp('2023-01-30 09:47:02.100000'): {0: ['32', '33', '29'],
   1: ['46', '28']},
  Timestamp('2023-01-30 09:47:03.100000'): {0: ['41', '33'], 1: ['31', '46']},
  Timestamp('2023-01-30 09:47:04.100000'): {0: ['41', '33'], 1: ['31', '46']},
  Timestamp('2023-01-30 09:47:05.100000'): {0: ['32', '33', '29'],
   1: ['46', '28']},
  Timestamp('2023-01-30 09:47:06.100000'): {0: ['32', '33'], 1: ['27', '28']},
  Timestamp('2023-01-30 09:47:07.100000'): {0: ['31', '46'], 1: ['33', '29']},
  Timestamp('2023-01-30 09:47:08.100000'): {0: ['33', '29'], 1: ['31', '46']},
  Timestamp('2023-01-30 09:47:09.100000'): {0: ['32', '33', '29'],
   1: ['46', '28']},
  Timestamp('2023-01-30 09:47:10.100000'): {0: ['45', '32'], 1: ['46', '28']},
  Timestamp('2023-01-30 09:47:11.100000'): {0: ['44', '33', '29'],
  

In [27]:
# save the json file
import json
from pandas import Timestamp

# Helper function to convert Timestamp keys to strings recursively
def convert_timestamps(obj):
    if isinstance(obj, dict):
        return {str(k) if isinstance(k, Timestamp) else k: convert_timestamps(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_timestamps(i) for i in obj]
    else:
        return obj

# Convert Timestamps in the dictionary to strings
converted_results = convert_timestamps(all_fformations)

# Specify the file path
file_path = method + "_" + 'f-formations_psecond.json'

# Save the converted dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(converted_results, json_file, indent=4)


In [28]:
ranked_fformations[0]

{('28', '46'): 907,
 ('32', '45'): 563,
 ('30', '46'): 487,
 ('27', '44'): 484,
 ('31', '33'): 461,
 ('33', '44'): 406,
 ('45', '46'): 349,
 ('29', '32'): 320,
 ('33', '41'): 310,
 ('28', '30'): 307,
 ('32', '44'): 305,
 ('27', '28'): 291,
 ('30', '31'): 284,
 ('27', '46'): 273,
 ('29', '44'): 271,
 ('29', '30'): 268,
 ('32', '41'): 241,
 ('27', '33'): 234,
 ('31', '46'): 218,
 ('29', '33'): 214,
 ('32', '33'): 213,
 ('29', '45'): 213,
 ('41', '44'): 203,
 ('41', '45'): 202,
 ('27', '41'): 199,
 ('31', '45'): 197,
 ('41', '46'): 188,
 ('28', '31'): 187,
 ('30', '32'): 186,
 ('28', '44'): 185,
 ('32', '46'): 177,
 ('28', '32'): 175,
 ('29', '41'): 174,
 ('31', '41'): 171,
 ('30', '44'): 171,
 ('29', '31'): 158,
 ('31', '44'): 156,
 ('27', '31'): 152,
 ('31', '32'): 146,
 ('28', '29'): 142,
 ('33', '46'): 141,
 ('27', '30'): 140,
 ('30', '45'): 139,
 ('27', '32'): 134,
 ('33', '45'): 130,
 ('27', '33', '44'): 129,
 ('30', '41'): 125,
 ('28', '33'): 122,
 ('28', '41'): 118,
 ('27', '45'):

In [30]:
fre_f = {}
for i, fformation_aday in enumerate(ranked_fformations):
    # n = len(fformation_aday)
    n = 50
    first_n_items = list(fformation_aday.items())[:n]
    # Convert back to a dictionary with string keys for JSON serialization
    first_n_dict = {str(key): value for key, value in first_n_items}
    fre_f[i] = first_n_dict

print(fre_f)

def save_to_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

# Specify the filename
filename = method + "_" + 'aggregated_frequent_formations.json'

# Save the data
save_to_json(fre_f, filename)

{0: {"('28', '46')": 907, "('32', '45')": 563, "('30', '46')": 487, "('27', '44')": 484, "('31', '33')": 461, "('33', '44')": 406, "('45', '46')": 349, "('29', '32')": 320, "('33', '41')": 310, "('28', '30')": 307, "('32', '44')": 305, "('27', '28')": 291, "('30', '31')": 284, "('27', '46')": 273, "('29', '44')": 271, "('29', '30')": 268, "('32', '41')": 241, "('27', '33')": 234, "('31', '46')": 218, "('29', '33')": 214, "('32', '33')": 213, "('29', '45')": 213, "('41', '44')": 203, "('41', '45')": 202, "('27', '41')": 199, "('31', '45')": 197, "('41', '46')": 188, "('28', '31')": 187, "('30', '32')": 186, "('28', '44')": 185, "('32', '46')": 177, "('28', '32')": 175, "('29', '41')": 174, "('31', '41')": 171, "('30', '44')": 171, "('29', '31')": 158, "('31', '44')": 156, "('27', '31')": 152, "('31', '32')": 146, "('28', '29')": 142, "('33', '46')": 141, "('27', '30')": 140, "('30', '45')": 139, "('27', '32')": 134, "('33', '45')": 130, "('27', '33', '44')": 129, "('30', '41')": 125, "(

In [31]:
# The correct one
map_HL = {    
    '27': 'HL',
    '28': 'HL',
    '29': 'HL',
    '30': 'HL',
    '31': 'HL',
    '32': 'TH',
    '33': 'HL',
    '41': 'TH',
    '42': 'TH',
    '43': 'TH',
    '44': 'TH',
    '45': 'TH',
    '46': 'TH'
}

In [32]:
def load_from_json(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

# Specify the filename
filename = method + "_" + 'aggregated_frequent_formations.json'

# Recover the fre_f data from the JSON file
fre_f = load_from_json(filename)

# Print the recovered data
print(fre_f)

{'0': {"('28', '46')": 907, "('32', '45')": 563, "('30', '46')": 487, "('27', '44')": 484, "('31', '33')": 461, "('33', '44')": 406, "('45', '46')": 349, "('29', '32')": 320, "('33', '41')": 310, "('28', '30')": 307, "('32', '44')": 305, "('27', '28')": 291, "('30', '31')": 284, "('27', '46')": 273, "('29', '44')": 271, "('29', '30')": 268, "('32', '41')": 241, "('27', '33')": 234, "('31', '46')": 218, "('29', '33')": 214, "('32', '33')": 213, "('29', '45')": 213, "('41', '44')": 203, "('41', '45')": 202, "('27', '41')": 199, "('31', '45')": 197, "('41', '46')": 188, "('28', '31')": 187, "('30', '32')": 186, "('28', '44')": 185, "('32', '46')": 177, "('28', '32')": 175, "('29', '41')": 174, "('31', '41')": 171, "('30', '44')": 171, "('29', '31')": 158, "('31', '44')": 156, "('27', '31')": 152, "('31', '32')": 146, "('28', '29')": 142, "('33', '46')": 141, "('27', '30')": 140, "('30', '45')": 139, "('27', '32')": 134, "('33', '45')": 130, "('27', '33', '44')": 129, "('30', '41')": 125, 

In [33]:
def calculate_majority_condition_ratio(group, map_HL):
    condition_counts = {}
    for subject in group:
        condition = map_HL.get(subject, 'Unknown')
        condition_counts[condition] = condition_counts.get(condition, 0) + 1

    majority_condition_count = max(condition_counts.values())
    return majority_condition_count / len(group)

# Create a list to store data for the DataFrame
homo_data = []

# Process each day and group
for day, groups in fre_f.items():
    for group_str, count in groups.items():
        group = eval(group_str)
        homophily_degree = calculate_majority_condition_ratio(group, map_HL)
        homo_data.append((day, group_str, homophily_degree))

# Create a DataFrame
df_homo = pd.DataFrame(homo_data, columns=['day', 'group', 'homophily_degree'])
df_homo

Unnamed: 0,day,group,homophily_degree
0,0,"('28', '46')",0.500000
1,0,"('32', '45')",1.000000
2,0,"('30', '46')",0.500000
3,0,"('27', '44')",0.500000
4,0,"('31', '33')",1.000000
...,...,...,...
645,12,"('33', '44')",0.500000
646,12,"('27', '33')",1.000000
647,12,"('42', '44', '46')",1.000000
648,12,"('33', '41')",0.500000


Later is for another abandoned analysis 

In [8]:
# Directory containing the files
# Function to filter bad time based on timedelta
import networkx as nx

def filter_badtime(df):
    df = df.copy()
    df.rename(columns={' Interaction Time': 'Interaction Time'}, inplace=True)  # Remove leading space
    df['Interaction Time'] = pd.to_timedelta(df['Interaction Time'])
    df = df[df['Interaction Time'] > pd.to_timedelta("9:35:00")]  # playground
    df = df[df['Interaction Time'] > pd.to_timedelta("9:47:00")]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("10:20:00")) & (df['Interaction Time'] <= pd.to_timedelta("10:27:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("10:33:00")) & (df['Interaction Time'] <= pd.to_timedelta("10:37:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:30:00")) & (df['Interaction Time'] <= pd.to_timedelta("11:32:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:38:00")) & (df['Interaction Time'] <= pd.to_timedelta("11:41:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:45:00")) & (df['Interaction Time'] < pd.to_timedelta("12:02:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("12:19:00")) & (df['Interaction Time'] <= pd.to_timedelta("12:26:00")))]  # toilet
    df = df[df['Interaction Time'] < pd.to_timedelta("12:27:00")]  # toilet
    return df


directory_path = r"d:\OneDrive - Delft University of Technology\Thesis\DEBBIE_STARFISH_2223\Synched_Data_GR0_22_DEN\PAIRANGLES (to be uploaded by Laura)"

# File pattern to match
file_pattern = re.compile(r'DAILY_ANGLES(\d{6})_GR0_22_DEN_010224_V21201895560.CSV')

# Columns to delete
columns_to_delete = [' Angle1', ' Angle2', ' Leftx', 'Lefty', 'Rightx', 'Righty',
                     ' Leftx2', 'Lefty2', 'Rightx2', 'Righty2', 
                     ' Gender1', ' Gender2', ' WasTalking1', ' WasTalking2 ']


# Function to extract date from filename
def extract_date_from_filename(filename):
    match = file_pattern.search(filename)
    if match:
        date_str = match.group(1)
        return pd.to_datetime(date_str, format='%m%d%y').date()
    return None

# List of all files in the directory
files = [f for f in os.listdir(directory_path) if file_pattern.match(f)]
all_fformations = []

chunk_size = 100000
# Process each file incrementally
day = 0
for file in files:
    file_path = os.path.join(directory_path, file)
    date = extract_date_from_filename(file)
    
    processed_chunks = []
    # Incremental processing
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        chunk.drop(columns=columns_to_delete, inplace=True, errors='ignore')
        chunk = chunk[~((chunk['Type1'] == 'TEACHER') | (chunk[' Type2'] == 'TEACHER'))]
        chunk = filter_badtime(chunk)
        processed_chunks.append(chunk)
        
    processed_df = pd.concat(processed_chunks, ignore_index=True)
    processed_df.columns = processed_df.columns.str.strip()
    processed_df.rename(columns={'Person 1': 'Person1'}, inplace=True) 
    processed_df['Person1'] = processed_df['Person1'].str.extract(r'(\d+)$')
    processed_df['Person2'] = processed_df['Person2'].str.extract(r'(\d+)$')
        
    processed_df['Time'] = pd.to_timedelta(processed_df['Interaction Time'])
    print("day ", day)
    print(processed_df)
    
    
    temp_df = df_homo[df_homo['day'] == str(day)]
    
    for i, row in temp_df.iterrows():
        group = eval(row['group'])  # Convert string representation of the group to a tuple
        interaction_counts = 0
        
        # Create a set of all unique pairs in the group
        group_pairs = [(group[i], group[j]) for i in range(len(group)) for j in range(i+1, len(group))]
        
        # Count interactions where all group members have contacts with each other
        for pair in group_pairs:
            person1, person2 = pair
            pair_interactions = processed_df[
                (((processed_df['Person1'] == person1) & (processed_df['Person2'] == person2)) |
                ((processed_df['Person1'] == person2) & (processed_df['Person2'] == person1))) & 
                (processed_df['45Interaction'] == 0.1)
            ]
            interaction_counts += len(pair_interactions['Interaction'])
        
        sclag = interaction_counts * 100 / 9598000
        df_homo.loc[i, 'sclag'] = sclag
        print(interaction_counts, sclag)
    
    day += 1

    all_fformations.append(df_homo)

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_fformations, ignore_index=True)

# Save or process final_results_df as needed
final_results_df.to_csv('3_group.csv', index=False)
print(final_results_df.head())


day  0
        Person1 Person2 Interaction Time  Interaction Millisecond  \
0            27      28  0 days 09:47:01                        0   
1            27      29  0 days 09:47:01                        0   
2            27      30  0 days 09:47:01                        0   
3            27      31  0 days 09:47:01                        0   
4            27      32  0 days 09:47:01                        0   
...         ...     ...              ...                      ...   
5273505      44      46  0 days 12:26:59                      900   
5273506      33      44  0 days 12:26:59                      900   
5273507      45      46  0 days 12:26:59                      900   
5273508      33      45  0 days 12:26:59                      900   
5273509      33      46  0 days 12:26:59                      900   

         Interaction  45Interaction  Type1  Type2 Diagnosis1 Diagnosis2  \
0                0.1            0.0  CHILD  CHILD         HL         HL   
1             

KeyboardInterrupt: 

In [11]:
# Directory containing the files
# Function to filter bad time based on timedelta
import networkx as nx

def filter_badtime(df):
    df = df.copy()
    df.rename(columns={' Interaction Time': 'Interaction Time'}, inplace=True)  # Remove leading space
    df['Interaction Time'] = pd.to_timedelta(df['Interaction Time'])
    df = df[df['Interaction Time'] > pd.to_timedelta("9:35:00")]  # playground
    df = df[df['Interaction Time'] > pd.to_timedelta("9:47:00")]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("10:20:00")) & (df['Interaction Time'] <= pd.to_timedelta("10:27:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("10:33:00")) & (df['Interaction Time'] <= pd.to_timedelta("10:37:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:30:00")) & (df['Interaction Time'] <= pd.to_timedelta("11:32:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:38:00")) & (df['Interaction Time'] <= pd.to_timedelta("11:41:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("11:45:00")) & (df['Interaction Time'] < pd.to_timedelta("12:02:00")))]  # toilet
    # df = df[~((df['Interaction Time'] >= pd.to_timedelta("12:19:00")) & (df['Interaction Time'] <= pd.to_timedelta("12:26:00")))]  # toilet
    df = df[df['Interaction Time'] < pd.to_timedelta("12:27:00")]  # toilet
    return df


directory_path = r"d:\OneDrive - Delft University of Technology\Thesis\DEBBIE_STARFISH_2223\Synched_Data_GR0_22_DEN\PAIRANGLES (to be uploaded by Laura)"

# File pattern to match
file_pattern = re.compile(r'DAILY_ANGLES(\d{6})_GR0_22_DEN_010224_V21201895560.CSV')

# Columns to delete
columns_to_delete = [' Angle1', ' Angle2', ' Leftx', 'Lefty', 'Rightx', 'Righty',
                     ' Leftx2', 'Lefty2', 'Rightx2', 'Righty2', 
                     ' Gender1', ' Gender2', ' WasTalking1', ' WasTalking2 ']


# Function to extract date from filename
def extract_date_from_filename(filename):
    match = file_pattern.search(filename)
    if match:
        date_str = match.group(1)
        return pd.to_datetime(date_str, format='%m%d%y').date()
    return None

# List of all files in the directory
files = [f for f in os.listdir(directory_path) if file_pattern.match(f)]
all_fformations = []

chunk_size = 100000
# Process each file incrementally
day = 0
for file in files:
    file_path = os.path.join(directory_path, file)
    date = extract_date_from_filename(file)
    
    processed_chunks = []
    # Incremental processing
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        chunk.drop(columns=columns_to_delete, inplace=True, errors='ignore')
        chunk = chunk[~((chunk['Type1'] == 'TEACHER') | (chunk[' Type2'] == 'TEACHER'))]
        chunk = filter_badtime(chunk)
        processed_chunks.append(chunk)
        
    processed_df = pd.concat(processed_chunks, ignore_index=True)
    processed_df.columns = processed_df.columns.str.strip()
    processed_df.rename(columns={'Person 1': 'Person1'}, inplace=True) 
    processed_df['Person1'] = processed_df['Person1'].str.extract(r'(\d+)$')
    processed_df['Person2'] = processed_df['Person2'].str.extract(r'(\d+)$')
        
    processed_df['Time'] = pd.to_timedelta(processed_df['Interaction Time'])
    print("day ", day)
    print(processed_df)
    
    
    temp_df = df_homo[df_homo['day'] == str(day)]
    
    grouped_by_timestamp = processed_df.groupby('Interaction Time')
    for i, row in temp_df.iterrows():
        group = eval(row['group'])  # Convert string representation of the group to a tuple
        interaction_counts = 0

        # Create a graph outside of the loop and add nodes for each person in the group
        G = nx.Graph()
        G.add_nodes_from(group)

        # Iterate over each group of interactions per timestamp
        for timestamp, timestamp_df in grouped_by_timestamp:
            # Filter interactions for the current group in one go
            timestamp_group_df = timestamp_df[
                ((timestamp_df['Person1'].isin(group)) & (timestamp_df['Person2'].isin(group)) & timestamp_df['Interaction'] == 0.1)
            ]

            # Add edges for all interactions in the group at this timestamp
            for _, interaction in timestamp_group_df.iterrows():
                G.add_edge(interaction['Person1'], interaction['Person2'])

            # Check if the group forms a connected subgraph at this timestamp
            if nx.is_connected(G.subgraph(group)):
                interaction_counts += len(timestamp_group_df['Interaction'])

            # Remove the edges of this timestamp to avoid re-adding
            G.remove_edges_from(G.edges())

        # Calculate sclag based on total interaction counts across all timestamps
        sclag = interaction_counts * 100 / 9598000
        df_homo.loc[i, 'sclag'] = sclag
        print(interaction_counts, sclag)
    
    day += 1

    all_fformations.append(df_homo)

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_fformations, ignore_index=True)

# Save or process final_results_df as needed
final_results_df.to_csv('3_group.csv', index=False)
print(final_results_df.head())


day  0
        Person1 Person2 Interaction Time  Interaction Millisecond  \
0            27      28  0 days 09:47:01                        0   
1            27      29  0 days 09:47:01                        0   
2            27      30  0 days 09:47:01                        0   
3            27      31  0 days 09:47:01                        0   
4            27      32  0 days 09:47:01                        0   
...         ...     ...              ...                      ...   
5273505      44      46  0 days 12:26:59                      900   
5273506      33      44  0 days 12:26:59                      900   
5273507      45      46  0 days 12:26:59                      900   
5273508      33      45  0 days 12:26:59                      900   
5273509      33      46  0 days 12:26:59                      900   

         Interaction  45Interaction  Type1  Type2 Diagnosis1 Diagnosis2  \
0                0.1            0.0  CHILD  CHILD         HL         HL   
1             

MemoryError: Unable to allocate 40.2 MiB for an array with shape (5273510,) and data type int64

K-means

In [None]:
def k_means(pivot_df, time_interval = 600):

    coord_columns = [col for col in pivot_df.columns if '_KC_X' in col or '_KC_Y' in col]

    members_in_a_scene = {}
    groups_detected = {}

    # Group by TIME and apply K-means clustering
    for i in range(0, len(pivot_df), time_interval):
        group = pivot_df.iloc[i]
        # Extract coordinate data for the current time frame
        coord_data = group[coord_columns]
        time = group['TIME']
        
        # Handle missing values by dropping rows with NaN values
        coord_data_cleaned = coord_data.dropna()
        
        coord_data_cleaned = np.array(coord_data_cleaned).reshape([-1,2])
        
        cluster_labels = []
        if coord_data_cleaned.shape[0] > 2:  # Ensure there's data to cluster
            k = 3  # Example number of clusters; you should choose based on your needs
            kmeans = KMeans(n_clusters=k, random_state=0)
            kmeans.fit(coord_data_cleaned)
            
            # Assign cluster labels to the current group
            cluster_labels = kmeans.labels_
            
        member_ids = [col.split('_')[0] for col in coord_columns[::2]]
        members_in_a_scene[time] =  member_ids
        
        clusters = {}
        for label, member_id in zip(cluster_labels, member_ids):
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(member_id)      
        groups_detected[time] = clusters   
        
    return groups_detected

In [None]:
all_fformations = []
# Global group encoding and counter
group_encoding = {}  # To store unique encoding for each group across all files
group_id_counter = 0

for file in files:
    file_path = os.path.join(directory_path, file)
    date = extract_date_from_filename(file)
    print(f"----------File {date} being examined")
    df = pd.read_csv(file_path).drop(columns=['VOCCHNCHF_LENAKF'])
    df['TIME'] = pd.to_datetime(df['TIME'])
    df['Date'] = date
    
    df['KC_O'] = 90 - df['KC_O']
    df['KC_O'] = (df['KC_O'] + 180) % 360 - 180
    
    df = df[df.apply(filter_teachers, axis=1)]

    df['SUBJECTID'] = df['SUBJECTID'].str.extract(r'(\d+)$')

    all_children = df['SUBJECTID'].unique()
    all_children_num = len(all_children)
    
    after_df = filter_badtime(df)

    filtered_rows = after_df[(after_df['KC_X'] <= 14.072) & (after_df['KC_Y'] <= 8.377)]
    
    pivot_df, _ = build_coords_dict(filtered_rows)
    
    groups_detected = k_means(pivot_df, time_interval = 60 *10)
    
    dominant_group_for_pair = find_dominant_groups(groups_detected)
    
    csv_file = 'final_social_contact_ratios_with_diagnosis_copy.csv'
    partial_df = add_group_info(dominant_group_for_pair, csv_file, date)
    all_fformations.append(partial_df)
    
    # Combine all results into a single DataFrame
final_results_df = pd.concat(all_fformations, ignore_index=True)

# Save or process final_results_df as needed
final_results_df.to_csv('kmeans_final_social_contact_ratios_with_diagnosis_copy.csv', index=False)
print(final_results_df.head())
