In [1]:
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict


def calculate_jaccard(set1, set2):
    """Return the Jaccard similarity of two sets.

    The result is ``|set1 ∩ set2| / |set1 ∪ set2|``; when both sets are
    empty (so the union is empty) the function returns 0.
    """
    combined = set1.union(set2)
    if not combined:
        # Nothing to compare: avoid dividing by zero.
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)


def calculate_overlap(set1, set2):
    """Return the overlap coefficient of two sets.

    The result is the size of the intersection divided by the size of the
    smaller set; it is 0 when either set is empty.
    """
    smaller_size = min(len(set1), len(set2))
    if smaller_size <= 0:
        # At least one set is empty, so there is nothing to normalise by.
        return 0
    shared_count = len(set1.intersection(set2))
    return shared_count / smaller_size


def calculate_modified_similarity(set1, set2, lambda_param=0.6):
    """Blend Jaccard similarity with the larger one-sided overlap ratio.

    The score is
    ``lambda_param * jaccard + (1 - lambda_param) * max_overlap`` where
    ``jaccard`` is ``|A ∩ B| / |A ∪ B|`` and ``max_overlap`` is the larger of
    ``|A ∩ B| / |A|`` and ``|A ∩ B| / |B|``.

    Args:
        set1: First set of node ids.
        set2: Second set of node ids.
        lambda_param: Weight given to the Jaccard term; the remainder goes
            to the overlap term.

    Returns:
        The weighted similarity. Any ratio whose denominator would be zero
        is treated as 0, so two empty sets score 0.
    """
    # Compute the intersection once; every ratio below reuses it.
    shared = len(set1.intersection(set2))
    union_size = len(set1.union(set2))

    jaccard = shared / union_size if union_size > 0 else 0
    overlap1 = shared / len(set1) if len(set1) > 0 else 0
    overlap2 = shared / len(set2) if len(set2) > 0 else 0

    # Use the more favourable one-sided ratio.
    max_overlap = max(overlap1, overlap2)

    return lambda_param * jaccard + (1 - lambda_param) * max_overlap


def load_community_data(date, base_dir='../visualization/assets/data'):
    """Load the community membership for one snapshot.

    Reads ``{base_dir}/{date}/handle/rank{date}.csv`` and groups the values
    of its ``id`` column by the values of its ``community`` column.

    Args:
        date: Snapshot identifier used in both the directory and file name.
        base_dir: Root directory that holds the per-date folders. Defaults
            to the location this file has always read from.

    Returns:
        A dict mapping each community id to the set of node ids in it, or
        None (after printing a message) when the CSV file does not exist.
    """
    file_path = f'{base_dir}/{date}/handle/rank{date}.csv'
    if not os.path.exists(file_path):
        print(f"file does not exist: {file_path}")
        return None

    df = pd.read_csv(file_path)

    # Build {community id: set of node ids}.
    # Rows are still read with iterrows() so the key/value types stay exactly
    # what callers have been receiving.
    communities = {}
    for _, row in df.iterrows():
        communities.setdefault(row['community'], set()).add(row['id'])

    return communities


def analyze_community_events(t0_date, t1_date):
    """Detect community evolution events between two snapshots.

    Community membership for both dates is loaded with
    ``load_community_data``. Communities with at least 3000 nodes are treated
    as "large" and processed first (steps 1-4); the remaining "small"
    communities follow (steps 5-7). Every step works greedily from the
    biggest community down, and a community that has been used in an event is
    marked as assigned and skipped by later steps.

    Args:
        t0_date: Identifier of the earlier snapshot.
        t1_date: Identifier of the later snapshot.

    Returns:
        A list of event dicts, or None when either snapshot could not be
        loaded. One-to-one events carry ``similarity`` and
        ``size_change_ratio``; split/merge events carry a list of community
        ids on one side plus ``overlap_score``; unmatched large communities
        carry None on the missing side.

    NOTE(review): apart from "add", every ``event_type`` label in this file
    is the empty string, so the different kinds of event cannot be told apart
    by label alone — presumably the original labels were lost; confirm and
    restore them.
    """
    print(f" {t0_date} {t1_date} community events")

    # Load the community data of both time points.
    t0_communities = load_community_data(t0_date)
    t1_communities = load_community_data(t1_date)

    if t0_communities is None or t1_communities is None:
        print("communitydata, skip")
        return None

    # Communities that already take part in an event.
    t0_assigned = set()
    t1_assigned = set()

    events = []

    # Split every snapshot into large and small communities.
    large_min_size = 3000
    t0_large_communities = {cid: nodes for cid, nodes in t0_communities.items() if len(nodes) >= large_min_size}
    t0_small_communities = {cid: nodes for cid, nodes in t0_communities.items() if len(nodes) < large_min_size}
    t1_large_communities = {cid: nodes for cid, nodes in t1_communities.items() if len(nodes) >= large_min_size}
    t1_small_communities = {cid: nodes for cid, nodes in t1_communities.items() if len(nodes) < large_min_size}

    print(f"t0community: {len(t0_large_communities)}, community: {len(t0_small_communities)}")
    print(f"t1community: {len(t1_large_communities)}, community: {len(t1_small_communities)}")

    def _by_size_desc(communities):
        """Return (id, nodes) pairs ordered from the biggest community down."""
        return sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)

    # The dicts are never mutated below, so sort each of them once instead of
    # re-sorting inside every outer-loop iteration.
    t0_large_sorted = _by_size_desc(t0_large_communities)
    t0_small_sorted = _by_size_desc(t0_small_communities)
    t1_large_sorted = _by_size_desc(t1_large_communities)
    t1_small_sorted = _by_size_desc(t1_small_communities)

    # Step 1: one-to-one matches between large communities.
    for t0_comm_id, t0_nodes in t0_large_sorted:
        if t0_comm_id in t0_assigned:
            continue

        best_match = None
        best_similarity = 0
        for t1_comm_id, t1_nodes in t1_large_sorted:
            if t1_comm_id in t1_assigned:
                continue

            similarity = calculate_modified_similarity(t0_nodes, t1_nodes)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = (t1_comm_id, t1_nodes)

        # Record an event only when the best match is similar enough.
        if best_match and best_similarity >= 0.5:
            t1_comm_id, t1_nodes = best_match

            t0_size = len(t0_nodes)
            t1_size = len(t1_nodes)
            size_change_ratio = t1_size / t0_size if t0_size > 0 else float('inf')

            event_type = ""
            if size_change_ratio > 1.2:  # grew by more than 20%
                event_type = "add"
            elif size_change_ratio < 0.8:  # shrank by more than 20%
                event_type = ""

            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": event_type,
                "similarity": best_similarity,
                "size_change_ratio": size_change_ratio
            })

            t0_assigned.add(t0_comm_id)
            t1_assigned.add(t1_comm_id)

    # Step 2: splits of large t0 communities into several t1 communities.
    for t0_comm_id, t0_nodes in t0_large_sorted:
        if t0_comm_id in t0_assigned:
            continue

        # Candidate targets as (t1 id, share of t0 nodes covered, t1 size).
        split_candidates = []

        # Large t1 communities need to cover at least 15% of the source.
        for t1_comm_id, t1_nodes in t1_large_sorted:
            if t1_comm_id in t1_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t0_nodes)
            if overlap >= 0.15:
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # Small t1 communities only need to cover 5% of the source.
        for t1_comm_id, t1_nodes in t1_small_sorted:
            if t1_comm_id in t1_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t0_nodes)
            if overlap >= 0.05:
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # Strongest overlap first.
        split_candidates.sort(key=lambda x: x[1], reverse=True)

        if len(split_candidates) >= 1:
            cumulative_overlap = sum(overlap for _, overlap, _ in split_candidates)

            # Several candidates need 40% combined coverage; a single
            # candidate needs 60% on its own.
            if (len(split_candidates) >= 2 and cumulative_overlap >= 0.4) or \
                    (len(split_candidates) == 1 and cumulative_overlap >= 0.6):
                selected_targets = []
                selected_overlap = 0

                # Take candidates until 70% of the source is accounted for.
                for t1_comm_id, overlap, _ in split_candidates:
                    if selected_overlap >= 0.7:
                        break

                    selected_targets.append(t1_comm_id)
                    selected_overlap += overlap
                    t1_assigned.add(t1_comm_id)

                if selected_targets:
                    events.append({
                        "source_date": t0_date,
                        "source_community": t0_comm_id,
                        "target_date": t1_date,
                        "target_community": selected_targets,
                        "event_type": "",
                        "overlap_score": selected_overlap
                    })
                    t0_assigned.add(t0_comm_id)

    # Step 3: merges of several t0 communities into one large t1 community.
    for t1_comm_id, t1_nodes in t1_large_sorted:
        if t1_comm_id in t1_assigned:
            continue

        # Candidate sources as (t0 id, share of t1 nodes covered, t0 size).
        merge_candidates = []

        # Large t0 communities need to cover at least 15% of the target.
        for t0_comm_id, t0_nodes in t0_large_sorted:
            if t0_comm_id in t0_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t1_nodes)
            if overlap >= 0.15:
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # Small t0 communities only need to cover 5% of the target.
        for t0_comm_id, t0_nodes in t0_small_sorted:
            if t0_comm_id in t0_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t1_nodes)
            if overlap >= 0.05:
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # Strongest overlap first.
        merge_candidates.sort(key=lambda x: x[1], reverse=True)

        if len(merge_candidates) >= 1:
            cumulative_overlap = sum(overlap for _, overlap, _ in merge_candidates)

            # Same acceptance rule as for splits.
            if (len(merge_candidates) >= 2 and cumulative_overlap >= 0.4) or \
                    (len(merge_candidates) == 1 and cumulative_overlap >= 0.6):
                selected_sources = []
                selected_overlap = 0

                # Take candidates until 70% of the target is accounted for.
                for t0_comm_id, overlap, _ in merge_candidates:
                    if selected_overlap >= 0.7:
                        break

                    selected_sources.append(t0_comm_id)
                    selected_overlap += overlap
                    t0_assigned.add(t0_comm_id)

                if selected_sources:
                    events.append({
                        "source_date": t0_date,
                        "source_community": selected_sources,
                        "target_date": t1_date,
                        "target_community": t1_comm_id,
                        "event_type": "",
                        "overlap_score": selected_overlap
                    })
                    t1_assigned.add(t1_comm_id)

    # Step 4: large communities that are still unmatched on either side.
    # The ">= 100" checks are always true here because large communities have
    # at least 3000 nodes; they are kept as written.
    for t0_comm_id, t0_nodes in t0_large_communities.items():
        if t0_comm_id not in t0_assigned and len(t0_nodes) >= 100:
            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": None,
                "event_type": "",
                "similarity": 0
            })
            t0_assigned.add(t0_comm_id)

    for t1_comm_id, t1_nodes in t1_large_communities.items():
        if t1_comm_id not in t1_assigned and len(t1_nodes) >= 100:
            events.append({
                "source_date": t0_date,
                "source_community": None,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": "",
                "similarity": 0
            })
            t1_assigned.add(t1_comm_id)

    # Step 5: one-to-one matches between small communities, scored with the
    # overlap coefficient instead of the blended similarity.
    for t0_comm_id, t0_nodes in t0_small_sorted:
        if t0_comm_id in t0_assigned:
            continue

        best_match = None
        best_overlap = 0

        for t1_comm_id, t1_nodes in t1_small_sorted:
            if t1_comm_id in t1_assigned:
                continue

            overlap = calculate_overlap(t0_nodes, t1_nodes)
            if overlap > best_overlap:
                best_overlap = overlap
                best_match = (t1_comm_id, t1_nodes)

        if best_match and best_overlap >= 0.5:
            t1_comm_id, t1_nodes = best_match

            t0_size = len(t0_nodes)
            t1_size = len(t1_nodes)
            size_change_ratio = t1_size / t0_size if t0_size > 0 else float('inf')

            event_type = ""
            if size_change_ratio > 1.3:  # wider band than for large communities
                event_type = "add"
            elif size_change_ratio < 0.7:
                event_type = ""

            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": event_type,
                "similarity": best_overlap,
                "size_change_ratio": size_change_ratio
            })

            t0_assigned.add(t0_comm_id)
            t1_assigned.add(t1_comm_id)

    # Step 6: splits of small t0 communities into small t1 communities.
    for t0_comm_id, t0_nodes in t0_small_sorted:
        if t0_comm_id in t0_assigned or len(t0_nodes) < 1:  # skip empty communities
            continue

        split_candidates = []

        for t1_comm_id, t1_nodes in t1_small_sorted:
            if t1_comm_id in t1_assigned:
                continue

            overlap = calculate_overlap(t0_nodes, t1_nodes)
            if overlap >= 0.3:
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # A split needs at least two target communities.
        if len(split_candidates) >= 2:
            cumulative_overlap = sum(overlap for _, overlap, _ in split_candidates)

            if cumulative_overlap >= 0.6:
                selected_targets = []

                # Unlike step 2, every candidate is taken.
                for t1_comm_id, overlap, _ in split_candidates:
                    selected_targets.append(t1_comm_id)
                    t1_assigned.add(t1_comm_id)

                if selected_targets:
                    events.append({
                        "source_date": t0_date,
                        "source_community": t0_comm_id,
                        "target_date": t1_date,
                        "target_community": selected_targets,
                        "event_type": "",
                        "overlap_score": cumulative_overlap
                    })
                    t0_assigned.add(t0_comm_id)

    # Step 7: merges of small t0 communities into one small t1 community.
    for t1_comm_id, t1_nodes in t1_small_sorted:
        if t1_comm_id in t1_assigned or len(t1_nodes) < 1:  # skip empty communities
            continue

        merge_candidates = []

        for t0_comm_id, t0_nodes in t0_small_sorted:
            if t0_comm_id in t0_assigned:
                continue

            overlap = calculate_overlap(t1_nodes, t0_nodes)
            if overlap >= 0.3:
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # A merge needs at least two source communities.
        if len(merge_candidates) >= 2:
            cumulative_overlap = sum(overlap for _, overlap, _ in merge_candidates)

            if cumulative_overlap >= 0.6:
                selected_sources = []

                # Unlike step 3, every candidate is taken.
                for t0_comm_id, overlap, _ in merge_candidates:
                    selected_sources.append(t0_comm_id)
                    t0_assigned.add(t0_comm_id)

                if selected_sources:
                    events.append({
                        "source_date": t0_date,
                        "source_community": selected_sources,
                        "target_date": t1_date,
                        "target_community": t1_comm_id,
                        "event_type": "",
                        "overlap_score": cumulative_overlap
                    })
                    t1_assigned.add(t1_comm_id)

    return events


def generate_all_events(months=None, output_dir='../visualization/assets/data/events'):
    """Generate community events for consecutive snapshots and save them.

    Each adjacent pair in ``months`` is passed to
    ``analyze_community_events``; the collected events are written to
    ``community_events.json`` and ``community_events.csv`` in ``output_dir``.

    Args:
        months: Ordered snapshot identifiers. Defaults to 202401 through
            202410. Pass an explicit sequence for any other period, since a
            plain integer range cannot step across a year boundary.
        output_dir: Directory that receives the two output files; it is
            created when missing.

    Returns:
        The list of all event dicts, in the order they were produced.
    """
    if months is None:
        months = range(202401, 202411)
    months = list(months)

    all_events = []
    # Walk over consecutive (earlier, later) snapshot pairs.
    for t0, t1 in zip(months, months[1:]):
        events = analyze_community_events(t0, t1)
        # None (missing data) and empty results contribute nothing.
        if events:
            all_events.extend(events)

    # Save the event data.
    os.makedirs(output_dir, exist_ok=True)

    # JSON cannot encode sets, so a community list whose first element is a
    # set is converted to a list of lists on a copy of the event.
    events_json = []
    for event in all_events:
        event_copy = event.copy()
        for key in ('source_community', 'target_community'):
            value = event_copy.get(key)
            if isinstance(value, list) and len(value) > 0 and isinstance(value[0], set):
                event_copy[key] = [list(comm) for comm in value]
        events_json.append(event_copy)

    with open(f'{output_dir}/community_events.json', 'w') as f:
        json.dump(events_json, f, indent=2)

    # The CSV is built from the unconverted events.
    events_df = pd.DataFrame(all_events)
    events_df.to_csv(f'{output_dir}/community_events.csv', index=False)

    print(f"generate {len(all_events)} community events")
    return all_events


# Run the pipeline: analyse every consecutive month pair and write the
# JSON/CSV event files; the returned list of events is kept in all_events.
all_events = generate_all_events()

 202401 202402 community events
t0community: 5, community: 16
t1community: 4, community: 55
 202402 202403 community events
t0community: 4, community: 55
t1community: 5, community: 16
 202403 202404 community events
t0community: 5, community: 16
t1community: 5, community: 18
 202404 202405 community events
t0community: 5, community: 18
t1community: 5, community: 22
 202405 202406 community events
t0community: 5, community: 22
t1community: 4, community: 29
 202406 202407 community events
t0community: 4, community: 29
t1community: 5, community: 25
 202407 202408 community events
t0community: 5, community: 25
t1community: 5, community: 24
 202408 202409 community events
t0community: 5, community: 24
t1community: 5, community: 19
 202409 202410 community events
t0community: 5, community: 19
t1community: 6, community: 39
generate 147 community events
