In [1]:
import pandas as pd


In [2]:
def getdict(rank):
    community_mapping = {comm: idx for idx, comm in enumerate(sorted(rank['community'].unique()))}
    rank['community_number'] = rank['community'].map(community_mapping)
    community_to_ids = (
        rank.groupby('community_number')['id']
        .apply(set)
        .to_dict()
    )
    return community_to_ids


In [3]:
all_community_to_ids = {}
for date in range(202404, 202411):
    rank = pd.read_csv(f'./{date}/handle/rank{date}.csv')
    community_to_ids = getdict(rank)
    all_community_to_ids[date] = community_to_ids

In [4]:
len(all_community_to_ids.get(202401))

TypeError: object of type 'NoneType' has no len()

In [4]:
import numpy as np


def adjusted_jaccard_index(set1, set2, total_nodes):
    """Calculate the adjusted Jaccard Index between two sets."""
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    expected_intersection = (len(set1) * len(set2)) / total_nodes
    if union == 0:
        return 0  # Avoid division by zero
    adjusted_j = (intersection - expected_intersection) / (union - expected_intersection)
    return max(0, adjusted_j)


def export_advanced_community_events_to_csv(all_community_to_ids, similarity_threshold,
                                            output_file="advanced_community_events.csv"):
    """Track community events with advanced rules and export results to CSV."""
    time_slices = sorted(all_community_to_ids.keys())
    results = []
    total_nodes = len(set.union(*[set.union(*time_slice.values()) for time_slice in all_community_to_ids.values()]))

    for t1, t2 in zip(time_slices[:-1], time_slices[1:]):
        communities_t1 = all_community_to_ids[t1]
        communities_t2 = all_community_to_ids[t2]

        # Similarity matrix
        similarity_matrix = np.zeros((len(communities_t1), len(communities_t2)))
        t1_ids = list(communities_t1.keys())
        t2_ids = list(communities_t2.keys())

        for i, id1 in enumerate(t1_ids):
            for j, id2 in enumerate(t2_ids):
                similarity = adjusted_jaccard_index(
                    communities_t1[id1], communities_t2[id2], total_nodes
                )
                similarity_matrix[i, j] = similarity

        # Track each event
        row_sums = np.sum(similarity_matrix >= similarity_threshold, axis=1)
        col_sums = np.sum(similarity_matrix >= similarity_threshold, axis=0)

        for i, id1 in enumerate(t1_ids):
            connected = np.where(similarity_matrix[i, :] >= similarity_threshold)[0]
            if len(connected) == 0:  # No continuation
                results.append({
                    "Previous Time Slice": int(t1),
                    "Previous Community": int(id1),
                    "Current Time Slice": t2,
                    "Current Community": -1,
                    "Similarity": None,
                    "Event": "Dies"
                })
            elif len(connected) > 1:  # Splits into multiple
                for j in connected:
                    results.append({
                        "Previous Time Slice": int(t1),
                        "Previous Community": int(id1),
                        "Current Time Slice": t2,
                        "Current Community": int(t2_ids[j]),
                        "Similarity": similarity_matrix[i, j],
                        "Event": "Splits"
                    })
            else:  # Continues normally
                j = connected[0]
                results.append({
                    "Previous Time Slice": int(t1),
                    "Previous Community": int(id1),
                    "Current Time Slice": t2,
                    "Current Community": int(t2_ids[j]),
                    "Similarity": similarity_matrix[i, j],
                    "Event": "Continues"
                })

        for j, id2 in enumerate(t2_ids):
            connected = np.where(similarity_matrix[:, j] >= similarity_threshold)[0]
            if len(connected) == 0:  # No originating community
                results.append({
                    "Previous Time Slice": int(t1),
                    "Previous Community": -1,
                    "Current Time Slice": t2,
                    "Current Community": int(id2),
                    "Similarity": None,
                    "Event": "Born"
                })
            elif len(connected) > 1:  # Merges from multiple
                for i in connected:
                    results.append({
                        "Previous Time Slice": int(t1),
                        "Previous Community": int(t1_ids[i]),
                        "Current Time Slice": t2,
                        "Current Community": int(id2),
                        "Similarity": similarity_matrix[i, j],
                        "Event": "Merges"
                    })

    # Convert results to DataFrame
    df = pd.DataFrame(results)
    # Export to CSV
    df.to_csv(output_file, index=False)
    return df


# Run the function and save advanced results
advanced_community_events_df = export_advanced_community_events_to_csv(all_community_to_ids, 0.2)

In [5]:
advanced_community_events_df.to_csv("advanced_community_events.csv", index=False)

In [43]:
def community_similarity_to_target(community_id, time_slice, target_time_slice, all_community_to_ids, threshold):
    """Calculate the similarity of a specific community to all communities in the target time slice."""
    # Step 1: Get the community set in the specified time slice
    community_set = all_community_to_ids.get(time_slice, {}).get(community_id, set())

    if not community_set:
        raise ValueError(f"Community {community_id} does not exist in time slice {time_slice}")

    # Step 2: Calculate total number of nodes across all time slices
    total_nodes = len(set.union(*[set.union(*time_slice.values()) for time_slice in all_community_to_ids.values()]))

    # Step 3: Prepare a list to store communities with similarity above the threshold
    similarity_results = []

    # Step 4: Get the communities in the target time slice
    target_communities = all_community_to_ids.get(target_time_slice, {})
    if not target_communities:
        raise ValueError(f"Target time slice {target_time_slice} does not exist in the dataset.")

    # Step 5: Loop through all communities in the target time slice and calculate the similarity
    for target_community_id, target_community_set in target_communities.items():
        similarity = adjusted_jaccard_index(community_set, target_community_set, total_nodes)

        # Only add to results if similarity is above the threshold
        if similarity > threshold:
            similarity_results.append((target_community_id, similarity))

    # Return a list of tuples (community_id, similarity) for communities that passed the threshold
    return similarity_results


# Example Usage:
community_id = 5
time_slice = 202405  # Current time slice
target_time_slice = 202406  # Target time slice
threshold = 0  # Minimum similarity threshold

# Assuming all_community_to_ids is already defined
similarities = community_similarity_to_target(community_id, time_slice, target_time_slice, all_community_to_ids,
                                              threshold)


In [44]:
similarities

[(54, 0.9582716973885506)]

In [51]:
print(len(all_community_to_ids.get(202405).get(5)))
print(len(set(all_community_to_ids.get(202406).get(54)) & set(all_community_to_ids.get(202405).get(5))))
print(len(set(all_community_to_ids.get(202406).get(71)) & set(all_community_to_ids.get(202405).get(5))))
print(len(all_community_to_ids.get(202406).get(54)))



116
115
1
119


In [49]:
def community_similarity_to_target(community_id, time_slice, target_time_slice, all_community_to_ids, threshold):
    """Calculate the similarity of a specific community to all communities in the target time slice using the overlap coefficient."""

    # Step 1: Get the community set in the specified time slice
    community_set = all_community_to_ids.get(time_slice, {}).get(community_id, set())

    if not community_set:
        raise ValueError(f"Community {community_id} does not exist in time slice {time_slice}")

    # Step 2: Calculate total number of nodes across all time slices
    total_nodes = len(set.union(*[set.union(*time_slice.values()) for time_slice in all_community_to_ids.values()]))

    # Step 3: Prepare a list to store communities with similarity above the threshold
    similarity_results = []

    # Step 4: Get the communities in the target time slice
    target_communities = all_community_to_ids.get(target_time_slice, {})
    if not target_communities:
        raise ValueError(f"Target time slice {target_time_slice} does not exist in the dataset.")

    # Step 5: Loop through all communities in the target time slice and calculate the overlap coefficient
    for target_community_id, target_community_set in target_communities.items():
        similarity = overlap_coefficient(community_set, target_community_set)

        # Only add to results if similarity is above the threshold
        if similarity > threshold:
            similarity_results.append((target_community_id, similarity))

    # Return a list of tuples (community_id, similarity) for communities that passed the threshold
    return similarity_results

def overlap_coefficient(set1, set2):
    """Calculate the overlap coefficient between two sets."""
    intersection_size = len(set1.intersection(set2))
    return intersection_size / len(set1)

# Example Usage:
community_id = 5
time_slice = 202405  # Current time slice
target_time_slice = 202406  # Target time slice
threshold = 0  # Minimum similarity threshold (adjust as needed)

# Assuming all_community_to_ids is already defined, where it's a dictionary mapping time slices to community sets
similarities = community_similarity_to_target(community_id, time_slice, target_time_slice, all_community_to_ids, threshold)



In [50]:
similarities

[(54, 0.9913793103448276), (71, 0.008620689655172414)]

In [None]:
# 获取top 20的节点
rank4 = pd.read_csv('./202404/handle/rank202404.csv')
top20 = rank4.drop_duplicates(subset=['community'], keep="first").head(20)['community']


In [None]:
len(all_community_to_ids.get(202404).get(3))

In [None]:
community_similarity_to_target(2, 202407, 202408, all_community_to_ids, 0.1)

In [None]:
# 社区标签追踪

In [None]:
time_slices = [202404, 202405, 202406, 202407, 202408, 202409, 202410]
df = pd.DataFrame(columns=time_slices)
df[202404] = top20

In [None]:
def fill_community_labels(df, community_events_df):
    # 遍历时间片列（202405 到 202410）
    for time_idx in range(1, len(df.columns)):
        previous_time = df.columns[time_idx - 1]
        current_time = df.columns[time_idx]

        # 对每一行（每个社区），检查其在 previous_time 中的社区标签，并查找其在当前时间片的标签
        for idx, community_id in enumerate(df[previous_time]):
            # 如果当前社区标签为 -1，表示已经死亡，不需要继续处理
            if community_id == -1:
                df[current_time].iloc[idx] = -1
                continue

            # 查找该社区的行为
            matching_rows = community_events_df[(community_events_df['Previous Time Slice'] == previous_time) &
                                                (community_events_df['Previous Community'] == community_id) &
                                                (community_events_df['Current Time Slice'] == current_time)]

            if matching_rows.empty:
                # 如果没有找到匹配的行为，表示该社区没有发生变化，保留原有标签
                df[current_time].iloc[idx] = community_id
            else:
                # 如果找到行为，按照行为类型填充新的社区标签
                event = matching_rows.iloc[0]['Event']

                if event == "Dies":
                    df[current_time].iloc[idx] = -1
                elif event == "Continues":
                    df[current_time].iloc[idx] = matching_rows.iloc[0]['Current Community']
                elif event == "Merges":
                    df[current_time].iloc[idx] = matching_rows.iloc[0]['Current Community']
                elif event == "Splits":
                    df[current_time].iloc[idx] = matching_rows.iloc[0]['Current Community']

    return df


# 假设 df 和 advanced_community_events_df 已经被定义
# 填充 df 中的缺失社区标签
df_filled = fill_community_labels(df, advanced_community_events_df)

In [None]:
community_similarity_to_target(6, 202404, 202405, all_community_to_ids, 0.3)

In [None]:
df_filled = df_filled.reset_index(drop=True)
df_filled

In [None]:
import numpy as np

# 遍历每一行
for index, row in df_filled.iterrows():
    # 找到-1的索引
    minus_one_indices = np.where(row == -1)[0]

    if len(minus_one_indices) >= 2:
        died_start = minus_one_indices[0] - 1
        curr_idx = minus_one_indices[0] + 1

        while curr_idx < len(row):
            # 使用相似度计算来填充缺失值
            similarities = community_similarity_to_target(df_filled.iloc[index, died_start],
                                                          time_slices[died_start],
                                                          time_slices[curr_idx],
                                                          all_community_to_ids, 0.3)

            if similarities:
                # 如果相似度计算有结果，填充当前列，并切换到现有数据填充策略
                df_filled.iloc[index, curr_idx] = similarities[0][0]
                curr_idx += 1

                # 进入现有数据填充模式，直到填充数据为-1
                while curr_idx < len(row):

                    # 查找根据已有数据填充
                    matching_df = advanced_community_events_df[
                        (advanced_community_events_df['Previous Time Slice'] == time_slices[died_start]) &
                        (advanced_community_events_df['Previous Community'] == df_filled.iloc[index, died_start]) &
                        (advanced_community_events_df['Current Time Slice'] == time_slices[curr_idx])]

                    # 检查是否找到匹配的行
                    if not matching_df.empty:
                        # 找到匹配的行后获取填充值
                        find_ans = int(matching_df['Current Community'].iloc[0])

                        if find_ans != -1:
                            df_filled.iloc[index, curr_idx] = find_ans
                            curr_idx += 1
                        else:
                            # 如果填充值为-1，保持为-1，不覆盖，并跳出，返回到相似度计算策略
                            df_filled.iloc[index, curr_idx] = -1
                            died_start = curr_idx
                            curr_idx += 1
                            break  # 跳出内层 while，重新进入相似度计算
                    else:
                        # 如果没有找到匹配的行，当前列填充-1，然后返回到相似度计算策略
                        died_start = curr_idx - 1
                        break  # 跳出内层 while，重新进入相似度计算

            else:
                # 如果相似度计算没有结果，填充为-1，并继续尝试下一列
                df_filled.iloc[index, curr_idx] = -1
                curr_idx += 1



In [None]:
df_filled.to_csv("filled_community_labels.csv", index=False)

In [None]:
df_filled

In [None]:
# 获取对应社区的节点大小
# 绘制矩阵的社区时间图，悬停可以查看当前时间片下的社区属性状态
# 点击，可以获取当前社区内部的所有节点和连接关系，通过ascore绘制

In [None]:
import os

df_size = pd.read_csv("filled_community_labels.csv")

base_dir = './'

# 遍历 df 中的每一行
for idx, row in df_size.iterrows():
    # 遍历每一列
    for col in df_size.columns:
        value = row[col]
        if value == -1:
            # 如果值为 -1，直接将其设置为 0
            df_size.at[idx, col] = 0
        else:
            # 否则，根据列名读取对应的文件
            file_path = os.path.join(base_dir, f'{col}/handle/rank{col}.csv')

            if os.path.exists(file_path):
                # 如果文件存在，读取文件
                temp_df = pd.read_csv(file_path)

                # 获取与 value 对应的 community 的大小
                community_size = len(temp_df[temp_df['community'] == value])

                # 将 df 中相应位置的值替换为社区的大小
                df_size.at[idx, col] = community_size
            else:
                # 如果文件不存在，你可以决定是否要填充为某个默认值
                print(f"Warning: {file_path} does not exist.")
                df_size.at[idx, col] = 0  # 或者选择其他合适的默认值

In [None]:
df_size.to_csv("community_sizes.csv", index=False)

In [None]:
# die community需要重新追踪，从消亡前的编号与后续时间片下的社区是否存在关联，如果有关联则继续追踪，否则则进行下一个时间片的追踪
# 这里我需要获取任意一个社区编号，与其他时间片下的所有社区之间的相似度计算，如果大于阈值，且只有一个则是延续，两个以上则是分裂，如果没有则是消亡

In [None]:
## 处理local-core的数据
"""
1. 相关社区的节点（这里坐标需要重新计算）和连接关系
2. 连接关系和节点转为fdeb数据格式
3. 聚合以后的节点和连接数据；包括fdeb
"""

In [8]:
import os
import pandas as pd
import json

handle_df = pd.read_csv("matrix/new_community_evolution.csv")

In [9]:
# 与上下文无关
# handle坐标需要重新计算，最大半径320
import numpy as np

RAD = np.pi / 180


def polarToCartesian(asrank, attr, radius=350):
    """
    计算xy坐标的实现函数
    :param asrank: 需要处理的asrank数据，保证为DataFrame格式，且必须要拥有longitude、attr字段
    :param attr: 指定as的某个属性，这里使用cone
    :param radius: 节点的最大极半径
    :return:
    """
    # 映射值域cone的min-max  映射到   0-350
    min_cone = asrank[attr].min()
    max_cone = asrank[attr].max()

    # cone原值域太大，进行值域缩放，用来后面求半径
    max_Domain = 1 - np.log((min_cone + 1) / (max_cone + 1))

    # cone的log处理
    asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))

    # log处理后，对cone进行一个0-350区间映射
    asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))

    # 得到xy坐标 x=初始位置+rcosθ
    x = 1900 / 2 + asrank['r'] * np.cos(asrank['longitude'] * RAD)
    y = 1000 / 2 - asrank['r'] * np.sin(asrank['longitude'] * RAD)

    asrank['x'] = x
    asrank['y'] = y
    # 将rank 的r字段全部数值变为绝对值
    asrank.sort_values(by=attr, ascending=False, inplace=True)
    asrank.drop_duplicates(subset=['id', attr], keep='last', inplace=True)

    return asrank

# for date in range(202404, 202411):
#     cord_rank = pd.read_csv(f'./{date}/handle/rank{date}.csv')
#     cord_rank1 = pd.read_csv(f'./{date}/rank{date}.csv')
#     cord_rank = polarToCartesian(cord_rank, 'cone', 320)
#     cord_rank1 = polarToCartesian(cord_rank1, 'cone', 320)
#     cord_rank.fillna(0, inplace=True)
#     cord_rank1.fillna(0, inplace=True)
#     cord_rank.to_csv(f'./{date}/handle/rank{date}.csv', index=False)
#     cord_rank1.to_csv(f'./{date}/rank{date}.csv', index=False)



In [10]:
base_dir = './'

# 初始化 rankjson 和 reljson
rankjson = {}
reljson = {}

# 获取 handle_df 中每一列的所有唯一值
for col in handle_df.columns:
    unique_values = handle_df[col].unique()  # 获取该列的所有唯一值
    rankjson[col] = {}  # 创建该列对应的 rankjson 结构
    reljson[col] = {}  # 创建该列对应的 reljson 结构

    # 遍历该列的所有唯一值
    for value in unique_values:
        # 将 value 转换为字符串，以确保 JSON 兼容
        value_str = str(value)

        # 根据列名获取对应的 rank 和 rel 文件路径
        rank_path = os.path.join(base_dir, f'{col}/handle/rank{col}.csv')
        rel_path = os.path.join(base_dir, f'{col}/handle/rel{col}.csv')

        # 读取 rank 和 rel 文件
        temp_rank = pd.read_csv(rank_path)
        temp_rel = pd.read_csv(rel_path)

        # 根据条件筛选数据
        filtered_rank = temp_rank[temp_rank['community'] == value]
        filtered_rel = temp_rel[temp_rel['type'] == str(value)]
        filtered_rank = polarToCartesian(filtered_rank, 'cone', 320)

        if len(filtered_rank) != 0 and len(filtered_rel) != 0:
            filtered_rel['cone'] = filtered_rel.apply(
                lambda row: max(
                    filtered_rank.loc[filtered_rank['id'] == row['source'], 'cone'].iloc[0],  # 获取 source 对应的 cone 值
                    filtered_rank.loc[filtered_rank['id'] == row['target'], 'cone'].iloc[0]  # 获取 target 对应的 cone 值
                ),
                axis=1
            )

        os.makedirs(f'./local_core/{col}/raw', exist_ok=True)
        # 将筛选结果存储到对应的 rankjson 和 reljson 中
        rankjson[col][value_str] = filtered_rank.to_dict(orient='records')
        reljson[col][value_str] = filtered_rel.to_dict(orient='records')

    with open(f'./local_core/{col}/raw/rank.json', 'w', encoding='utf-8') as rank_file:
        json.dump(rankjson, rank_file, ensure_ascii=False, indent=4)

    with open(f'./local_core/{col}/raw/rel.json', 'w', encoding='utf-8') as rel_file:
        json.dump(reljson, rel_file, ensure_ascii=False, indent=4)
    print(f"Saved data for {col}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['x'] = x
A value i

Saved data for 202401


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['x'] = x
A value i

Saved data for 202402


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['x'] = x
A value i

Saved data for 202403


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['x'] = x
A value i

Saved data for 202404


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['cone_map'] = asrank[attr].apply(lambda x: (1 - np.log((x + 1) / (max_cone + 1))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['r'] = asrank['cone_map'].apply(lambda x: (((x - 1) / (max_Domain - 1)) * radius))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asrank['x'] = x
A value i

KeyboardInterrupt: 

In [None]:
# 节点聚合，根据国家进行聚合，然后相应的连接关系也进行聚合，并导出

In [None]:
def process_rank_and_rel(t1, t2):
    t1.fillna(0, inplace=True)
    t1_agg = t1.loc[t1.groupby('country')['cone'].idxmax()]

    t1_agg['cone'] = t1_agg['country'].apply(
        lambda country: t1[t1['country'] == country]['cone'].sum()
    )
    t1_agg = polarToCartesian(t1_agg, 'cone', 320)

    id_mapping = {}
    for idx, row in t1.iterrows():
        country = row['country']
        key = row['id']
        value = t1_agg[t1_agg['country'] == country]['id'].values[0]
        id_mapping[key] = value

    t2['source'] = t2['source'].map(id_mapping).fillna(t2['source'])
    t2['target'] = t2['target'].map(id_mapping).fillna(t2['target'])
    t2['source'] = t2['source'].astype(int)
    t2['target'] = t2['target'].astype(int)
    t2 = t2[t2['source'] != t2['target']]
    t2['source_target'] = t2.apply(lambda row: tuple(sorted([row['source'], row['target']])), axis=1)
    t2['reverse_relation'] = t2.apply(lambda row: tuple(sorted([row['target'], row['source']])), axis=1)
    t2 = t2.drop_duplicates(subset=['source_target'])
    t2 = t2.drop_duplicates(subset=['reverse_relation'])
    t2 = t2.drop_duplicates(subset=['source', 'target'])
    t2 = t2.drop(columns=['source_target'])
    t2 = t2.drop(columns=['reverse_relation'])

    if len(t1_agg) != 0 and len(t2) != 0:
        t2['cone'] = t2.apply(
            lambda row: max(
                t1_agg.loc[t1_agg['id'] == row['source'], 'cone'].iloc[0],  # 获取 source 对应的 cone 值
                t1_agg.loc[t1_agg['id'] == row['target'], 'cone'].iloc[0]  # 获取 target 对应的 cone 值
            ),
            axis=1
        )

    return t1_agg, t2

In [None]:
base_dir = './'

# 初始化 rankjson 和 reljson
rankjson = {}
reljson = {}

# 获取 handle_df 中每一列的所有唯一值
for col in handle_df.columns:
    unique_values = handle_df[col].unique()  # 获取该列的所有唯一值
    rankjson[col] = {}
    reljson[col] = {}
    os.makedirs(f'./local_core/{col}/agg', exist_ok=True)
    # 遍历该列的所有唯一值
    for value in unique_values:
        # 将 value 转换为字符串，以确保 JSON 兼容
        value_str = str(value)

        # 根据列名获取对应的 rank 和 rel 文件路径
        rank_path = os.path.join(base_dir, f'{col}/handle/rank{col}.csv')
        rel_path = os.path.join(base_dir, f'{col}/handle/rel{col}.csv')

        # 读取 rank 和 rel 文件
        temp_rank = pd.read_csv(rank_path)
        temp_rel = pd.read_csv(rel_path)

        # 根据条件筛选数据
        filtered_rank = temp_rank[temp_rank['community'] == value]
        filtered_rel = temp_rel[temp_rel['type'] == str(value)]
        t1, t2 = process_rank_and_rel(filtered_rank, filtered_rel)

        if len(t1) == 1:
            t1.loc[t1.index[0], 'x'] = 1900 / 2
            t1.loc[t1.index[0], 'y'] = 1000 / 2
            t1.loc[t1.index[0], 'r'] = 0
        # 将筛选结果存储到对应的 rankjson 和 reljson 中
        rankjson[col][value_str] = t1.to_dict(orient='records')
        reljson[col][value_str] = t2.to_dict(orient='records')

    with open(f'./local_core/{col}/agg/rank.json', 'w', encoding='utf-8') as rank_file:
        json.dump(rankjson, rank_file, ensure_ascii=False, indent=4)

    with open(f'./local_core/{col}/agg/rel.json', 'w', encoding='utf-8') as rel_file:
        json.dump(reljson, rel_file, ensure_ascii=False, indent=4)
    print(f"Saved data for {col}")

In [None]:
### 处理fdeb的逻辑
from common.utils.As_rank_rel import fdeb_format
base_dir = './'

# 初始化 rankjson 和 reljson
rankjson = {}
reljson = {}

# 获取 handle_df 中每一列的所有唯一值
for col in handle_df.columns:
    unique_values = handle_df[col].unique()  # 获取该列的所有唯一值
    rankjson[col] = {}
    reljson[col] = {}
    os.makedirs(f'./local_core/{col}/fdeb', exist_ok=True)
    # 遍历该列的所有唯一值
    for value in unique_values:
        # 将 value 转换为字符串，以确保 JSON 兼容
        value_str = str(value)

        # 根据列名获取对应的 rank 和 rel 文件路径
        rank_path = os.path.join(base_dir, f'{col}/handle/rank{col}.csv')
        rel_path = os.path.join(base_dir, f'{col}/handle/rel{col}.csv')

        # 读取 rank 和 rel 文件
        temp_rank = pd.read_csv(rank_path)
        temp_rel = pd.read_csv(rel_path)

        # 根据条件筛选数据
        filtered_rank = temp_rank[temp_rank['community'] == value]
        filtered_rel = temp_rel[temp_rel['type'] == str(value)]
        t1, t2 = process_rank_and_rel(filtered_rank, filtered_rel)

        if len(t1) == 1:
            t1.loc[t1.index[0], 'x'] = 1900 / 2
            t1.loc[t1.index[0], 'y'] = 1000 / 2
            t1.loc[t1.index[0], 'r'] = 0

        r1, r2 = fdeb_format(t1,t2)
        rankjson[col][value_str] = r1
        reljson[col][value_str] = r2
    with open(f'./local_core/{col}/fdeb/rank.json', 'w', encoding='utf-8') as rank_file:
        json.dump(rankjson, rank_file, ensure_ascii=False, indent=4)

    with open(f'./local_core/{col}/fdeb/rel.json', 'w', encoding='utf-8') as rel_file:
        json.dump(reljson, rel_file, ensure_ascii=False, indent=4)
    print(f"Saved data for {col}")