In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [119]:
# Load the main dataset and the cluster labels (read from the hdbscan_results
# directory), then attach a 'cluster_label' column to the main dataset.
# `pd` is provided by the notebook's import cell.

main_data_path = '../data/raw_data/new_raw_data_polygon.csv'
cluster_labels_path = '../data/raw_data/interaction_mode_results/main_clustering/hdbscan_results/cluster_labels.csv'

# Load the main dataset
main_df = pd.read_csv(main_data_path)

# Load the cluster labels
main_cluster_labels = pd.read_csv(cluster_labels_path)

# Merge cluster labels with main dataset (on 'id' if present, else by row position)
if 'id' in main_df.columns and 'id' in main_cluster_labels.columns:
    # validate='many_to_one' makes pandas raise MergeError when an id occurs more
    # than once in the labels file; without it such ids would silently duplicate
    # rows of main_df and inflate every per-cluster statistic computed later.
    merged_main = pd.merge(
        main_df,
        main_cluster_labels,
        on='id',
        how='left',
        validate='many_to_one',
    )
else:
    # Column assignment is index-aligned: if the two files differ in length the
    # surplus rows would become NaN (or surplus labels would be dropped) without
    # any warning, so fail loudly instead.
    if len(main_cluster_labels) != len(main_df):
        raise ValueError(
            f"Cannot align cluster labels by row position: main dataset has "
            f"{len(main_df)} rows but cluster labels file has "
            f"{len(main_cluster_labels)} rows."
        )
    merged_main = main_df.copy()
    merged_main['cluster_label'] = main_cluster_labels['cluster_label']

# merged_main now contains every row of main_df together with its cluster label
# (NaN where the 'id' merge found no matching label).

In [120]:
# Build a per-cluster summary table: for every cluster label present in
# merged_main, keep the mean / std / min / max rows of DataFrame.describe()
# for each described column.
# NOTE(review): earlier notes in this cell referred to swapping in an analysis
# function from interaction_mode_features_v2.py; the describe()-based summary
# below is the stand-in that is actually executed — TODO confirm whether the
# dedicated function should replace it.
feature_analysis_results = {}
cluster_ids = merged_main['cluster_label'].dropna().unique()
for cluster in sorted(cluster_ids):
    # Rows assigned to the current cluster
    in_cluster = merged_main['cluster_label'] == cluster
    cluster_df = merged_main[in_cluster]
    # One row per described column, restricted to the four statistics of interest
    described = cluster_df.describe().T
    result = described[['mean', 'std', 'min', 'max']]
    feature_analysis_results[cluster] = result

# Stack the per-cluster tables; the dict keys become the outer index level
# named 'cluster_label'.
combined_feature_analysis = pd.concat(
    feature_analysis_results,
    names=['cluster_label'],
)
combined_feature_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,TX_PER_MONTH,19.100927,4.957916e+01,1.00,2.221000e+03
-1,TOKEN_DIVERSITY,4.242555,2.560051e+00,1.00,3.300000e+01
-1,PROTOCOL_DIVERSITY,10.772005,1.441968e+01,1.00,6.750000e+02
-1,TOTAL_TRANSFER_USD,101970.263210,3.222417e+06,0.03,1.671072e+08
-1,INTERACTION_DIVERSITY,9.119457,4.015523e+00,1.00,3.900000e+01
...,...,...,...,...,...
20,FLOTSAM_EVENTS,0.000000,0.000000e+00,0.00,0.000000e+00
20,BRIDGE_OUTFLOW_COUNT,101.991905,2.125126e+03,0.00,7.523600e+04
20,BRIDGE_INFLOW_COUNT,0.000000,0.000000e+00,0.00,0.000000e+00
20,BRIDGE_TOTAL_VOLUME_USD,120443.300143,1.994746e+06,0.00,6.795146e+07


The DataFrame above gives the key summary values (mean, std, min, max) of each feature in each cluster.

The cells below look at the other features' values for the clusters selected in the output of interaction_mode_cluster_selection_v2.py: clusters 7, 4, 5 and 12 for DEX_EVENTS, CEX_EVENTS, DEFI_EVENTS and BRIDGE_EVENTS respectively.

In [121]:
# Read the per-feature cluster selections from
# interaction_mode_cluster_selections_v2.json, then report, for each selected
# cluster, its feature summary table, its size, and the selection metadata.
import json

cluster_selections_path = '../data/processed_data/interaction_mode_cluster_selections_v2.json'
with open(cluster_selections_path, 'r') as f:
    cluster_selections = json.load(f)

# feature name -> selection record, for the 'main' dataset
feature_selections = cluster_selections['datasets']['main']['feature_selections']

# feature name -> label of the cluster chosen for that feature
selected_clusters = {
    name: record['selected_cluster']
    for name, record in feature_selections.items()
}

print('Selected clusters for each feature:', selected_clusters)

# Cluster labels that have a summary table in combined_feature_analysis
summarised_labels = combined_feature_analysis.index.get_level_values('cluster_label')

# For each selected cluster, print feature analysis and cluster statistics
for feature, cluster_label in selected_clusters.items():
    print(f'\nFeature: {feature} | Selected Cluster: {cluster_label}')

    if cluster_label not in summarised_labels:
        print(f'Cluster {cluster_label} not found in combined_feature_analysis.')
        continue

    # Summary statistics of every described column within this cluster
    feature_stats = combined_feature_analysis.loc[cluster_label]
    print(feature_stats)

    # Cluster statistics
    cluster_df = merged_main[merged_main['cluster_label'] == cluster_label]
    cluster_size = cluster_df.shape[0]
    print(f"Cluster Size: {cluster_size}")

    # Additional measures from the cluster selection output; a key missing
    # from the record is reported as None
    info = feature_selections[feature]
    activity_level = info.get('activity_level')
    meets_activity = info.get('meets_activity_level')
    median_value = info.get('median')
    print(f"Activity Level Threshold: {activity_level}")
    print(f"Meets Activity Level: {meets_activity}")
    print(f"Median Value (used as feature value): {median_value}")

Selected clusters for each feature: {'DEX_EVENTS': 7, 'CEX_EVENTS': 4, 'DEFI_EVENTS': 5, 'BRIDGE_EVENTS': 12}

Feature: DEX_EVENTS | Selected Cluster: 7
                                mean           std     min            max
TX_PER_MONTH               24.781063     94.519772   1.000    1484.833333
TOKEN_DIVERSITY             5.822622     16.759423   1.000     326.000000
PROTOCOL_DIVERSITY         14.190231     28.603489   2.000     358.000000
TOTAL_TRANSFER_USD       8691.690977  30991.783835   0.060  364022.360000
INTERACTION_DIVERSITY       7.398458      4.360448   1.000      33.000000
ACTIVE_DURATION_DAYS      112.000000     50.306842  15.000     181.000000
AVG_TRANSFER_USD          113.967324    321.032872   0.003    3550.870435
USD_TRANSFER_STDDEV       170.694452    512.776018   0.000    4464.483052
DEX_EVENTS                  1.933162      2.592745   0.000      16.000000
GAMES_EVENTS               16.904884     45.845198   0.000     191.000000
CEX_EVENTS                  1.316