In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [87]:
# Load the main dataset and the cluster labels, then attach the labels to the
# main data so that every row carries its cluster assignment.
# (pandas is already imported as `pd` in the notebook's import cell.)
main_data_path = '../data/raw_data/new_raw_data_polygon.csv'
cluster_labels_path = '../data/raw_data/interaction_mode_results/main_clustering/hdbscan_results/cluster_labels.csv'

main_df = pd.read_csv(main_data_path)
main_cluster_labels = pd.read_csv(cluster_labels_path)

if 'id' in main_df.columns and 'id' in main_cluster_labels.columns:
    # Preferred path: key-based join. The left join keeps every row of the
    # main dataset; ids without a label get NaN in the label columns.
    merged_main = pd.merge(main_df, main_cluster_labels, on='id', how='left')
else:
    # Fallback: no shared 'id' column, so labels are matched to rows by index.
    # Without a key there is nothing to detect a mismatch, and pandas would
    # silently fill NaN (or drop surplus labels) if the row counts differ,
    # so fail loudly instead.
    if len(main_cluster_labels) != len(main_df):
        raise ValueError(
            "Cannot attach cluster labels by index: main data has "
            f"{len(main_df)} rows but the label file has "
            f"{len(main_cluster_labels)} rows."
        )
    merged_main = main_df.copy()
    merged_main['cluster_label'] = main_cluster_labels['cluster_label']

# merged_main now holds the main data together with the cluster labels.

In [88]:
# Per-cluster summary: for every non-null cluster label, keep the mean, std,
# min and max that DataFrame.describe() reports for that cluster's rows.
# NOTE(review): the original comments described this as a stand-in for a
# feature-analysis function from interaction_mode_features_v2.py — confirm
# whether that function should replace describe() here.
summary_stats = ['mean', 'std', 'min', 'max']
feature_analysis_results = dict()
cluster_ids = sorted(merged_main['cluster_label'].dropna().unique())
for cluster in cluster_ids:
    in_cluster = merged_main['cluster_label'].eq(cluster)
    cluster_df = merged_main.loc[in_cluster]
    # Rows of the result are the described columns; columns are the four stats.
    result = cluster_df.describe().transpose().loc[:, summary_stats]
    feature_analysis_results[cluster] = result

# Stack the per-cluster tables under an outer index level named 'cluster_label'
# so clusters can be compared (and selected with .loc[label]) in one frame.
combined_feature_analysis = pd.concat(feature_analysis_results, names=['cluster_label'])
combined_feature_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,TX_PER_MONTH,20.673600,5.276391e+01,1.00,2.221000e+03
-1,TOKEN_DIVERSITY,4.328756,2.572299e+00,1.00,3.300000e+01
-1,PROTOCOL_DIVERSITY,11.151454,1.558106e+01,1.00,6.750000e+02
-1,TOTAL_TRANSFER_USD,119759.468013,3.557831e+06,0.03,1.671072e+08
-1,INTERACTION_DIVERSITY,9.418417,3.833743e+00,1.00,3.900000e+01
...,...,...,...,...,...
24,FLOTSAM_EVENTS,0.000000,0.000000e+00,0.00,0.000000e+00
24,BRIDGE_OUTFLOW_COUNT,101.991905,2.125126e+03,0.00,7.523600e+04
24,BRIDGE_INFLOW_COUNT,0.000000,0.000000e+00,0.00,0.000000e+00
24,BRIDGE_TOTAL_VOLUME_USD,120443.300143,1.994746e+06,0.00,6.795146e+07


The DataFrame above gives the key summary statistics (mean, std, min, max) of every feature for each cluster.

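The cells below look at the values of the other features in the clusters selected in the output of interaction_mode_cluster_selection_v2.py: clusters 0, 9, 7 and 4 for DEX_EVENTS, CEX_EVENTS, DEFI_EVENTS and BRIDGE_EVENTS respectively. Note: the selections JSON loaded further down in this notebook reports clusters 17, 6, 7 and 15 for the same features, so the IDs used here may be out of date and should be confirmed.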

In [89]:
# Show the feature statistics of cluster 0, inspected here for DEX_EVENTS.
# NOTE(review): the selections JSON printed later in this notebook lists
# cluster 17 for DEX_EVENTS, and the recorded output below shows DEX_EVENTS
# as all zeros for cluster 0 — confirm which cluster id is current.
print(
    "Cluster 0 Feature Analysis:",
    combined_feature_analysis.loc[0],
    sep="\n",
)

Cluster 0 Feature Analysis:
                                 mean           std      min           max
TX_PER_MONTH                14.638436     28.028928   1.0000  4.635000e+02
TOKEN_DIVERSITY              3.765370      2.359394   1.0000  1.600000e+01
PROTOCOL_DIVERSITY           9.075282     16.734157   2.0000  4.020000e+02
TOTAL_TRANSFER_USD       12759.227792  82943.886944   0.0100  1.745577e+06
INTERACTION_DIVERSITY        5.710163      3.836162   1.0000  2.800000e+01
ACTIVE_DURATION_DAYS        99.148055     47.767161  15.0000  1.810000e+02
AVG_TRANSFER_USD           200.306945   1474.088025   0.0008  4.020501e+04
USD_TRANSFER_STDDEV        192.503040   1216.585305   0.0000  3.008555e+04
DEX_EVENTS                   0.000000      0.000000   0.0000  0.000000e+00
GAMES_EVENTS                 4.107905     21.494148   0.0000  1.840000e+02
CEX_EVENTS                   0.000000      0.000000   0.0000  0.000000e+00
DAPP_EVENTS                  0.357591      5.058091   0.0000  1.380000e+

In [90]:
# Show the feature statistics of cluster 9, inspected here for CEX_EVENTS.
# NOTE(review): the selections JSON printed later in this notebook lists
# cluster 6 for CEX_EVENTS — confirm which cluster id is current.
print(
    "Cluster 9 Feature Analysis:",
    combined_feature_analysis.loc[9],
    sep="\n",
)

Cluster 9 Feature Analysis:
                                mean           std     min            max
TX_PER_MONTH               24.781063     94.519772   1.000    1484.833333
TOKEN_DIVERSITY             5.822622     16.759423   1.000     326.000000
PROTOCOL_DIVERSITY         14.190231     28.603489   2.000     358.000000
TOTAL_TRANSFER_USD       8691.690977  30991.783835   0.060  364022.360000
INTERACTION_DIVERSITY       7.398458      4.360448   1.000      33.000000
ACTIVE_DURATION_DAYS      112.000000     50.306842  15.000     181.000000
AVG_TRANSFER_USD          113.967324    321.032872   0.003    3550.870435
USD_TRANSFER_STDDEV       170.694452    512.776018   0.000    4464.483052
DEX_EVENTS                  1.933162      2.592745   0.000      16.000000
GAMES_EVENTS               16.904884     45.845198   0.000     191.000000
CEX_EVENTS                  1.316195      2.255639   0.000      10.000000
DAPP_EVENTS                 0.491003      3.037234   0.000      54.000000
CHADMIN_EV

In [91]:
# Show the feature statistics of cluster 7, inspected here for DEFI_EVENTS
# (the selections JSON printed later in this notebook also lists cluster 7).
print(
    "Cluster 7 Feature Analysis:",
    combined_feature_analysis.loc[7],
    sep="\n",
)

Cluster 7 Feature Analysis:
                                 mean           std         min            max
TX_PER_MONTH                18.120940     12.884763    2.166667      79.666667
TOKEN_DIVERSITY              3.866667      1.159588    3.000000      13.000000
PROTOCOL_DIVERSITY          11.238462      3.769320    5.000000      48.000000
TOTAL_TRANSFER_USD       15270.348282  26978.205714  119.200000  278686.420000
INTERACTION_DIVERSITY       10.958974      2.003431    4.000000      16.000000
ACTIVE_DURATION_DAYS       103.341026     37.187693   15.000000     180.000000
AVG_TRANSFER_USD            75.964758    197.304339    0.847545    2544.557500
USD_TRANSFER_STDDEV        195.413735    398.651670    1.394417    4000.705466
DEX_EVENTS                  12.882051      8.704898    3.000000      37.000000
GAMES_EVENTS                 0.000000      0.000000    0.000000       0.000000
CEX_EVENTS                   0.000000      0.000000    0.000000       0.000000
DAPP_EVENTS             

In [92]:
# Show the feature statistics of cluster 4, inspected here for BRIDGE_EVENTS.
# NOTE(review): the selections JSON printed later in this notebook lists
# cluster 15 for BRIDGE_EVENTS — confirm which cluster id is current.
print(
    "Cluster 4 Feature Analysis:",
    combined_feature_analysis.loc[4],
    sep="\n",
)

Cluster 4 Feature Analysis:
                                mean           std        min            max
TX_PER_MONTH               10.910256     20.437675   1.000000     143.166667
TOKEN_DIVERSITY             3.062937      2.870826   1.000000      25.000000
PROTOCOL_DIVERSITY          7.867133     12.531637   2.000000     160.000000
TOTAL_TRANSFER_USD       8271.577483  26756.645043   0.200000  190509.820000
INTERACTION_DIVERSITY       5.374126      4.289918   1.000000      21.000000
ACTIVE_DURATION_DAYS       90.961538     58.816592  15.000000     181.000000
AVG_TRANSFER_USD          226.200797    898.208291   0.033333   11928.565333
USD_TRANSFER_STDDEV       298.378119   1453.005936   0.000000   20940.835011
DEX_EVENTS                  0.000000      0.000000   0.000000       0.000000
GAMES_EVENTS                0.513986      7.304755   0.000000     123.000000
CEX_EVENTS                  0.000000      0.000000   0.000000       0.000000
DAPP_EVENTS                 0.594406      8.8775

In [93]:
# Load the per-feature cluster selections from
# interaction_mode_cluster_selections_v2.json and print the summary
# statistics of each selected cluster from combined_feature_analysis.
import json

cluster_selections_path = '../data/processed_data/interaction_mode_cluster_selections_v2.json'
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode the file on some systems.
with open(cluster_selections_path, 'r', encoding='utf-8') as f:
    cluster_selections = json.load(f)

# Map each feature name to the cluster selected for it in the 'main' dataset.
selected_clusters = {
    feature: info['selected_cluster']
    for feature, info in cluster_selections['datasets']['main']['feature_selections'].items()
}
print('Selected clusters for each feature:', selected_clusters)

# Cluster labels present in the summary table; looked up once, outside the loop.
available_clusters = combined_feature_analysis.index.get_level_values('cluster_label')

# Print the statistics table of every selected cluster, or a notice when the
# selected label does not appear in combined_feature_analysis.
for feature, cluster_label in selected_clusters.items():
    print(f'\nFeature: {feature} | Selected Cluster: {cluster_label}')
    if cluster_label in available_clusters:
        print(combined_feature_analysis.loc[cluster_label])
    else:
        print(f'Cluster {cluster_label} not found in combined_feature_analysis.')

Selected clusters for each feature: {'DEX_EVENTS': 17, 'CEX_EVENTS': 6, 'DEFI_EVENTS': 7, 'BRIDGE_EVENTS': 15}

Feature: DEX_EVENTS | Selected Cluster: 17
                                 mean            std        min           max
TX_PER_MONTH                 7.519144      13.532809   1.000000  1.076667e+02
TOKEN_DIVERSITY              3.628378       1.983937   1.000000  1.400000e+01
PROTOCOL_DIVERSITY           7.949324       5.008549   2.000000  5.800000e+01
TOTAL_TRANSFER_USD       26918.912365  347676.626779   0.040000  5.978658e+06
INTERACTION_DIVERSITY        6.692568       2.662370   2.000000  2.000000e+01
ACTIVE_DURATION_DAYS        79.912162      46.489208  15.000000  1.810000e+02
AVG_TRANSFER_USD           476.656709    1992.588691   0.001373  2.959732e+04
USD_TRANSFER_STDDEV       1547.801546   21385.577625   0.000000  3.677216e+05
DEX_EVENTS                   0.777027       0.775217   0.000000  2.000000e+00
GAMES_EVENTS                10.750000      36.585238   0.000000  

In [94]:
# --- Integrate clustered data from interaction_mode_results and produce feature analysis for selected clusters ---
# This cell reloads its inputs from disk, so it rebinds main_cluster_labels,
# main_df, merged_main, cluster_selections, selected_clusters and
# feature_analysis_results. After it runs, feature_analysis_results is keyed
# by feature name (the earlier per-cluster cell keyed it by cluster label).

# 1. Load cluster labels from interaction_mode_results for the main dataset
import os
cluster_labels_path = '../data/raw_data/interaction_mode_results/main_clustering/hdbscan_results/cluster_labels.csv'
main_cluster_labels = pd.read_csv(cluster_labels_path)

# 2. Load the main dataset
main_data_path = '../data/raw_data/new_raw_data_polygon.csv'
main_df = pd.read_csv(main_data_path)

# 3. Merge cluster labels with main dataset (on 'id' if present, else by index)
if 'id' in main_df.columns and 'id' in main_cluster_labels.columns:
    # Left join keeps every main row; ids without a label get NaN.
    merged_main = pd.merge(main_df, main_cluster_labels, on='id', how='left')
else:
    # Index-based fallback: with no key to join on, a row-count mismatch
    # would silently yield NaN or dropped labels, so fail loudly instead.
    if len(main_cluster_labels) != len(main_df):
        raise ValueError(
            "Cannot attach cluster labels by index: main data has "
            f"{len(main_df)} rows but the label file has "
            f"{len(main_cluster_labels)} rows."
        )
    merged_main = main_df.copy()
    merged_main['cluster_label'] = main_cluster_labels['cluster_label']

# 4. Load cluster selections from interaction_mode_cluster_selections_v2.json
import json
cluster_selections_path = '../data/processed_data/interaction_mode_cluster_selections_v2.json'
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode the file on some systems.
with open(cluster_selections_path, 'r', encoding='utf-8') as f:
    cluster_selections = json.load(f)
# Map each feature name to the cluster selected for it in the 'main' dataset.
selected_clusters = {
    feature: info['selected_cluster']
    for feature, info in cluster_selections['datasets']['main']['feature_selections'].items()
}
print('Selected clusters for each feature:', selected_clusters)

# 5. For each selected cluster, produce feature analysis for that cluster
feature_analysis_results = {}
for feature, cluster_label in selected_clusters.items():
    cluster_df = merged_main[merged_main['cluster_label'] == cluster_label]
    # mean/std/min/max of the columns that describe() reports for this cluster
    result = cluster_df.describe().T[['mean', 'std', 'min', 'max']]
    feature_analysis_results[feature] = result
    print(f'\nFeature: {feature} | Selected Cluster: {cluster_label}')
    print(result)

Selected clusters for each feature: {'DEX_EVENTS': 17, 'CEX_EVENTS': 6, 'DEFI_EVENTS': 7, 'BRIDGE_EVENTS': 15}

Feature: DEX_EVENTS | Selected Cluster: 17
                                 mean            std        min           max
TX_PER_MONTH                 7.519144      13.532809   1.000000  1.076667e+02
TOKEN_DIVERSITY              3.628378       1.983937   1.000000  1.400000e+01
PROTOCOL_DIVERSITY           7.949324       5.008549   2.000000  5.800000e+01
TOTAL_TRANSFER_USD       26918.912365  347676.626779   0.040000  5.978658e+06
INTERACTION_DIVERSITY        6.692568       2.662370   2.000000  2.000000e+01
ACTIVE_DURATION_DAYS        79.912162      46.489208  15.000000  1.810000e+02
AVG_TRANSFER_USD           476.656709    1992.588691   0.001373  2.959732e+04
USD_TRANSFER_STDDEV       1547.801546   21385.577625   0.000000  3.677216e+05
DEX_EVENTS                   0.777027       0.775217   0.000000  2.000000e+00
GAMES_EVENTS                10.750000      36.585238   0.000000  