In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Load the graph from the GML file expected in the working directory.
GRAPH_PATH = "GraphMissingEdges.gml"
G = nx.read_gml(GRAPH_PATH)

# Plot styling: apply the matplotlib style sheet first, then the seaborn
# palette (the order is kept because both touch the default color cycle).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report banner: title line followed by a 50-character rule.
print("üîç AN√ÅLISE ESTRUTURAL DO GRAFO", "=" * 50, sep="\n")

# 1. CONNECTED COMPONENTS
# The components are enumerated a single time and every statistic below
# (count, largest component, size ranking) is derived from that one list,
# instead of traversing the graph once per statistic.
print("\n1. COMPONENTES CONECTADOS:")
components = sorted(nx.connected_components(G), key=len, reverse=True)
num_components = len(components)
# Largest component as a set of node labels. The sort is stable, so ties are
# resolved in favour of the component encountered first, as max(..., key=len)
# would do.
largest_cc = components[0]
print(f"N√∫mero de componentes conectados: {num_components}")
print(f"Tamanho do maior componente: {len(largest_cc)}")
print(f"Percentual do grafo no maior componente: {len(largest_cc)/len(G.nodes)*100:.1f}%")

# Component sizes, largest first (already in that order after the sort above).
component_sizes = [len(c) for c in components]
print(f"Top 5 tamanhos de componentes: {component_sizes[:5]}")

# 2. CENTRALITY METRICS
print("\n2. AN√ÅLISE DE CENTRALIDADE:")
degree_centrality = nx.degree_centrality(G)

# Betweenness is estimated from a sample of at most 1000 source nodes to keep
# the run time manageable on large graphs. The sample size is capped at the
# number of nodes because asking for more pivots than there are nodes raises
# ValueError, and a fixed seed makes the sampled estimate (and therefore the
# ranking printed below) reproducible between runs.
betweenness_sample_size = min(1000, G.number_of_nodes())
betweenness_centrality = nx.betweenness_centrality(G, k=betweenness_sample_size, seed=42)
closeness_centrality = nx.closeness_centrality(G)

# Five most central nodes for each ranking, as (node, score) pairs.
top_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
top_betweenness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 n√≥s por grau de centralidade:")
for node, cent in top_degree:
    print(f"  Node {node}: {cent:.4f}")

print("Top 5 n√≥s por betweenness centrality:")
for node, cent in top_betweenness:
    print(f"  Node {node}: {cent:.4f}")

# 3. BUILD THE NODE DATAFRAME
# One row per node (indexed by node label), one column per node attribute.
df_nodes = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient="index")

# Structural metrics. Rows were created in G.nodes() order, so lists built by
# iterating G.nodes() line up with the rows positionally.
df_nodes["degree"] = [G.degree(n) for n in G.nodes()]
df_nodes["degree_centrality"] = [degree_centrality[n] for n in G.nodes()]
df_nodes["betweenness_centrality"] = [betweenness_centrality[n] for n in G.nodes()]
df_nodes["closeness_centrality"] = [closeness_centrality[n] for n in G.nodes()]

# Coerce the rating attributes to numbers; unparsable values become NaN.
df_nodes["stars"] = pd.to_numeric(df_nodes["stars"], errors="coerce")
df_nodes["reviewCount"] = pd.to_numeric(df_nodes["reviewCount"], errors="coerce")


def _split_categories(raw):
    """Return the stripped, non-empty entries of a comma-separated string.

    Values that are not strings (for example the NaN pandas inserts for a
    node that lacks the attribute) yield an empty list instead of raising.
    """
    if not isinstance(raw, str):
        return []
    return [part.strip() for part in raw.split(",") if part.strip()]


# Parse each category string once and derive both columns from the result.
# The primary category is the first non-empty entry, so a string with a
# leading empty entry no longer produces an empty primary category; nodes
# without any category are labelled "Unknown".
category_lists = df_nodes["categories"].apply(_split_categories)
df_nodes["num_categories"] = category_lists.apply(len)
df_nodes["primary_category"] = category_lists.apply(lambda cats: cats[0] if cats else "Unknown")

# 4. PER-CATEGORY ANALYSIS
print("\n3. AN√ÅLISE POR CATEGORIAS:")

# Statistics requested per column, computed within each primary category.
category_aggregations = {
    'stars': ['count', 'mean', 'std'],
    'reviewCount': ['mean', 'std'],
    'degree': ['mean', 'std'],
    'degree_centrality': 'mean'
}
grouped_by_category = df_nodes.groupby("primary_category")
category_stats = grouped_by_category.agg(category_aggregations).round(3)

# Flatten the two-level (column, statistic) header into "column_statistic".
category_stats.columns = [
    f"{column}_{statistic}" for column, statistic in category_stats.columns
]
# Most populated categories first.
category_stats = category_stats.sort_values(by='stars_count', ascending=False)

top_category_columns = ['stars_count', 'stars_mean', 'reviewCount_mean', 'degree_mean']
print("Top 10 categorias por n√∫mero de estabelecimentos:")
print(category_stats.head(10)[top_category_columns])

# 5. VISUALIZATIONS
# A 2x3 grid of panels summarising the node-level attributes and metrics.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('An√°lises Estruturais do Grafo', fontsize=16, fontweight='bold')

# 5.1 Degree distribution (log-scaled counts)
axes[0,0].hist(df_nodes["degree"], bins=50, alpha=0.7, edgecolor='black')
axes[0,0].set_xlabel('Grau')
axes[0,0].set_ylabel('Frequ√™ncia')
axes[0,0].set_title('Distribui√ß√£o de Graus')
axes[0,0].set_yscale('log')

# 5.2 Star-rating distribution (missing ratings excluded)
axes[0,1].hist(df_nodes["stars"].dropna(), bins=30, alpha=0.7, edgecolor='black')
axes[0,1].set_xlabel('Stars')
axes[0,1].set_ylabel('Frequ√™ncia')
axes[0,1].set_title('Distribui√ß√£o de Avalia√ß√µes (Stars)')

# 5.3 Review count vs. stars (only rows where both values are present)
valid_data = df_nodes.dropna(subset=['stars', 'reviewCount'])
axes[0,2].scatter(valid_data["stars"], valid_data["reviewCount"], alpha=0.5, s=20)
axes[0,2].set_xlabel('Stars')
axes[0,2].set_ylabel('Review Count')
axes[0,2].set_title('Stars vs Review Count')
axes[0,2].set_yscale('log')

# 5.4 Degree vs. stars
axes[1,0].scatter(valid_data["stars"], valid_data["degree"], alpha=0.5, s=20)
axes[1,0].set_xlabel('Stars')
axes[1,0].set_ylabel('Degree')
axes[1,0].set_title('Avalia√ß√£o vs Conectividade')

# 5.5 Degree-centrality distribution
axes[1,1].hist(df_nodes["degree_centrality"], bins=50, alpha=0.7, edgecolor='black')
axes[1,1].set_xlabel('Degree Centrality')
axes[1,1].set_ylabel('Frequ√™ncia')
axes[1,1].set_title('Distribui√ß√£o de Centralidade de Grau')

# 5.6 Number of categories per node
# One unit-wide bin per integer count. The first edge is lowered below 1 when
# smaller counts exist, so nodes with zero categories are drawn instead of
# being silently left out of the histogram; when every node has at least one
# category the edges are the same as range(1, max + 2).
min_categories = int(df_nodes["num_categories"].min())
max_categories = int(df_nodes["num_categories"].max())
category_bins = range(min(1, min_categories), max_categories + 2)
axes[1,2].hist(df_nodes["num_categories"], bins=category_bins,
               alpha=0.7, edgecolor='black')
axes[1,2].set_xlabel('N√∫mero de Categorias')
axes[1,2].set_ylabel('Frequ√™ncia')
axes[1,2].set_title('Distribui√ß√£o do N√∫mero de Categorias')

plt.tight_layout()
plt.show()

# 6. FEATURE-BASED CLUSTERING
print("\n4. AN√ÅLISE DE CLUSTERING:")

# Feature matrix: the four numeric columns, keeping only complete rows.
cluster_features = ['stars', 'reviewCount', 'degree', 'num_categories']
cluster_data = df_nodes[cluster_features].dropna()

# Standardise the features (fit the scaler, then transform the same rows).
scaler = StandardScaler()
cluster_data_scaled = scaler.fit(cluster_data).transform(cluster_data)

# K-means with five clusters; the labels are read from the fitted model.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
cluster_labels = kmeans.fit(cluster_data_scaled).labels_
cluster_data = cluster_data.assign(cluster=cluster_labels)

# Per-cluster profile: size (count of stars) and feature means.
cluster_profile_spec = {
    'stars': ['count', 'mean'],
    'reviewCount': 'mean',
    'degree': 'mean',
    'num_categories': 'mean'
}
cluster_analysis = cluster_data.groupby('cluster').agg(cluster_profile_spec).round(3)

print("Caracter√≠sticas dos clusters:")
print(cluster_analysis)

# 7. CLUSTER VISUALISATION
# Two panels side by side: a scatter coloured by cluster and a box plot.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
scatter_ax, box_ax = axes

# Left panel: stars vs. review count (log scale), one colour per cluster label.
scatter = scatter_ax.scatter(
    cluster_data['stars'],
    cluster_data['reviewCount'],
    c=cluster_data['cluster'],
    cmap='viridis',
    alpha=0.6,
    s=30,
)
scatter_ax.set_xlabel('Stars')
scatter_ax.set_ylabel('Review Count')
scatter_ax.set_title('Clusters por Stars e Review Count')
scatter_ax.set_yscale('log')
fig.colorbar(scatter, ax=scatter_ax)

# Right panel: distribution of stars within each cluster. The title and
# x-label are set after the pandas call so they replace the ones it writes.
cluster_data.boxplot(column='stars', by='cluster', ax=box_ax)
box_ax.set_title('Distribui√ß√£o de Stars por Cluster')
box_ax.set_xlabel('Cluster')

fig.suptitle('An√°lise de Clusters', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

# 8. ASSORTATIVITY ANALYSIS
print("\n5. AN√ÅLISE DE ASSORTATIVIDADE:")

# Degree assortativity.
degree_assortativity = nx.degree_assortativity_coefficient(G)
print(f"Coeficiente de assortatividade por grau: {degree_assortativity:.4f}")

# Assortativity over the categorical attribute "primary_category".
# This step stays best-effort (a failure does not abort the report), but only
# ordinary errors are caught -- KeyboardInterrupt/SystemExit propagate -- and
# the cause is printed rather than discarded.
try:
    # Node label -> primary category, taken directly from the column.
    node_categories = df_nodes["primary_category"].to_dict()
    nx.set_node_attributes(G, node_categories, "primary_category")
    category_assortativity = nx.attribute_assortativity_coefficient(G, "primary_category")
    print(f"Assortatividade por categoria: {category_assortativity:.4f}")
except Exception as exc:
    print(f"N√£o foi poss√≠vel calcular assortatividade por categoria: {exc!r}")

# 9. SMALL-WORLD ANALYSIS
print("\n6. PROPRIEDADES DE SMALL WORLD:")
# The average shortest path length is only defined on a connected graph, so a
# disconnected graph is represented by its largest connected component.
# `reference_graph` records which graph the path length was measured on.
if nx.is_connected(G):
    reference_graph = G
    avg_path_length = nx.average_shortest_path_length(G)
    print(f"Comprimento m√©dio do caminho: {avg_path_length:.3f}")
else:
    largest_cc_subgraph = G.subgraph(largest_cc)
    reference_graph = largest_cc_subgraph
    avg_path_length = nx.average_shortest_path_length(largest_cc_subgraph)
    print(f"Comprimento m√©dio do caminho (maior componente): {avg_path_length:.3f}")

# Average clustering of the whole graph (this is the value reported here and
# in the executive summary).
avg_clustering = nx.average_clustering(G)
print(f"Coeficiente de clustering m√©dio: {avg_clustering:.4f}")

# Small-world coefficient (approximation): compare clustering and path length
# against the values expected for a random graph of the same size and mean
# degree. Every ingredient is taken from `reference_graph`, so the path length
# of the largest component is never combined with the node count, edge count
# or clustering of the full, disconnected graph.
n = reference_graph.number_of_nodes()
m = reference_graph.number_of_edges()
avg_degree = 2 * m / n if n else 0.0
if reference_graph is G:
    reference_clustering = avg_clustering
else:
    reference_clustering = nx.average_clustering(reference_graph)

if n > 1 and avg_degree > 1 and avg_path_length > 0:
    # Expected clustering in a random graph
    random_clustering = avg_degree / (n - 1)
    # Expected path length in a random graph
    random_path_length = np.log(n) / np.log(avg_degree)
    small_world_coeff = (reference_clustering / random_clustering) / (avg_path_length / random_path_length)
else:
    # With a mean degree of 1 or less log(avg_degree) is zero or negative, and
    # with a single node the baselines divide by zero: report NaN instead of
    # raising or printing a meaningless number.
    random_clustering = random_path_length = small_world_coeff = float("nan")
print(f"Coeficiente Small World (œÉ): {small_world_coeff:.3f}")
print("Valores œÉ > 1 indicam propriedades de small world")

# 10. DENSITY ANALYSIS (WHOLE GRAPH AND PER CATEGORY)
print("\n7. AN√ÅLISE DE DENSIDADE:")
overall_density = nx.density(G)
print(f"Densidade global do grafo: {overall_density:.6f}")

# The five most frequent primary categories.
primary_category_counts = df_nodes['primary_category'].value_counts()
top_categories = primary_category_counts.head(5).index

# Density of the subgraph induced by the nodes of each of those categories.
for category in top_categories:
    in_category = df_nodes['primary_category'] == category
    cat_nodes = df_nodes.loc[in_category].index
    subgraph = G.subgraph(cat_nodes)
    # Subgraphs with fewer than two nodes are skipped.
    if subgraph.number_of_nodes() <= 1:
        continue
    density = nx.density(subgraph)
    print(f"Densidade para '{category}': {density:.6f}")

# 11. HUBS AND AUTHORITIES (HITS)
print("\n8. AN√ÅLISE DE HUBS E AUTORIDADES:")
hubs, authorities = nx.hits(G, max_iter=100)


def _score_of(entry):
    """Return the score of a (node, score) pair; used as the sort key."""
    return entry[1]


# Five best-scoring nodes of each kind, as (node, score) pairs.
top_hubs = sorted(hubs.items(), key=_score_of, reverse=True)[:5]
top_authorities = sorted(authorities.items(), key=_score_of, reverse=True)[:5]

# Both rankings are printed in the same format: a heading, then one line per
# node with its score, star rating and primary category.
for heading, ranking in (("Top 5 Hubs:", top_hubs), ("Top 5 Authorities:", top_authorities)):
    print(heading)
    for node, score in ranking:
        node_info = df_nodes.loc[node]
        print(f"  Node {node}: {score:.4f} | Stars: {node_info['stars']:.1f} | Category: {node_info['primary_category']}")

# EXECUTIVE SUMMARY
# The report lines are collected first and written with a single print call,
# one line each, framed by 80-character rules.
summary_lines = [
    "\n" + "="*80,
    "üéØ RESUMO EXECUTIVO:",
    "="*80,
    f"‚Ä¢ Grafo com {len(G.nodes())} n√≥s e {len(G.edges())} arestas",
    f"‚Ä¢ {num_components} componentes, maior com {len(largest_cc)/len(G.nodes)*100:.1f}% dos n√≥s",
    f"‚Ä¢ Densidade: {overall_density:.6f} (grafo esparso)",
    f"‚Ä¢ Clustering m√©dio: {avg_clustering:.4f}",
    f"‚Ä¢ Coeficiente Small World: {small_world_coeff:.3f}",
    f"‚Ä¢ Assortatividade por grau: {degree_assortativity:.4f}",
    f"‚Ä¢ {len(df_nodes['primary_category'].unique())} categorias principais identificadas",
    "="*80,
]
print(*summary_lines, sep="\n")

ModuleNotFoundError: No module named 'plotly'