In [76]:
import json
import pandas as pd
from collections import Counter
import requests

In [None]:
# Load the raw community assignments written by run 142. The parsed object is
# indexed by algorithm name below; only the 'louvain' entry is used here.
# An explicit encoding keeps the read independent of the platform default,
# matching how the other JSON files in this notebook are opened.
with open('output/run142-NQv0-best_run/run142_raw_communities.json', 'r', encoding='utf-8') as f:
    communities = json.load(f)

communities_list = communities['louvain']
print(len(communities_list))

Loaded 4 communities


In [None]:
# Per-article category records, read with lines=True (one JSON object per line).
categories_path = '../data/categories.jsonl'
categories = pd.read_json(categories_path, lines=True)

In [None]:
# Parse the exported node records; the 'utf-8-sig' codec drops a leading BOM
# when one is present.
with open('../data/v0.0/nodes.json', mode='r', encoding='utf-8-sig') as nodes_file:
    data = json.loads(nodes_file.read())

# Flatten nested objects into dotted column names
# (later cells use 'd.identity' and 'd.properties.title_encode').
nodes = pd.json_normalize(data)

In [26]:
# Attach each article's graph node id by matching the article title against the
# node's encoded title. A left join keeps every category row; rows without a
# matching node get a missing 'd.identity'.
node_lookup = nodes[["d.identity", "d.properties.title_encode"]]
categories_merged = pd.merge(
    categories,
    node_lookup,
    how="left",
    left_on="title",
    right_on="d.properties.title_encode",
).drop(columns=["d.properties.title_encode"])  # join key duplicates 'title'

categories_merged.head()

Unnamed: 0,example_id,title,categorias,categorias_ocultas,d.identity
0,4549465242785278785,The_Walking_Dead_(season_8),"[2017 American television seasons, The Walking...",[Official website different in Wikidata and Wi...,700
1,-2543388002166163252,Persephone,"[Greek goddesses, Queens in Greek mythology, L...",[Articles containing Mycenaean Greek-language ...,6930
2,5985355041383167183,Colony_(biology),"[Community ecology, Microbiology terms, Habita...",[],6931
3,-2975172535563055798,The_Man_in_the_High_Castle_(TV_series),"[2010s American drama television series, 2015 ...","[CS1 German-language sources (de), Use America...",4733
4,-1052334833502528495,List_of_heads_of_state_of_Nigeria,"[Government of Nigeria, Lists of political off...",[Official website not in Wikidata],6932


In [36]:
# Invert the community listing into {node_id: community index}. If a node id
# appears in more than one community, the later community wins.
community_map = {
    node_id: community_idx
    for community_idx, members in enumerate(communities_list)
    for node_id in members
}

# Rows whose 'd.identity' is not a key of the map end up with a missing community_id.
categories_merged['community_id'] = categories_merged['d.identity'].map(community_map)

assigned_count = categories_merged['community_id'].nunique()
print(f"Comunidades atribuídas: {assigned_count}")

Comunidades atribuídas: 24


In [44]:
# --- Count 'categorias' and 'categorias_ocultas' within each community ---
top20_per_community_wiki = {}

for community_idx in range(len(communities_list)):
    in_community = categories_merged['community_id'] == community_idx
    community_rows = categories_merged[in_community]

    # Every non-missing cell of these columns is iterated as a collection of
    # category names, so flatten all rows into one stream before counting.
    visible_counts = Counter(
        name
        for row_cats in community_rows['categorias'].dropna()
        for name in row_cats
    )
    hidden_counts = Counter(
        name
        for row_cats in community_rows['categorias_ocultas'].dropna()
        for name in row_cats
    )

    # Keep the 20 most frequent (name, count) pairs of each kind.
    top20_per_community_wiki[community_idx] = {
        "top_categorias": visible_counts.most_common(20),
        "top_categorias_ocultas": hidden_counts.most_common(20),
    }

In [75]:
# Read the community-keywords JSON file for run 142.
with open('output/run142-NQv0-best_run/run142_community_keywords.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per (algorithm, community); collected in a list so the DataFrame
# is built in a single call.
rows = []

# The inner mapping is deliberately NOT named `communities`: that name already
# holds the raw community assignments loaded earlier in the notebook, and
# reusing it as a loop variable would silently overwrite them.
for algorithm_name, algorithm_communities in data.items():
    for comm_id, comm_data in algorithm_communities.items():
        # Keep only the first element of each keyword entry (the word itself,
        # without its count).
        keywords_list = [kw[0] for kw in comm_data.get('keywords', [])]
        keywords_string = ', '.join(keywords_list)

        rows.append({
            'algorithm': algorithm_name,
            'community_id': comm_id,
            'comm_name': comm_data.get('comm_name', ''),
            'keywords_string': keywords_string,
            'num_keywords': len(keywords_list)
        })

# Build the DataFrame
df = pd.DataFrame(rows)

print(df)

     algorithm community_id                            comm_name  \
0      infomap            0  Countries, History, European, World   
1      infomap            1            States, United, New, York   
2      infomap            2         Cell, System, Human, Disease   
3      infomap            3    Computer, Software, Data, Windows   
4      infomap            4    Church, Catholic, Bible, Biblical   
...        ...          ...                                  ...   
1327   k-means           35           Engine, Ford, Car, Vehicle   
1328   k-means           36           Star, Film, Series, Comics   
1329   k-means           37               Girl, Film, She, Woman   
1330   k-means           38             States, United, Act, Law   
1331   k-means           39    School, University, High, College   

                                        keywords_string  num_keywords  
0     countries, history, european, world, economy, ...            10  
1     states, united, new, york, state,

In [134]:
import requests
import pandas as pd

def categorize_with_ollama(keywords_string, model="gpt-oss:20b",
                           url="http://localhost:11434/api/generate", timeout=3000):
    """Ask an Ollama server for a short category name describing a keyword list.

    Args:
        keywords_string: Keywords describing one community, as a single string
            (this notebook passes a comma-separated list).
        model: Name of the model to query.
        url: Endpoint that receives the generate request. Defaults to the
            local address this notebook has always used.
        timeout: Value forwarded to ``requests.post(timeout=...)``, in seconds.

    Returns:
        The ``"response"`` field of the JSON reply, stripped and lower-cased
        (an empty string when the field is absent), or the sentinel ``"erro"``
        when the request or the decoding of its reply fails.
    """
    prompt = f"""
You are a precise classifier. Analyze the following keywords produced by a community detection algorithm and assign a short, meaningful category name (2–4 words maximum).

Keywords:
{keywords_string}

Return only the answer — no explanations, no text before or after.
"""

    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        # Fixed seed with top_k=1 / temperature=0 — presumably to make the
        # answers repeatable across runs.
        "options": {
            "seed": 42,
            "top_k": 1,
            "top_p": 1,
            "temperature": 0
        }
    }

    headers = {"Content-Type": "application/json"}

    try:
        resp = requests.post(url, headers=headers, json=data, timeout=timeout)
        resp.raise_for_status()
        result = resp.json()
        return result.get("response", "").strip().lower()
    except Exception as e:
        # Deliberately broad: any failure is reported and turned into the
        # "erro" sentinel, which the notebook later filters on to retry.
        print(f"Erro ao categorizar: {e}")
        return "erro"

In [111]:
df.iloc[0]['keywords_string']

'middle, earth, rings, lord, film, series, hobbit, ring, king, tolkien'

In [None]:
categorize_with_ollama('countries, history, european, world, economy, national, republic, flag, union, languages')

In [119]:
# Drop every row produced by the 'infomap' algorithm.
df = df[df['algorithm'] != 'infomap']
# Drop the 'description' column when it is present. errors='ignore' keeps this
# cell re-runnable: without it, a second execution (or a frame that never had
# the column) raises KeyError.
df = df.drop(columns=['description'], errors='ignore')
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category
1249,louvain,0,"Middle, Earth, Rings, Lord","middle, earth, rings, lord, film, series, hobb...",10,lord of the rings
1250,louvain,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,love songs
1251,louvain,2,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, fi...",10,south asian sports culture
1252,louvain,3,"Football, League, Basketball, Baseball","football, league, basketball, baseball, season...",10,sports leagues
1253,louvain,4,"Vegas, Las, Card, Casino","vegas, las, card, casino, nascar, poker, game,...",10,vegas casino racing
...,...,...,...,...,...,...
1327,k-means,35,"Engine, Ford, Car, Vehicle","engine, ford, car, vehicle, grand, prix, toyot...",10,automotive racing brands
1328,k-means,36,"Star, Film, Series, Comics","star, film, series, comics, wars, trek, season...",10,entertainment media
1329,k-means,37,"Girl, Film, She, Woman","girl, film, she, woman, queen, you, lady, litt...",10,female film characters
1330,k-means,38,"States, United, Act, Law","states, united, act, law, rights, federal, cou...",10,u.s. constitutional law


In [137]:
# Rows whose category is the "erro" sentinel (what categorize_with_ollama
# returns when a request fails). .copy() makes this an independent frame, so
# writing corrected categories into it later does not act on a slice of `df`.
df_correcao = df[df['category'] == 'erro'].copy()
df_correcao

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category


In [135]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

print("\nCategorizando comunidades com Ollama...")

def process_row(idx, row):
    """Categorize a single DataFrame row.

    Args:
        idx: Index label of the row; returned unchanged so the caller can
            write the result back to the right place.
        row: Mapping-like row providing 'algorithm', 'community_id' and
            'keywords_string'.

    Returns:
        The pair ``(idx, category)``.
    """
    label = f"{row['algorithm']} - comunidade {row['community_id']}"
    print(f"Processando {label}...")

    keywords = row['keywords_string']
    category = categorize_with_ollama(keywords)
    time.sleep(1)

    for line in (f"  -> Rows keywords: {keywords}", f"  -> Categoria: {category}"):
        print(line)
    return idx, category

# Work on an independent copy so the per-row writes below never target a
# slice of another DataFrame.
df_correcao = df_correcao.copy()

with ThreadPoolExecutor(max_workers=32) as executor:
    # Map every submitted future to the index label of the row it processes.
    futures = {
        executor.submit(process_row, idx, row): idx
        for idx, row in df_correcao.iterrows()
    }
    total = len(futures)
    processed = 0

    # Results are written back in completion order, not submission order.
    for future in as_completed(futures):
        idx, category = future.result()
        df_correcao.at[idx, 'category'] = category
        processed += 1
        # Report progress every 10 completions and once more at the very end.
        if processed % 10 == 0 or processed == total:
            print(f"Processados {processed}/{total}")

print("\n=== DataFrame Final ===")
print(df_correcao.head(10))



Categorizando comunidades com Ollama...
Processando leiden - comunidade 15...
Processando k-means - comunidade 33...
  -> Rows keywords: fire, art, color, black, glass, human, blue, red, ring, power
  -> Categoria: colorful artifacts
  -> Rows keywords: little, nyah, old, baby, mary, rain, down, man, row, round
  -> Categoria: nursery rhyme
Processados 2/2

=== DataFrame Final ===
     algorithm community_id                comm_name  \
1288    leiden           15  Little, Nyah, Old, Baby   
1325   k-means           33  Fire, Art, Color, Black   

                                        keywords_string  num_keywords  \
1288  little, nyah, old, baby, mary, rain, down, man...            10   
1325  fire, art, color, black, glass, human, blue, r...            10   

                category  
1288       nursery rhyme  
1325  colorful artifacts  


In [136]:
#Atualiza df com as correções
for idx, row in df_correcao.iterrows():
	df.at[idx, 'category'] = row['category']
df.to_json('communities_categorized.json', orient='records', indent=2)

In [138]:
# Persist the categorized communities as an indented JSON list of records.
output_file = 'communities_categorized.json'
df.to_json(output_file, orient='records', indent=2)
print("\nResultados salvos em communities_categorized.json ✅")


Resultados salvos em communities_categorized.json ✅


In [142]:
# Reload the saved categorization and keep only the 'louvain' communities.
df = pd.read_json('communities_categorized.json', orient='records')
# .copy() gives an independent frame: later cells add columns to `df`, which
# should not be done on a filtered slice of another DataFrame.
df = df.loc[df['algorithm'] == 'louvain'].copy()
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category
0,louvain,0,"Middle, Earth, Rings, Lord","middle, earth, rings, lord, film, series, hobb...",10,lord of the rings
1,louvain,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,love songs
2,louvain,2,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, fi...",10,south asian sports culture
3,louvain,3,"Football, League, Basketball, Baseball","football, league, basketball, baseball, season...",10,sports leagues
4,louvain,4,"Vegas, Las, Card, Casino","vegas, las, card, casino, nascar, poker, game,...",10,vegas casino racing
5,louvain,5,"Golf, Open, Championship, Pga","golf, open, championship, pga, tour, cup, club...",10,golf tournaments
6,louvain,6,"Cup, Surname, United, World","cup, surname, united, world, football, kingdom...",10,football clubs and competitions
7,louvain,7,"Top, America, Next, Model","top, america, next, model, cycle, wedding, sea...",10,fashion & apparel
8,louvain,8,"History, Church, Roman, Ancient","history, church, roman, ancient, theory, socia...",10,ancient roman culture
9,louvain,9,"Law, Management, Market, Tax","law, management, market, tax, economics, busin...",10,business finance


In [143]:
top20_per_community_wiki

{0: {'top_categorias': [('The Lord of the Rings characters', 95),
   ('English-language films', 85),
   ('American films', 85),
   ('New Zealand films', 84),
   ('New Line Cinema films', 84),
   ('American epic films', 80),
   ('Fictional-language films', 80),
   ('Film series', 80),
   ('Films directed by Peter Jackson', 80),
   ('The Lord of the Rings (film series)', 75),
   ('High fantasy films', 69),
   ('Film series introduced in 2001', 65),
   ('2000s fantasy films', 65),
   ('The Lord of the Rings', 65),
   ('Allen &amp; Unwin books', 63),
   ('Fictional characters introduced in 1954', 62),
   ('British novels adapted into films', 61),
   ('Sequel novels', 59),
   ('1954 British novels', 56),
   ('Adventure films by series', 54)],
  'top_categorias_ocultas': [('Webarchive template wayback links', 138),
   ('All articles with unsourced statements', 122),
   ('Articles using Infobox character with multiple unlabeled fields', 114),
   ('Good articles', 90),
   ('Use dmy dates from 

In [146]:
def make_prompt(category_llm, top_categories):
    top_cats_str = ", ".join([cat for cat, _ in top_categories])
    prompt = f"""
You are an accurate semantic evaluator. Evaluate the relevance of the following category created by an LLM model relative to a community's top Wikipedia categories.

LLM category: "{category_llm}"
Top Wikipedia categories: "{top_cats_str}"

Return only a number between 0 and 1, where 0 means no semantic relationship and 1 means complete semantic relationship.
"""
    return prompt

In [152]:
def evaluate_correlation(category_llm, top_categories):
    """Ask the local Ollama model how well an LLM category matches Wikipedia categories.

    Args:
        category_llm: Category label produced by the LLM for a community.
        top_categories: Sequence of (category name, count) pairs for the same
            community; forwarded unchanged to ``make_prompt``.

    Returns:
        The model's reply as a stripped, lower-cased string (an empty string
        when the reply has no ``"response"`` field), or the sentinel ``"erro"``
        when the request or the decoding of its reply fails. The value is NOT
        converted to a number here; the notebook does that afterwards with
        ``pd.to_numeric(..., errors='coerce')``.
    """
    prompt = make_prompt(category_llm, top_categories)
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "gpt-oss:20b",
        "prompt": prompt,
        "stream": False,
        # Fixed seed with top_k=1 / temperature=0 — presumably to make the
        # scores repeatable across runs.
        "options": {
            "seed": 42,
            "top_k": 1,
            "top_p": 1,
            "temperature": 0
        }
    }

    try:
        resp = requests.post(url, headers=headers, json=data, timeout=3000)
        resp.raise_for_status()
        result = resp.json()
        return result.get("response", "").strip().lower()
    except Exception as e:
        # Deliberately broad: any failure is reported and mapped to the "erro"
        # sentinel so a single bad request does not abort the scoring loop.
        print(f"Erro ao avaliar correlação: {e}")
        return "erro"

In [153]:
import pandas as pd

# Score every community: compare its LLM-assigned category with the community's
# most frequent Wikipedia categories. Communities with no entry in
# top20_per_community_wiki are evaluated against an empty list.
scores = [
    evaluate_correlation(
        llm_label,
        top20_per_community_wiki.get(comm, {}).get('top_categorias', []),
    )
    for comm, llm_label in zip(df['community_id'], df['category'])
]

df['llm_correlation'] = scores


In [155]:
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category,llm_correlation
0,louvain,0,"Middle, Earth, Rings, Lord","middle, earth, rings, lord, film, series, hobb...",10,lord of the rings,0.95
1,louvain,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,love songs,0.1
2,louvain,2,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, fi...",10,south asian sports culture,0.1
3,louvain,3,"Football, League, Basketball, Baseball","football, league, basketball, baseball, season...",10,sports leagues,0.9
4,louvain,4,"Vegas, Las, Card, Casino","vegas, las, card, casino, nascar, poker, game,...",10,vegas casino racing,0.4
5,louvain,5,"Golf, Open, Championship, Pga","golf, open, championship, pga, tour, cup, club...",10,golf tournaments,0.95
6,louvain,6,"Cup, Surname, United, World","cup, surname, united, world, football, kingdom...",10,football clubs and competitions,0.9
7,louvain,7,"Top, America, Next, Model","top, america, next, model, cycle, wedding, sea...",10,fashion & apparel,0.85
8,louvain,8,"History, Church, Roman, Ancient","history, church, roman, ancient, theory, socia...",10,ancient roman culture,0.4
9,louvain,9,"Law, Management, Market, Tax","law, management, market, tax, economics, busin...",10,business finance,0.9


In [156]:
# Convert the model replies to numbers; replies that cannot be parsed
# (including the "erro" sentinel) become NaN.
raw_correlation = df['llm_correlation']
df['llm_correlation'] = pd.to_numeric(raw_correlation, errors='coerce')

In [157]:
df.describe()

Unnamed: 0,community_id,num_keywords,llm_correlation
count,24.0,24.0,24.0
mean,11.5,10.0,0.670833
std,7.071068,0.0,0.315511
min,0.0,10.0,0.1
25%,5.75,10.0,0.4
50%,11.5,10.0,0.875
75%,17.25,10.0,0.9125
max,23.0,10.0,1.0


In [160]:
threshold = 0.7  # scores at or above this are treated as matching Wikipedia

correlation = df['llm_correlation']
df['llm_match'] = correlation >= threshold
mean_score = correlation.mean()

# Keep the communities whose score is at or above the median score.
median_score = correlation.quantile(0.5)
top_50_percent = df[correlation >= median_score]

print(f"Mean LLM Correlation Score: {mean_score}")
print(f"Top 50% Communities based on LLM Correlation:\n{top_50_percent}")

Mean LLM Correlation Score: 0.6708333333333334
Top 50% Communities based on LLM Correlation:
   algorithm  community_id                               comm_name  \
0    louvain             0              Middle, Earth, Rings, Lord   
3    louvain             3  Football, League, Basketball, Baseball   
5    louvain             5           Golf, Open, Championship, Pga   
6    louvain             6             Cup, Surname, United, World   
9    louvain             9            Law, Management, Market, Tax   
10   louvain            10          Film, Series, Season, Episodes   
12   louvain            12                 Little, Old, Baby, Mary   
13   louvain            13              Gun, Laws, New, California   
17   louvain            17          Season, Vampire, Diaries, Home   
18   louvain            18              States, United, New, State   
20   louvain            20             Dead, Walking, Season, Fear   
21   louvain            21          Office, Season, Michael, Paper 