In [1]:
import json
import pandas as pd
from collections import Counter
import requests

In [2]:
# Load the raw community assignments written by run 142.
# The file is opened explicitly as UTF-8 so the read does not depend on the
# platform's default encoding (the keywords file of the same run is read the
# same way further down).
with open('output/run142-NQv0-best_run/run142_raw_communities.json', 'r', encoding='utf-8') as f:
    communities = json.load(f)

# Only the entry stored under the 'leiden' key is analysed in this notebook;
# each of its elements is later iterated as a collection of node ids.
communities_list = communities['leiden']
print(len(communities_list))

19


In [3]:
# categories.jsonl is parsed as JSON Lines (lines=True): one record per line.
categories = pd.read_json('./categories.jsonl', lines=True)

In [4]:
# 'utf-8-sig' strips a leading byte-order mark, if the file carries one,
# before the JSON is parsed.
with open('../community_analysis/nodes.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)

# Flatten the nested records into dotted column names; the merge further down
# uses the resulting "d.identity" and "d.properties.title_encode" columns.
nodes = pd.json_normalize(data)

In [5]:
# Attach the node identifier column ("d.identity") to every category row by
# matching the row's title against the node's encoded title. The left join
# keeps every category row, matched or not; the helper join column is dropped
# again once the ids are in place.
node_keys = nodes[["d.identity", "d.properties.title_encode"]]
categories_merged = (
    categories
    .merge(node_keys, how="left", left_on="title", right_on="d.properties.title_encode")
    .drop(columns=["d.properties.title_encode"])
)
categories_merged.head()

Unnamed: 0,example_id,title,categorias,categorias_ocultas,d.identity
0,4549465242785278785,The_Walking_Dead_(season_8),"[2017 American television seasons, The Walking...",[Official website different in Wikidata and Wi...,700
1,-2543388002166163252,Persephone,"[Greek goddesses, Queens in Greek mythology, L...",[Articles containing Mycenaean Greek-language ...,6930
2,5985355041383167183,Colony_(biology),"[Community ecology, Microbiology terms, Habita...",[],6931
3,-2975172535563055798,The_Man_in_the_High_Castle_(TV_series),"[2010s American drama television series, 2015 ...","[CS1 German-language sources (de), Use America...",4733
4,-1052334833502528495,List_of_heads_of_state_of_Nigeria,"[Government of Nigeria, Lists of political off...",[Official website not in Wikidata],6932


In [6]:
# Invert the community listing into a node-id -> community-index lookup.
# If a node id is listed in more than one community, the last listing wins.
community_map = {
    node_id: community_index
    for community_index, members in enumerate(communities_list)
    for node_id in members
}

# Rows whose "d.identity" is not a key of the lookup end up with NaN.
categories_merged['community_id'] = categories_merged['d.identity'].map(community_map)

assigned_count = categories_merged['community_id'].nunique()
print(f"Comunidades atribuídas: {assigned_count}")

Comunidades atribuídas: 19


In [7]:
categories_merged.head()

Unnamed: 0,example_id,title,categorias,categorias_ocultas,d.identity,community_id
0,4549465242785278785,The_Walking_Dead_(season_8),"[2017 American television seasons, The Walking...",[Official website different in Wikidata and Wi...,700,0.0
1,-2543388002166163252,Persephone,"[Greek goddesses, Queens in Greek mythology, L...",[Articles containing Mycenaean Greek-language ...,6930,6.0
2,5985355041383167183,Colony_(biology),"[Community ecology, Microbiology terms, Habita...",[],6931,4.0
3,-2975172535563055798,The_Man_in_the_High_Castle_(TV_series),"[2010s American drama television series, 2015 ...","[CS1 German-language sources (de), Use America...",4733,0.0
4,-1052334833502528495,List_of_heads_of_state_of_Nigeria,"[Government of Nigeria, Lists of political off...",[Official website not in Wikidata],6932,2.0


In [8]:
# --- Count 'categorias' and 'categorias_ocultas' within each community ---
def _most_common_entries(list_column, limit=20):
    """Flatten a column whose cells are lists and return its `limit` most frequent items.

    Missing cells are skipped; the result is a list of (item, count) pairs as
    produced by ``Counter.most_common``.
    """
    flattened = (entry for entries in list_column.dropna() for entry in entries)
    return Counter(flattened).most_common(limit)


top20_per_community_wiki = {}
for community_index in range(len(communities_list)):
    # Rows assigned to the current community.
    members = categories_merged.loc[categories_merged['community_id'] == community_index]
    top20_per_community_wiki[community_index] = {
        "top_categorias": _most_common_entries(members['categorias']),
        "top_categorias_ocultas": _most_common_entries(members['categorias_ocultas']),
    }

In [9]:
top20_per_community_wiki

{0: {'top_categorias': [('English-language films', 16201),
   ('American films', 14439),
   ('English-language television programs', 9363),
   ('Living people', 7636),
   ('IMAX films', 3116),
   ('American male film actors', 2877),
   ('American male television actors', 2833),
   ('American sequel films', 2627),
   ('American television actresses', 2608),
   ('American film actresses', 2503),
   ('2010s American drama television series', 2402),
   ('2017 films', 2221),
   ('21st-century American actresses', 2194),
   ('Films featuring anthropomorphic characters', 2160),
   ('21st-century American male actors', 2101),
   ('20th-century American male actors', 2098),
   ('2017 American television seasons', 1994),
   ('2010s American television series', 1989),
   ('British films', 1931),
   ('20th-century American actresses', 1890)],
  'top_categorias_ocultas': [('All articles with unsourced statements', 16600),
   ('Wikipedia articles with VIAF identifiers', 13254),
   ('Wikipedia articl

In [10]:
# Read the community-keywords file of the same run.
with open('output/run142-NQv0-best_run/run142_community_keywords.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per (algorithm, community) pair; turned into a DataFrame below.
rows = []

# The inner mapping is deliberately NOT bound to the name `communities`:
# that name already holds the raw community assignments loaded earlier in the
# notebook, and rebinding it here would silently change what it refers to.
for algorithm_name, algorithm_communities in data.items():
    for comm_id, comm_data in algorithm_communities.items():
        # Keep only the first element of every keyword entry (the word itself,
        # without its count).
        keywords_list = [kw[0] for kw in comm_data.get('keywords', [])]
        keywords_string = ', '.join(keywords_list)

        rows.append({
            'algorithm': algorithm_name,
            # Taken as-is from the JSON object key.
            'community_id': comm_id,
            'comm_name': comm_data.get('comm_name', ''),
            'keywords_string': keywords_string,
            'num_keywords': len(keywords_list)
        })

# Build the DataFrame in one go from the collected records.
df = pd.DataFrame(rows)

print(df)

     algorithm community_id                            comm_name  \
0      infomap            0  Countries, History, European, World   
1      infomap            1            States, United, New, York   
2      infomap            2         Cell, System, Human, Disease   
3      infomap            3    Computer, Software, Data, Windows   
4      infomap            4    Church, Catholic, Bible, Biblical   
...        ...          ...                                  ...   
1327   k-means           35           Engine, Ford, Car, Vehicle   
1328   k-means           36           Star, Film, Series, Comics   
1329   k-means           37               Girl, Film, She, Woman   
1330   k-means           38             States, United, Act, Law   
1331   k-means           39    School, University, High, College   

                                        keywords_string  num_keywords  
0     countries, history, european, world, economy, ...            10  
1     states, united, new, york, state,

In [11]:
import requests
import pandas as pd

def categorize_with_ollama(keywords_string, model="mistral",
                           url="http://localhost:11434/api/generate", timeout=3000):
    """Ask an Ollama model for 1-3 short category names describing a keyword list.

    Parameters
    ----------
    keywords_string : str
        Keywords inserted verbatim into the prompt.
    model : str
        Model name sent in the request payload.
    url : str
        Generate endpoint to POST to. Defaults to the local endpoint that was
        previously hard-coded.
    timeout : float
        Request timeout in seconds, passed to ``requests.post``. Defaults to
        the previously hard-coded 3000.

    Returns
    -------
    str
        The ``"response"`` field of the JSON reply, stripped and lower-cased,
        or the sentinel ``"erro"`` when anything goes wrong (the error is
        printed, never raised).
    """
    prompt = f"""
You are a precise classifier. Analyze the following keywords produced by a community detection algorithm and identify the main themes.

Keywords:
{keywords_string}

Task: Generate between 1 and 3 category names that best describe the community. Each category should be 2-4 words maximum.
- Use 1 category if the keywords represent a single coherent theme
- Use 2-3 categories if multiple distinct but related themes are present

Return ONLY the category names separated by semicolons (;). No explanations, no numbering, no extra text.
Example format: "machine learning; artificial intelligence" or "biology"
"""

    # Fixed seed, top_k=1 and temperature 0 are sent on every call; presumably
    # this is meant to keep the model's answers repeatable between runs.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "seed": 42,
            "top_k": 1,
            "top_p": 1,
            "temperature": 0
        }
    }
    headers = {"Content-Type": "application/json"}

    # Deliberately best-effort: a failed request must not abort a batch run,
    # so every failure is reported and mapped to the "erro" sentinel.
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        resp.raise_for_status()
        result = resp.json()
        return result.get("response", "").strip().lower()
    except Exception as e:
        print(f"Erro ao categorizar: {e}")
        return "erro"

In [12]:
df.iloc[0]['keywords_string']

'countries, history, european, world, economy, national, republic, flag, union, languages'

In [13]:
categorize_with_ollama('countries, history, european, world, economy, national, republic, flag, union, languages')

'european history; geopolitics; economy'

In [14]:
# Keep only the rows produced by the 'leiden' algorithm; rows of every other
# algorithm are dropped.
# .copy() makes the result an independent frame, so the 'category' column
# written later is not assigned into a slice of the unfiltered DataFrame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['algorithm'] == 'leiden'].copy()
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords
1273,leiden,0,"Film, Series, Season, Episodes","film, series, season, episodes, characters, ma...",10
1274,leiden,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10
1275,leiden,2,"World, War, South, United","world, war, south, united, history, olympics, ...",10
1276,leiden,3,"States, United, New, State","states, united, new, state, act, york, america...",10
1277,leiden,4,"Cell, History, Human, Cuisine","cell, history, human, cuisine, system, disease...",10
1278,leiden,5,"Power, System, Energy, Engine","power, system, energy, engine, history, ford, ...",10
1279,leiden,6,"History, Church, Roman, Ancient","history, church, roman, ancient, art, god, bib...",10
1280,leiden,7,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, in...",10
1281,leiden,8,"Episodes, Computer, Video, Game","episodes, computer, video, game, software, sys...",10
1282,leiden,9,"Law, Theory, Social, Management","law, theory, social, management, analysis, his...",10


In [15]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

print("\nCategorizando comunidades com Ollama...")

def process_row(idx, row):
    """Categorize a single DataFrame row.

    Parameters
    ----------
    idx : index label of the row, handed back unchanged so the caller can
        match the result to its row.
    row : row object providing 'algorithm', 'community_id' and
        'keywords_string'.

    Returns
    -------
    tuple
        ``(idx, category)`` where ``category`` is whatever
        ``categorize_with_ollama`` returned for the row's keywords.
    """
    print(f"Processando {row['algorithm']} - comunidade {row['community_id']}...")
    category = categorize_with_ollama(row['keywords_string'])
    # One-second pause after each request, kept from the original cell.
    time.sleep(1)
    print(f"  -> Rows keywords: {row['keywords_string']}")
    print(f"  -> Categoria: {category}")
    return idx, category

# index label -> category, filled in as the workers finish (in completion
# order, which is why the label travels with the result).
results = {}
with ThreadPoolExecutor(max_workers=32) as executor:
    futures = {executor.submit(process_row, idx, row): idx for idx, row in df.iterrows()}
    total = len(futures)

    for processed, future in enumerate(as_completed(futures), start=1):
        idx, category = future.result()
        results[idx] = category
        if processed % 10 == 0 or processed == total:
            print(f"Processados {processed}/{total}")

# Attach all categories in one index-aligned step. assign() returns a new
# frame, so nothing is written cell-by-cell into a possibly sliced DataFrame.
df = df.assign(category=pd.Series(results, dtype=object))

print("\n=== DataFrame Final ===")
print(df.head(10))



Categorizando comunidades com Ollama...
Processando leiden - comunidade 0...
Processando leiden - comunidade 1...
Processando leiden - comunidade 2...
Processando leiden - comunidade 3...
Processando leiden - comunidade 4...
Processando leiden - comunidade 5...
Processando leiden - comunidade 6...
Processando leiden - comunidade 7...
Processando leiden - comunidade 8...
Processando leiden - comunidade 9...
Processando leiden - comunidade 10...
Processando leiden - comunidade 11...
Processando leiden - comunidade 12...
Processando leiden - comunidade 13...
Processando leiden - comunidade 14...
Processando leiden - comunidade 15...
Processando leiden - comunidade 16...
Processando leiden - comunidade 17...
Processando leiden - comunidade 18...
  -> Rows keywords: cell, history, human, cuisine, system, disease, food, medical, american, muscle
  -> Categoria: medical history; american cuisine; human health
  -> Rows keywords: states, united, new, state, act, york, american, national, cali

In [16]:
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category
1273,leiden,0,"Film, Series, Season, Episodes","film, series, season, episodes, characters, ma...",10,film & tv content; actors & characters
1274,leiden,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,music & albums; love songs
1275,leiden,2,"World, War, South, United","world, war, south, united, history, olympics, ...",10,global affairs; geography; sports & competitions
1276,leiden,3,"States, United, New, State","states, united, new, state, act, york, america...",10,united states history; american states; nation...
1277,leiden,4,"Cell, History, Human, Cuisine","cell, history, human, cuisine, system, disease...",10,medical history; american cuisine; human health
1278,leiden,5,"Power, System, Energy, Engine","power, system, energy, engine, history, ford, ...",10,power systems; energy sources; historical vehi...
1279,leiden,6,"History, Church, Roman, Ancient","history, church, roman, ancient, art, god, bib...",10,ancient roman art & religion; catholic history...
1280,leiden,7,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, in...",10,sports & culture; south asia; education & tech...
1281,leiden,8,"Episodes, Computer, Video, Game","episodes, computer, video, game, software, sys...",10,computer games history; video game software; n...
1282,leiden,9,"Law, Theory, Social, Management","law, theory, social, management, analysis, his...",10,legal & social studies; management & market an...


In [17]:
# Persist the categorized communities as an indented list of JSON records.
output_path = 'communities_categorized.json'
df.to_json(output_path, orient='records', indent=2)
print(f"\nResultados salvos em {output_path} ✅")


Resultados salvos em communities_categorized.json ✅


In [18]:
# Reload the saved results from disk.
# NOTE(review): judging by the outputs captured in this notebook, this round
# trip also resets the index to 0..n-1 and brings community_id back as a
# number; the integer-keyed lookup into top20_per_community_wiki further down
# appears to rely on that — confirm before skipping this cell.
df = pd.read_json('communities_categorized.json', orient='records')
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category
0,leiden,0,"Film, Series, Season, Episodes","film, series, season, episodes, characters, ma...",10,film & tv content; actors & characters
1,leiden,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,music & albums; love songs
2,leiden,2,"World, War, South, United","world, war, south, united, history, olympics, ...",10,global affairs; geography; sports & competitions
3,leiden,3,"States, United, New, State","states, united, new, state, act, york, america...",10,united states history; american states; nation...
4,leiden,4,"Cell, History, Human, Cuisine","cell, history, human, cuisine, system, disease...",10,medical history; american cuisine; human health
5,leiden,5,"Power, System, Energy, Engine","power, system, energy, engine, history, ford, ...",10,power systems; energy sources; historical vehi...
6,leiden,6,"History, Church, Roman, Ancient","history, church, roman, ancient, art, god, bib...",10,ancient roman art & religion; catholic history...
7,leiden,7,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, in...",10,sports & culture; south asia; education & tech...
8,leiden,8,"Episodes, Computer, Video, Game","episodes, computer, video, game, software, sys...",10,computer games history; video game software; n...
9,leiden,9,"Law, Theory, Social, Management","law, theory, social, management, analysis, his...",10,legal & social studies; management & market an...


In [19]:
top20_per_community_wiki

{0: {'top_categorias': [('English-language films', 16201),
   ('American films', 14439),
   ('English-language television programs', 9363),
   ('Living people', 7636),
   ('IMAX films', 3116),
   ('American male film actors', 2877),
   ('American male television actors', 2833),
   ('American sequel films', 2627),
   ('American television actresses', 2608),
   ('American film actresses', 2503),
   ('2010s American drama television series', 2402),
   ('2017 films', 2221),
   ('21st-century American actresses', 2194),
   ('Films featuring anthropomorphic characters', 2160),
   ('21st-century American male actors', 2101),
   ('20th-century American male actors', 2098),
   ('2017 American television seasons', 1994),
   ('2010s American television series', 1989),
   ('British films', 1931),
   ('20th-century American actresses', 1890)],
  'top_categorias_ocultas': [('All articles with unsourced statements', 16600),
   ('Wikipedia articles with VIAF identifiers', 13254),
   ('Wikipedia articl

In [22]:
def make_prompt(category_llm, top_categories):
    """Build the scoring prompt comparing an LLM label with a community's top entries.

    Parameters
    ----------
    category_llm : label text, inserted verbatim between quotes.
    top_categories : iterable of 2-item entries; only the first item of each
        entry is used, joined with ", ".

    Returns
    -------
    str
        Prompt asking the model for a single number between 0.0 and 1.0.
    """
    names_only = (name for name, _count in top_categories)
    top_cats_str = ", ".join(names_only)
    return f"""
You are an accurate semantic evaluator. Evaluate the relevance of the following category created by an LLM model relative to a community's top keywords.

LLM-generated category: "{category_llm}"
Top keywords from the community: "{top_cats_str}"

Your task is to provide a single floating-point number between 0.0 and 1.0, where:
- 0.0 = no semantic relationship
- 0.3 = weak/tangential relationship
- 0.5 = moderate relationship
- 0.7 = strong relationship
- 1.0 = perfect semantic match

Return ONLY the number, nothing else.

"""

In [23]:
def evaluate_correlation(category_llm, top_categories, model="mistral",
                         url="http://localhost:11434/api/generate", timeout=3000):
    """Ask an Ollama model to score how well an LLM label matches a community's top entries.

    Parameters
    ----------
    category_llm : label text passed to ``make_prompt``.
    top_categories : entries passed to ``make_prompt``.
    model : str
        Model name sent in the request payload (previously hard-coded to
        "mistral").
    url : str
        Generate endpoint to POST to (previously hard-coded, and assigned
        twice).
    timeout : float
        Request timeout in seconds, passed to ``requests.post``.

    Returns
    -------
    str
        The raw ``"response"`` text, stripped and lower-cased. It is NOT
        converted to a number here. On any request or decoding failure the
        error is printed and the sentinel ``"erro"`` is returned.
    """
    prompt = make_prompt(category_llm, top_categories)
    # Same sampling options as the categorization request in this notebook.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "seed": 42,
            "top_k": 1,
            "top_p": 1,
            "temperature": 0
        }
    }
    headers = {"Content-Type": "application/json"}

    # Best-effort on purpose: one failed call must not abort the scoring loop.
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        resp.raise_for_status()
        result = resp.json()
        return result.get("response", "").strip().lower()
    except Exception as e:
        print(f"Erro ao avaliar correlação: {e}")
        return "erro"

In [25]:
import pandas as pd

# Preview, for every community, the LLM label next to the reference categories
# it is going to be scored against. Nothing is scored in this cell.
for _, row in df.iterrows():
    # top20_per_community_wiki is keyed by plain integers; normalising the id
    # keeps the lookup working even when community_id is stored as a string.
    community_id = int(row['community_id'])
    category_llm = row['category']
    top_categories = top20_per_community_wiki.get(community_id, {}).get('top_categorias', [])
    print(f"Evaluating community {community_id} with LLM category '{category_llm}' and top categories {top_categories}")


Evaluating community 0 with LLM category 'film & tv content; actors & characters' and top categories [('English-language films', 16201), ('American films', 14439), ('English-language television programs', 9363), ('Living people', 7636), ('IMAX films', 3116), ('American male film actors', 2877), ('American male television actors', 2833), ('American sequel films', 2627), ('American television actresses', 2608), ('American film actresses', 2503), ('2010s American drama television series', 2402), ('2017 films', 2221), ('21st-century American actresses', 2194), ('Films featuring anthropomorphic characters', 2160), ('21st-century American male actors', 2101), ('20th-century American male actors', 2098), ('2017 American television seasons', 1994), ('2010s American television series', 1989), ('British films', 1931), ('20th-century American actresses', 1890)]
Evaluating community 1 with LLM category 'music & albums; love songs' and top categories [('Billboard Hot 100 number-one singles', 3932),

In [26]:
import pandas as pd

# Score every community's LLM label against its most frequent categories.
scores = []

for _, row in df.iterrows():
    # top20_per_community_wiki is keyed by plain integers; without the cast a
    # string community_id would miss the lookup and the label would silently
    # be scored against an empty category list.
    community_id = int(row['community_id'])
    category_llm = row['category']
    top_categories = top20_per_community_wiki.get(community_id, {}).get('top_categorias', [])

    scores.append(evaluate_correlation(category_llm, top_categories))

# Raw model replies (strings); they are not converted to numbers here.
df['llm_correlation'] = scores


In [27]:
df

Unnamed: 0,algorithm,community_id,comm_name,keywords_string,num_keywords,category,llm_correlation
0,leiden,0,"Film, Series, Season, Episodes","film, series, season, episodes, characters, ma...",10,film & tv content; actors & characters,0.8
1,leiden,1,"Song, You, Love, Album","song, you, love, album, music, don, all, your,...",10,music & albums; love songs,0.6
2,leiden,2,"World, War, South, United","world, war, south, united, history, olympics, ...",10,global affairs; geography; sports & competitions,0.6
3,leiden,3,"States, United, New, State","states, united, new, state, act, york, america...",10,united states history; american states; nation...,0.95
4,leiden,4,"Cell, History, Human, Cuisine","cell, history, human, cuisine, system, disease...",10,medical history; american cuisine; human health,0.3
5,leiden,5,"Power, System, Energy, Engine","power, system, energy, engine, history, ford, ...",10,power systems; energy sources; historical vehi...,0.2
6,leiden,6,"History, Church, Roman, Ancient","history, church, roman, ancient, art, god, bib...",10,ancient roman art & religion; catholic history...,0.2
7,leiden,7,"India, Indian, Cricket, National","india, indian, cricket, national, pakistan, in...",10,sports & culture; south asia; education & tech...,0.4
8,leiden,8,"Episodes, Computer, Video, Game","episodes, computer, video, game, software, sys...",10,computer games history; video game software; n...,0.6
9,leiden,9,"Law, Theory, Social, Management","law, theory, social, management, analysis, his...",10,legal & social studies; management & market an...,0.6


In [28]:
# Convert the raw model replies to numbers. errors='coerce' turns any reply
# that cannot be parsed — including the "erro" sentinel returned by
# evaluate_correlation on failure — into NaN instead of raising.
df['llm_correlation'] = pd.to_numeric(df['llm_correlation'], errors='coerce')

In [29]:
df.describe()

Unnamed: 0,community_id,num_keywords,llm_correlation
count,19.0,19.0,19.0
mean,9.0,10.0,0.581579
std,5.627314,0.0,0.243362
min,0.0,10.0,0.2
25%,4.5,10.0,0.4
50%,9.0,10.0,0.6
75%,13.5,10.0,0.8
max,18.0,10.0,0.95


In [30]:
threshold = 0.7  # scores at or above this are treated as agreeing with Wikipedia

correlation = df['llm_correlation']

# Flag the communities that reach the threshold.
df['llm_match'] = correlation.ge(threshold)
mean_score = correlation.mean()

# "Top 50%" means every row at or above the median score, so ties at the
# median are all included.
median_score = correlation.quantile(0.5)
top_50_percent = df.loc[correlation.ge(median_score)]

print(f"Mean LLM Correlation Score: {mean_score}")
print(f"Top 50% Communities based on LLM Correlation:\n{top_50_percent}")

Mean LLM Correlation Score: 0.5815789473684211
Top 50% Communities based on LLM Correlation:
   algorithm  community_id                               comm_name  \
0     leiden             0          Film, Series, Season, Episodes   
1     leiden             1                  Song, You, Love, Album   
2     leiden             2               World, War, South, United   
3     leiden             3              States, United, New, State   
8     leiden             8         Episodes, Computer, Video, Game   
9     leiden             9         Law, Theory, Social, Management   
10    leiden            10  Football, League, Basketball, Baseball   
11    leiden            11             Cup, Surname, United, World   
13    leiden            13          Office, Season, Paper, Michael   
14    leiden            14                 Brady, Days, Our, Lives   
16    leiden            16  Coronation, Street, Characters, Barlow   
17    leiden            17                   Gun, Laws, New, Carry 