# Knowledge Graph - Relations Extraction 

In [None]:
import openai
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import re
import tiktoken

# === Global accumulator ===
all_articles_data = []

# === 0. Text Extraction from Different Formats ===
def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    elif ext == ".pdf":
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            raise ImportError("You need to install PyPDF2! Run: pip install PyPDF2")

        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text

    else:
        raise ValueError(f"Unsupported file extension: {ext}")

# === 1. Helper: Token Counter and Splitter ===
def num_tokens_from_text(text, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def split_text(text, max_tokens=3000, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        chunk_text = encoding.decode(chunk_tokens)
        chunks.append(chunk_text)

    return chunks

# === 2. Build Prompt for GPT ===
def build_prompt(text):
    return f"""
You are an expert in knowledge extraction.

Your task:
- Focus ONLY on text where the word "ecosystem" appears.
- From those parts, extract meaningful, well-formed entities (as nodes) and their relationships (as triples).

Guidelines:
- Include any terms that directly contain the word "ecosystem" (e.g., "Open Science Ecosystem", "Blockchain Ecosystem") as part of the **concept list**. 
- Only include entities that are concrete or conceptual — no vague or generic entries.
- Limit to a **maximum of 20 concepts** and **maximum of 20 relations**.
- It is perfectly fine if there are fewer than 20 concepts or relations, depending on the richness of the information.
- Do NOT invent or hallucinate content that is not explicitly or implicitly supported by the text.
- Prioritize clarity, importance, and relevance.
- Avoid very long phrases (keep concept labels under 6-7 words).
- For relations, avoid generic verbs like "is", "are", or conjugated forms like "includes". Use meaningful, specific verbs (e.g., "participate in", "enable", "comprise").

Output Format (strictly):
Return a valid JSON object with exactly these two keys:
- "concepts": a list of important concept strings
- "relations": a list of [subject, relation, object] triples

Example:
{{
  "concepts": ["Open Science Ecosystem", "Repositories", "Researchers", "Knowledge Sharing"],
  "relations": [
    ["Researchers", "participate in", "Open Science Ecosystem"],
    ["Repositories", "enable", "Knowledge Sharing"]
  ]
}}

Strict instructions:
- Only return the JSON object — no explanations, commentary, or extra text.
- Do not repeat terms unnecessarily.
- Ensure that the JSON is valid (no missing commas, brackets, or quotation marks).

TEXT:
---
{text}
---
"""

# === 3. Extract Knowledge Using GPT ===
def extract_knowledge(text, model="gpt-3.5-turbo"):
    prompt = build_prompt(text)
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
    content = response.choices[0].message.content.strip()

    try:
        data = json.loads(content)
        if "concepts" not in data or "relations" not in data:
            raise ValueError("Missing 'concepts' or 'relations' keys.")
        return data
    except Exception as e:
        print("❌ Error parsing GPT output:", e)
        print("Content was:\n", content)
        raise

# === 4. Save Outputs to JSON, Excel, and PNG ===
def save_outputs(data, base_path, article_no=None, chunk_idx=None):
    chunk_suffix = f"_chunk{chunk_idx}" if chunk_idx is not None else ""
    suffix = f"_article{article_no}{chunk_suffix}" if article_no is not None else chunk_suffix

    json_path = f"{base_path}{suffix}_graph.json"
    excel_path = f"{base_path}{suffix}_graph.xlsx"

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    df = pd.DataFrame(data["relations"], columns=["Subject", "Relation", "Object"])
    df["Article No"] = article_no
    df.to_excel(excel_path, index=False)

    print(f"✅ Saved JSON to {json_path}")
    print(f"✅ Saved Excel to {excel_path}")
    print(f"📌 Concepts: {len(data['concepts'])} | Relations: {len(data['relations'])}")

def save_graph_image(data, path, title="Knowledge Graph", seed=42):
    G = nx.DiGraph()
    skipped = 0

    for relation in data.get("relations", []):
        if isinstance(relation, list) and len(relation) == 3:
            s, r, o = relation
            G.add_edge(s, o, relation=r)
        else:
            skipped += 1
            print(f"⚠️ Skipping malformed relation (not a triple): {relation}")

    try:
        pos = nx.spring_layout(G, seed=seed, k=0.6)
        plt.figure(figsize=(18, 14))
        nx.draw_networkx_nodes(G, pos, node_color="lightblue", node_size=700, edgecolors='black')
        nx.draw_networkx_edges(G, pos, arrows=True, arrowstyle='-|>', width=2, arrowsize=20)
        nx.draw_networkx_labels(G, pos, font_size=10, font_weight="bold")

        edge_labels = {(u, v): d.get("relation", "") for u, v, d in G.edges(data=True)}
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, font_color="darkred")

        plt.title(title)
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(path)
        plt.close()
        print(f"🖼️ Saved graph visualization: {path}")
        if skipped > 0:
            print(f"⚠️ {skipped} malformed relations were skipped in graph rendering.")
    except Exception as e:
        print(f"❌ Error during graph rendering: {e}")

# === 5. Full Processing Pipeline for One Article ===
def process_article_from_file(file_path, save_base="output/article_knowledge"):
    match = re.search(r"(\d+)", os.path.splitext(os.path.basename(file_path))[0])
    article_no = match.group(1) if match else "unknown"

    output_dir = os.path.dirname(save_base)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created directory: {output_dir}")

    print(f"📄 Loading and processing file: {file_path}")
    text = extract_text_from_file(file_path)

    total_tokens = num_tokens_from_text(text)
    print(f"🧮 Text token count: {total_tokens}")

    if total_tokens > 14000:
        print("⚡ Text too large: Splitting into chunks...")
        chunks = split_text(text, max_tokens=12000)
    else:
        chunks = [text]

    for idx, chunk in enumerate(chunks):
        print(f"🔎 Processing chunk {idx+1}/{len(chunks)}...")
        try:
            data = extract_knowledge(chunk)
            data["article_no"] = article_no
            data["chunk_idx"] = idx if len(chunks) > 1 else 0

            save_outputs(data, save_base, article_no=article_no, chunk_idx=idx if len(chunks) > 1 else None)

            img_path = f"{save_base}_article{article_no}_chunk{idx}_graph.png"
            save_graph_image(data, img_path, title=f"Knowledge Graph: Article {article_no} Chunk {idx}")

            all_articles_data.append({
                "article_no": article_no,
                "chunk_idx": idx if len(chunks) > 1 else 0,
                "concepts": data["concepts"],
                "relations": data["relations"],
                "num_concepts": len(data["concepts"]),
                "num_relations": len(data["relations"])
            })
        except Exception as e:
            print(f"❌ Error processing chunk {idx} of article {article_no}: {e}")

# === 6. Batch Processing ===
def process_multiple_articles(folder_path, max_files=10, save_base="output/article_knowledge", final_master_path="output/master_knowledge.json"):
    files = sorted([
        f for f in os.listdir(folder_path)
        if f.lower().endswith((".pdf", ".txt"))
    ])[:max_files]

    print(f"🔁 Found {len(files)} files. Processing first {max_files}...\n")

    for file in files:
        file_path = os.path.join(folder_path, file)
        process_article_from_file(file_path, save_base=save_base)

    print(f"\n📦 Saving master file with {len(all_articles_data)} entries to {final_master_path}...")
    with open(final_master_path, "w", encoding="utf-8") as f:
        json.dump(all_articles_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Master JSON saved!")

In [None]:
process_multiple_articles(
    folder_path="/Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/Corpus/main/text_try", 
    max_files=211, 
    save_base="output5/04.29_KG_",
    final_master_path="output5/04.29_master_knowledge.json"
)

🔁 Found 204 files. Processing first 211...

📄 Loading and processing file: /Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/Corpus/main/text_try/111.txt
🧮 Text token count: 28877
⚡ Text too large: Splitting into chunks...
🔎 Processing chunk 1/3...
✅ Saved JSON to output5/04.29_KG__article111_chunk0_graph.json
✅ Saved Excel to output5/04.29_KG__article111_chunk0_graph.xlsx
📌 Concepts: 11 | Relations: 29
🖼️ Saved graph visualization: output5/04.29_KG__article111_chunk0_graph.png
🔎 Processing chunk 2/3...
✅ Saved JSON to output5/04.29_KG__article111_chunk1_graph.json
✅ Saved Excel to output5/04.29_KG__article111_chunk1_graph.xlsx
📌 Concepts: 18 | Relations: 19
🖼️ Saved graph visualization: output5/04.29_KG__article111_chunk1_graph.png
🔎 Processing chunk 3/3...
✅ Saved JSON to output5/04.29_KG__article111_chunk2_graph.json
✅ Saved Excel to output5/04.29_KG__article111_chunk2_graph.xlsx
📌 Concepts: 20 | Relations: 13
🖼️ Saved graph visualization: output5/04.29_KG__article111_chunk2_grap

## Analysis on files 

In [None]:
import tiktoken
import os
import pandas as pd

def count_tokens_in_folder(folder_path, output_csv_path="output/corpus_token_counts.xlsx", model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)

    records = []

    txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]
    for file in txt_files:
        full_path = os.path.join(folder_path, file)
        with open(full_path, "r", encoding="utf-8") as f:
            text = f.read()
            tokens = len(encoding.encode(text))
            records.append({
                "File": file,
                "Tokens": tokens
            })

    df = pd.DataFrame(records)
    df.sort_values(by="Tokens", ascending=False, inplace=True)

    # Ensure output folder exists
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

    df.to_excel(output_csv_path, index=False)
    print(f"✅ Token count saved to {output_csv_path}")

    # Quick statistics
    print("\n📊 Token Statistics:")
    print(df.describe())

# Example Usage
count_tokens_in_folder("/Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/Corpus/main/text", output_csv_path="output_analysis/token_counts.xlsx")

✅ Token count saved to output_analysis/token_counts.xlsx

📊 Token Statistics:
              Tokens
count     211.000000
mean    23786.194313
std     42158.720071
min       490.000000
25%      5925.500000
50%     13376.000000
75%     20169.500000
max    307247.000000


In [None]:
import pandas as pd
import os

def combine_excel_files(folder_path, output_excel_path="output/combined_articles.xlsx"):
    excel_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

    all_dfs = []
    for file in excel_files:
        full_path = os.path.join(folder_path, file)
        print(f"🔎 Reading {file}...")
        try:
            df = pd.read_excel(full_path)
            article_no = os.path.splitext(file)[0].split("_article")[-1].split("_")[0]  # Extract article number
            df["Article File"] = file
            df["Article No"] = article_no
            all_dfs.append(df)
        except Exception as e:
            print(f"⚠️ Skipping {file}: {e}")

    combined_df = pd.concat(all_dfs, ignore_index=True)

    print(f"\n📊 Combined {len(all_dfs)} files into a single dataframe with {len(combined_df)} rows.")

    # Ensure output folder exists
    os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)

    combined_df.to_excel(output_excel_path, index=False)
    print(f"✅ Combined Excel saved to {output_excel_path}")

    # Basic Initial Analysis
    print("\n🔎 Initial Analysis:")
    print(f"Total Relations: {len(combined_df)}")
    print(f"Unique Subjects: {combined_df['Subject'].nunique()}")
    print(f"Unique Objects: {combined_df['Object'].nunique()}")
    print(f"Top 10 Relations:\n{combined_df['Relation'].value_counts().head(10)}")

# Example Usage
combine_excel_files("output5", output_excel_path="output_analysis/combined_articles.xlsx")

🔎 Reading 04.29_KG__article2_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article125_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article294_graph.xlsx...
🔎 Reading 04.29_KG__article10_chunk1_graph.xlsx...
🔎 Reading 04.29_KG__article28_chunk2_graph.xlsx...
🔎 Reading 04.29_KG__article4_graph.xlsx...
🔎 Reading 04.29_KG__article402_chunk1_graph.xlsx...
🔎 Reading 04.29_KG__article370_chunk2_graph.xlsx...
🔎 Reading 04.29_KG__article278_chunk4_graph.xlsx...
🔎 Reading 04.29_KG__article320_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article209_graph.xlsx...
🔎 Reading 04.29_KG__article78_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article40_chunk3_graph.xlsx...
🔎 Reading 04.29_KG__article302_chunk4_graph.xlsx...
🔎 Reading 04.29_KG__article38_chunk1_graph.xlsx...
🔎 Reading 04.29_KG__article76_chunk1_graph.xlsx...
🔎 Reading 04.29_KG__article438_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article332_chunk1_graph.xlsx...
🔎 Reading 04.29_KG__article111_chunk0_graph.xlsx...
🔎 Reading 04.29_KG__article21_chunk18_graph

In [None]:
import pandas as pd
import os

def analyze_combined_excel(combined_excel_path):
    # Load your combined Excel file
    df = pd.read_excel(combined_excel_path)
    print(f"✅ Loaded {len(df)} relations from {combined_excel_path}")

    # 1. Total Relations per Article
    relations_per_article = df.groupby("Article No").size().reset_index(name="Total_Relations")
    print("\n🔗 Total relations per article:")
    print(relations_per_article.head())

    # 2. Since we don't have chunk info, assume 1 chunk per file (optional)
    relations_per_article["Estimated_Num_Chunks"] = 1

    # 3. Save the summary
    output_folder = os.path.dirname(combined_excel_path)
    os.makedirs(output_folder, exist_ok=True)

    relations_per_article.to_excel(os.path.join(output_folder, "analysis_summary_per_article.xlsx"), index=False)
    print(f"\n✅ Summary analysis saved to {os.path.join(output_folder, 'analysis_summary_per_article.xlsx')}")

    return relations_per_article

# Example usage:
summary_per_article = analyze_combined_excel("output_analysis/combined_articles.xlsx")

✅ Loaded 7198 relations from output_analysis/combined_articles.xlsx

🔗 Total relations per article:
   Article No  Total_Relations
0           1               19
1           2               21
2           4               20
3           6               16
4           8               36

✅ Summary analysis saved to output_analysis/analysis_summary_per_article.xlsx


In [None]:
import pandas as pd
import numpy as np
import os

def analyze_combined_excel_detailed(combined_excel_path):
    # Load your combined Excel file
    df = pd.read_excel(combined_excel_path)
    print(f"✅ Loaded {len(df)} relations from {combined_excel_path}")

    # Group by Article No
    relations_per_article = df.groupby("Article No").size().reset_index(name="Total_Relations")
    print("\n🔗 Total relations per article (first 5):")
    print(relations_per_article.head())

    # Basic Statistics
    total_articles = len(relations_per_article)
    avg_relations = relations_per_article["Total_Relations"].mean()
    median_relations = relations_per_article["Total_Relations"].median()
    min_relations = relations_per_article["Total_Relations"].min()
    max_relations = relations_per_article["Total_Relations"].max()
    std_relations = relations_per_article["Total_Relations"].std()

    print(f"\n📈 Basic Statistics:")
    print(f"- Total Articles: {total_articles}")
    print(f"- Mean Total Relations: {avg_relations:.2f}")
    print(f"- Median Total Relations: {median_relations}")
    print(f"- Min Relations: {min_relations}")
    print(f"- Max Relations: {max_relations}")
    print(f"- Standard Deviation: {std_relations:.2f}")

    # Outlier Detection (using IQR method)
    Q1 = relations_per_article["Total_Relations"].quantile(0.25)
    Q3 = relations_per_article["Total_Relations"].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"\n🛡️ Outlier Thresholds:")
    print(f"- Lower Bound: {lower_bound}")
    print(f"- Upper Bound: {upper_bound}")

    # Identify outliers
    outliers = relations_per_article[
        (relations_per_article["Total_Relations"] < lower_bound) | 
        (relations_per_article["Total_Relations"] > upper_bound)
    ]
    non_outliers = relations_per_article[
        (relations_per_article["Total_Relations"] >= lower_bound) & 
        (relations_per_article["Total_Relations"] <= upper_bound)
    ]

    print(f"\n🚨 Found {len(outliers)} outlier articles.")
    
    # Average without outliers
    avg_without_outliers = non_outliers["Total_Relations"].mean()
    median_without_outliers = non_outliers["Total_Relations"].median()

    print(f"\n📉 Statistics Excluding Outliers:")
    print(f"- Mean (no outliers): {avg_without_outliers:.2f}")
    print(f"- Median (no outliers): {median_without_outliers}")

    # Save detailed summary
    output_folder = os.path.dirname(combined_excel_path)
    os.makedirs(output_folder, exist_ok=True)

    relations_per_article.to_excel(os.path.join(output_folder, "analysis_summary_full.xlsx"), index=False)
    outliers.to_excel(os.path.join(output_folder, "analysis_outliers.xlsx"), index=False)
    non_outliers.to_excel(os.path.join(output_folder, "analysis_no_outliers.xlsx"), index=False)
    
    print(f"\n✅ Analysis files saved to {output_folder}!")

    return {
        "total_articles": total_articles,
        "avg_relations": avg_relations,
        "median_relations": median_relations,
        "std_relations": std_relations,
        "avg_without_outliers": avg_without_outliers,
        "median_without_outliers": median_without_outliers,
        "num_outliers": len(outliers)
    }

# Example usage
summary_stats = analyze_combined_excel_detailed("output_analysis/combined_articles.xlsx")

✅ Loaded 7198 relations from output_analysis/combined_articles.xlsx

🔗 Total relations per article (first 5):
   Article No  Total_Relations
0           1               19
1           2               21
2           4               20
3           6               16
4           8               36

📈 Basic Statistics:
- Total Articles: 211
- Mean Total Relations: 34.11
- Median Total Relations: 21.0
- Min Relations: 9
- Max Relations: 302
- Standard Deviation: 36.04

🛡️ Outlier Thresholds:
- Lower Bound: -9.0
- Upper Bound: 63.0

🚨 Found 18 outlier articles.

📉 Statistics Excluding Outliers:
- Mean (no outliers): 25.70
- Median (no outliers): 20.0

✅ Analysis files saved to output_analysis!


In [None]:
import pandas as pd
import os
import re

def clean_and_group_relations(excel_path):
    # Load your combined relations
    df = pd.read_excel(excel_path)
    print(f"✅ Loaded {len(df)} rows from {excel_path}")

    # 1. Clean Relation column
    def clean_relation(rel):
        if isinstance(rel, str):
            rel = rel.strip()
            if rel.endswith("s") and len(rel) > 3:  # avoid words like "is" becoming "i"
                return rel[:-1]
            else:
                return rel
        return rel

    df["Relation_Clean"] = df["Relation"].apply(clean_relation)
    print(f"🧹 Cleaned relation verbs (singularized).")

    # 2. Group by Subject, Relation_Clean, Object
    grouped = df.groupby(["Subject", "Relation_Clean", "Object", "Article No"]).size().reset_index(name="Count")
    print(f"📦 Grouped into {len(grouped)} unique (Subject, Relation, Object, Article) combinations.")

    # Save grouped version
    output_folder = os.path.dirname(excel_path)
    os.makedirs(output_folder, exist_ok=True)

    grouped_output_path = os.path.join(output_folder, "cleaned_grouped_relations.xlsx")
    grouped.to_excel(grouped_output_path, index=False)
    print(f"✅ Saved grouped clean relations to {grouped_output_path}")

    # 3. Combine Subject and Object into one Entity column
    df_combined = pd.DataFrame()
    df_combined["Entity"] = df["Subject"].astype(str) + " -> " + df["Object"].astype(str)
    df_combined["Relation_Clean"] = df["Relation_Clean"]
    df_combined["Article No"] = df["Article No"]

    combined_output_path = os.path.join(output_folder, "combined_subject_object.xlsx")
    df_combined.to_excel(combined_output_path, index=False)
    print(f"✅ Saved combined (Subject-Object) to {combined_output_path}")

    return grouped, df_combined

# Example Usage
grouped_data, combined_entities = clean_and_group_relations("output_analysis/combined_articles.xlsx")

✅ Loaded 7198 rows from output_analysis/combined_articles.xlsx
🧹 Cleaned relation verbs (singularized).
📦 Grouped into 7135 unique (Subject, Relation, Object, Article) combinations.
✅ Saved grouped clean relations to output_analysis/cleaned_grouped_relations.xlsx
✅ Saved combined (Subject-Object) to output_analysis/combined_subject_object.xlsx


# Level 2 Knowledge Graph HTML

In [None]:
import pandas as pd
from pyvis.network import Network
import collections
import networkx as nx
from networkx.algorithms import community

# --- 1. LOAD AND PREPARE THE DATA ---
print("Loading and preparing data...")
try:
    df = pd.read_csv('relations_final_entities.csv')
except FileNotFoundError:
    print("Error: 'relations_final_entities.csv' not found.")
    print("Please make sure the CSV file is in the same directory as this script.")
    exit()

# Clean up whitespace from all label columns
for col in ['subject_Label1', 'object_Label1', 'subject_Label2', 'object_Label2', 'relation']:
    if col in df.columns:
        df[col] = df[col].str.strip()

# Apply the filter based on Label1
filtered_df = df[
    (df['subject_Label1'] != 'Unlabeled') | (df['object_Label1'] != 'Unlabeled')
].copy()

# Drop any rows where the Label2 columns (our nodes) are missing
filtered_df.dropna(subset=['subject_Label2', 'object_Label2'], inplace=True)
print(f"Filtered down to {len(filtered_df)} valid relations for the full corpus.")


# --- 2. PROCESS NODES AND EDGES FOR THE ENTIRE CORPUS ---
node_to_group = {}
in_degree = collections.defaultdict(int)
edge_counts = collections.defaultdict(int)
edge_relations = collections.defaultdict(list)

for _, row in filtered_df.iterrows():
    source_node = row['subject_Label2']
    target_node = row['object_Label2']
    relation = row['relation']
    
    if source_node not in node_to_group:
        node_to_group[source_node] = row['subject_Label1']
    if target_node not in node_to_group:
        node_to_group[target_node] = row['object_Label1']
    
    in_degree[target_node] += 1
    
    edge = (source_node, target_node)
    edge_counts[edge] += 1
    edge_relations[edge].append(relation)


# --- 3. ADVANCED NETWORK ANALYSIS WITH NETWORKX ---
# Re-adding analysis to get community data for the dropdown
print("\n--- Starting Network Analysis for Community Detection ---")
G = nx.DiGraph()
for (source, target), count in edge_counts.items():
    G.add_edge(source, target, weight=count)

communities = community.louvain_communities(G.to_undirected(), weight='weight', seed=123)
node_to_community = {}
for i, comm in enumerate(communities):
    for node in comm:
        node_to_community[node] = i
print(f"Detected {len(communities)} communities/clusters in the network.")


# --- 4. CREATE CUSTOM COLOR MAP ---
defined_color_map = {
    'Economy': '#1f77b4', 'Ecosystem': '#ff7f0e', 'Events': '#2ca02c', 
    'Fields and Disciplines': '#d62728', 'Frameworks': '#9467bd', 'Institutional Action': '#8c564b',
    'Open Access': '#e377c2', 'Open Data': '#7f7f7f', 'Open Government': '#bcbd22',
    'Open Innovation': '#17becf', 'Open Science': '#aec7e8', 'Policies': '#ffbb78',
    'Research Outputs / Resources': '#98df8a', 'Research Processes / Practices': '#ff9896',
    'Research Values / Virtues': '#c5b0d5', 'Science and Society': '#c49c94',
    'Sociotechnical Devices': '#f7b6d2', 'Stakeholders/Actors': '#c7c7c7'
}
pastel_fallback = ['#A1C9F4', '#DEBB9B', '#B9F2F0', '#FFFEA3']
fallback_idx = 0
unique_groups = sorted(list(set(node_to_group.values())))
color_map = {}
for group in unique_groups:
    if group in defined_color_map:
        color_map[group] = defined_color_map[group]
    else:
        color_map[group] = pastel_fallback[fallback_idx % len(pastel_fallback)]
        fallback_idx += 1


# --- 5. CREATE THE PYVIS NETWORK ---
net = Network(height='100vh', width='100%', bgcolor='white', font_color='#222222', directed=True, notebook=True)
# Updated options for better performance and new styling
net.set_options("""
const options = {
  "configure": { "enabled": true, "filter": "physics" },
  "nodes": {
    "font": { "size": 22, "strokeWidth": 1, "strokeColor": "white" }
  },
  "edges": {
    "color": {
      "color": "#cccccc",
      "highlight": "#9370DB",
      "hover": "#9370DB",
      "inherit": false
    },
    "font": { "size": 12, "align": "top", "strokeWidth": 0 },
    "smooth": { "enabled": true, "type": "dynamic" }
  },
  "physics": {
    "enabled": true,
    "stabilization": {
      "enabled": true,
      "iterations": 1000,
      "updateInterval": 50,
      "onlyDynamicEdges": false,
      "fit": true
    },
    "barnesHut": {
      "gravitationalConstant": -40000,
      "centralGravity": 0.1,
      "springLength": 250
    },
    "minVelocity": 0.75
  },
  "interaction": { "hover": true, "navigationButtons": true }
}
""")


# --- 6. ADD NODES AND EDGES TO THE GRAPH ---
print("\nBuilding the interactive graph...")
# Add nodes with custom colors and community data
for node, group in node_to_group.items():
    node_size = 30 + (in_degree.get(node, 0) * 4)
    node_color = color_map.get(group, '#97c2fc')
    community_id = node_to_community.get(node, -1)
    
    tooltip = f"Category: {group}<br>Community ID: {community_id}"
    
    # Add community ID as a hidden attribute and explicitly set shape
    net.add_node(
        node, 
        label=node, 
        color=node_color, 
        value=node_size, 
        title=tooltip, 
        community=community_id,
        shape='box'  
    )

# Add aggregated edges with weights
for (source, target), count in edge_counts.items():
    unique_relations = sorted(list(set(edge_relations[(source, target)])))
    title_text = f"Frequency: {count}<br>Relations: {', '.join(unique_relations)}"
    net.add_edge(source=source, to=target, value=count, title=title_text)


# --- 7. GENERATE THE HTML FILE AND INJECT LEGEND AND DROPDOWN ---
output_filename = '/Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/P5_gephi/html_web/knowledge_graph.html'
try:
    net.save_graph(output_filename)
    
    # --- Create HTML for the legend ---
    legend_html = '<div id="legend"><h4>Categories</h4><ul>'
    for group in unique_groups:
        color = color_map.get(group)
        legend_html += f'<li><span style="background-color:{color};"></span>{group}</li>'
    legend_html += '</ul></div>'
    
    # --- Create HTML for the community dropdown ---
    dropdown_html = '<div id="community-filter"><span>Filter by Community: </span><select id="community-select">'
    dropdown_html += '<option value="all">Show All</option>'
    for i in range(len(communities)):
        dropdown_html += f'<option value="{i}">Community {i}</option>'
    dropdown_html += '</select></div>'

    # --- Create JavaScript for the filtering logic and to disable physics after load ---
    filter_script = """
    <script type="text/javascript">
      // Wait for the network to be fully initialized and stabilized
      network.on("stabilizationIterationsDone", function () {
        // Turn off physics to improve performance
        network.setOptions( { physics: false } );
        
        // Store original nodes and edges
        const allNodes = network.body.data.nodes.get();
        const allEdges = network.body.data.edges.get();
        const communitySelect = document.getElementById('community-select');

        communitySelect.addEventListener('change', (event) => {
          const selectedCommunity = event.target.value;
          
          let nodesToShow;
          if (selectedCommunity === 'all') {
            nodesToShow = allNodes;
          } else {
            nodesToShow = allNodes.filter(node => node.community == selectedCommunity);
          }
          
          // Create a new DataSet with the filtered nodes
          const filteredNodes = new vis.DataSet(nodesToShow);
          const nodeIdsToShow = new Set(filteredNodes.getIds());

          // Filter edges to only include those within the community
          const filteredEdges = new vis.DataSet(
            allEdges.filter(edge => nodeIdsToShow.has(edge.from) && nodeIdsToShow.has(edge.to))
          );
          
          // Update the network with the new data
          network.setData({nodes: filteredNodes, edges: filteredEdges});
        });
      });
    </script>
    """

    # --- Create CSS for all UI elements ---
    ui_css = """
    <style type='text/css'>
      #legend, #community-filter {
        position: absolute;
        background-color: rgba(255, 255, 255, 0.9);
        padding: 10px;
        border-radius: 6px;
        border: 1px solid #ccc;
        font-family: sans-serif;
        font-size: 14px;
        z-index: 1000;
      }
      #legend { bottom: 15px; left: 15px; max-height: 80vh; overflow-y: auto; }
      #community-filter { top: 15px; left: 15px; }
      #legend h4 { margin-top: 0; margin-bottom: 10px; }
      #legend ul { list-style: none; padding: 0; margin: 0; }
      #legend li { display: flex; align-items: center; margin-bottom: 5px; }
      #legend span {
        display: inline-block; width: 14px; height: 14px;
        margin-right: 8px; border-radius: 2px; border: 1px solid #555;
      }
    </style>
    """

    # --- Read the generated HTML, inject everything, and save ---
    with open(output_filename, 'r', encoding='utf-8') as f:
        html_content = f.read()
    
    html_content = html_content.replace('</head>', f'{ui_css}</head>')
    # Inject the dropdown and legend first, then the script
    html_content = html_content.replace('</body>', f'{legend_html}{dropdown_html}{filter_script}</body>')

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"\n✅ Success! Interactive graph with legend and filter saved as '{output_filename}'")
    print("Open this file in your web browser to view the focused knowledge graph.")

except Exception as e:
    print(f"\nAn error occurred while saving the file: {e}")


## Level 2 Knowledge Graph Analysis

In [1]:
import pandas as pd
import networkx as nx
from networkx.algorithms import community

# --- 1. LOAD AND PREPARE THE DATA ---
print("Loading and preparing data...")
try:
    df = pd.read_csv('relations_final_entities.csv')
except FileNotFoundError:
    print("Error: 'relations_final_entities.csv' not found.")
    print("Please make sure the CSV file is in the same directory as this script.")
    exit()

# Clean up whitespace from all label columns
for col in ['subject_Label1', 'object_Label1', 'subject_Label2', 'object_Label2', 'relation']:
    if col in df.columns:
        df[col] = df[col].str.strip()

# Apply the filter based on Label1
filtered_df = df[
    (df['subject_Label1'] != 'Unlabeled') | (df['object_Label1'] != 'Unlabeled')
].copy()

# Drop any rows where the Label2 columns (our nodes) are missing
filtered_df.dropna(subset=['subject_Label2', 'object_Label2'], inplace=True)
print(f"Filtered down to {len(filtered_df)} valid relations for the full corpus.")


# --- 2. PROCESS NODES AND EDGES FOR THE ENTIRE CORPUS ---
edge_counts = {}
for _, row in filtered_df.iterrows():
    source_node = row['subject_Label2']
    target_node = row['object_Label2']
    
    edge = (source_node, target_node)
    edge_counts[edge] = edge_counts.get(edge, 0) + 1


# --- 3. ADVANCED NETWORK ANALYSIS WITH NETWORKX ---
print("\n--- Starting Advanced Network Analysis ---")
# Create a NetworkX graph to perform calculations
G = nx.DiGraph()
for (source, target), count in edge_counts.items():
    G.add_edge(source, target, weight=count)

# --- A. Overall Network Statistics ---
density = nx.density(G)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_degree = sum(dict(G.degree()).values()) / num_nodes
print("\n--- Overall Network Statistics ---")
print(f"Total Nodes: {num_nodes}")
print(f"Total Edges: {num_edges}")
print(f"Network Density: {density:.4f}")
print(f"Average Degree: {avg_degree:.4f}")


# --- B. Global Centrality Measures ---
print("\n--- Global Centrality Analysis ---")
# Degree Centrality (In-degree and Out-degree)
in_degree_centrality = nx.in_degree_centrality(G)
out_degree_centrality = nx.out_degree_centrality(G)
betweenness = nx.betweenness_centrality(G, weight='weight', normalized=True)

print("\nTop 5 Nodes by In-Degree (Most Referenced):")
for node, score in sorted(in_degree_centrality.items(), key=lambda item: item[1], reverse=True)[:5]:
    print(f"  - {node}: {score:.4f}")

print("\nTop 5 Nodes by Out-Degree (Most Referential):")
for node, score in sorted(out_degree_centrality.items(), key=lambda item: item[1], reverse=True)[:5]:
    print(f"  - {node}: {score:.4f}")

print("\nTop 5 Most Influential Nodes (Global Betweenness Centrality):")
for node, score in sorted(betweenness.items(), key=lambda item: item[1], reverse=True)[:5]:
    print(f"  - {node}: {score:.4f}")

# --- C. Community Detection ---
print("\n--- Community Detection ---")
# Use the Louvain method on an undirected version of the graph to find communities
communities = community.louvain_communities(G.to_undirected(), weight='weight', seed=123)
# Create a mapping from node to its community ID
node_to_community = {}
for i, comm in enumerate(communities):
    for node in comm:
        node_to_community[node] = i
print(f"Detected {len(communities)} communities/clusters in the network.")

# --- D. RANK NODES WITHIN EACH COMMUNITY ---
print("\n--- Ranking Nodes Within Each Community (by Intra-Community Centrality) ---")
for i, comm_nodes in enumerate(sorted(communities, key=len, reverse=True)):
    # Create a subgraph containing only the nodes and edges of this community
    subgraph = G.subgraph(comm_nodes)
    
    # Calculate Betweenness Centrality *within* this specific subgraph
    # This measures a node's importance relative to its own community
    community_centrality = nx.betweenness_centrality(subgraph, weight='weight', normalized=True)
    
    # Sort the nodes in this community by their local centrality score
    sorted_community_nodes = sorted(community_centrality.items(), key=lambda item: item[1], reverse=True)
    
    print(f"\n- Community {i} ({len(comm_nodes)} members):")
    # Print the top 3 most central nodes for this community
    for node, score in sorted_community_nodes[:10]:
        print(f"  - {node} (Local Centrality: {score:.4f})")
    if len(sorted_community_nodes) > 3:
        print("  - ...")



Loading and preparing data...
Filtered down to 1067 valid relations for the full corpus.

--- Starting Advanced Network Analysis ---

--- Overall Network Statistics ---
Total Nodes: 146
Total Edges: 706
Network Density: 0.0333
Average Degree: 9.6712

--- Global Centrality Analysis ---

Top 5 Nodes by In-Degree (Most Referenced):
  - open access: 0.2621
  - sharing: 0.2069
  - open science: 0.1724
  - data: 0.1724
  - repository: 0.1448

Top 5 Nodes by Out-Degree (Most Referential):
  - open science: 0.4138
  - open access: 0.2000
  - researcher: 0.2000
  - repository: 0.1862
  - data: 0.1724

Top 5 Most Influential Nodes (Global Betweenness Centrality):
  - open science: 0.1429
  - open access: 0.1203
  - sharing: 0.1050
  - data: 0.0941
  - ecosystem: 0.0671

--- Community Detection ---
Detected 7 communities/clusters in the network.

--- Ranking Nodes Within Each Community (by Intra-Community Centrality) ---

- Community 0 (31 members):
  - research output (Local Centrality: 0.1857)


# Level 2 Knowledge Graph & Article Citations Multilayer Network

In [None]:
import pandas as pd
from pyvis.network import Network
import networkx as nx
import collections

# --- 1. DEFINE FILE PATHS ---
multilayer_path = '/Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/P5_gephi/multilayer/'
relations_file = '/Users/nil.ilba/Documents/2.Research/MA4_OpenEdition/P5_gephi/html_web/relations_final_entities.csv'
id_mapping_file = f'{multilayer_path}Alex_all_211.xlsx'
citation_nodes_file = f'{multilayer_path}gexf_nodes.csv'
citation_edges_file = f'{multilayer_path}gexf_edges.csv'


# --- 2. LOAD AND PREPARE ALL DATA SOURCES ---
print("--- Loading and Preparing Data ---")
try:
    df_relations = pd.read_csv(relations_file)
    df_id_map = pd.read_excel(id_mapping_file, sheet_name='Sheet1') 
    df_cite_nodes = pd.read_csv(citation_nodes_file)
    df_cite_edges = pd.read_csv(citation_edges_file)
    print("All data files loaded successfully.")
except Exception as e:
    print(f"Error loading files: {e}")
    exit()

# --- 3. IDENTIFY ALL UNIQUE ARTICLES AND THEMES ---
# Create the mapping from numeric ID to OpenAlex URL
try:
    id_map = pd.Series(df_id_map['id'].values, index=df_id_map['no']).to_dict()
    print(f"Created a map for {len(id_map)} article IDs.")
except KeyError:
    print("Error: Could not find 'id' or 'no' columns in the ID mapping file. Please check the column names.")
    exit()

# Helper function to parse article strings
def parse_article_string(article_str):
    if pd.isna(article_str): return []
    return [int(art_id.strip()) for art_id in str(article_str).split(',') if art_id.strip().isdigit()]

# Find all unique OpenAlex IDs mentioned in the entire relations file
articles_to_keep = set()
for _, row in df_relations.iterrows():
    subject_article_ids = parse_article_string(row.get('subject_articles'))
    for numeric_id in subject_article_ids:
        openalex_id = id_map.get(numeric_id)
        if openalex_id:
            articles_to_keep.add(openalex_id)
            
    object_article_ids = parse_article_string(row.get('object_articles'))
    for numeric_id in object_article_ids:
        openalex_id = id_map.get(numeric_id)
        if openalex_id:
            articles_to_keep.add(openalex_id)

print(f"Identified {len(articles_to_keep)} unique articles connected to any thematic concept.")

# --- 4. FILTER THE CITATION NETWORK ---
print("\n--- Filtering the Citation Network ---")
# Filter the nodes to keep only the relevant articles
filtered_cite_nodes = df_cite_nodes[df_cite_nodes['id'].isin(articles_to_keep)].copy()
print(f"Filtered down to {len(filtered_cite_nodes)} nodes in the citation network.")

# Filter the edges to keep only connections between the remaining nodes
filtered_cite_edges = df_cite_edges[
    df_cite_edges['source'].isin(filtered_cite_nodes['id']) &
    df_cite_edges['target'].isin(filtered_cite_nodes['id'])
].copy()
print(f"Filtered down to {len(filtered_cite_edges)} edges in the citation network.")

# --- 5. BUILD THE MULTILAYER NETWORK ---
print("\n--- Building the Multilayer Network Visualization ---")
net = Network(height='100vh', width='100%', bgcolor='white', font_color='#222222', directed=True, notebook=True)

# --- A. Add Filtered Citation Layer (Articles) ---
for _, row in filtered_cite_nodes.iterrows():
    # Set label to an empty space to hide it, but keep title for hover
    net.add_node(row['id'], label=' ', color='#CCCCCC', shape='dot', value=10, title=row['label'])
print(f"Added {len(filtered_cite_nodes)} article nodes to the graph.")

# --- B. Add Thematic Layer (Label 2 entities) ---
thematic_nodes = set(df_relations['subject_Label2'].dropna()) | set(df_relations['object_Label2'].dropna())
node_to_group = {row['subject_Label2']: row['subject_Label1'] for _, row in df_relations.iterrows() if pd.notna(row['subject_Label2'])}
node_to_group.update({row['object_Label2']: row['object_Label1'] for _, row in df_relations.iterrows() if pd.notna(row['object_Label2'])})

# Updated color map as per your request
defined_color_map = {
    'Economy': '#1f77b4', 'Ecosystem': '#ff7f0e', 'Events': '#2ca02c', 
    'Fields and Disciplines': '#d62728', 'Frameworks': '#9467bd', 'Institutional Action': '#8c564b',
    'Open Access': '#e377c2', 'Open Data': '#7f7f7f', 'Open Government': '#bcbd22',
    'Open Innovation': '#17becf', 'Open Science': '#aec7e8', 'Policies': '#ffbb78',
    'Research Outputs / Resources': '#98df8a', 'Research Processes / Practices': '#ff9896',
    'Research Values / Virtues': '#c5b0d5', 'Science and Society': '#c49c94',
    'Sociotechnical Devices': '#f7b6d2', 'Stakeholders/Actors': '#c7c7c7'
}

for node in thematic_nodes:
    group = node_to_group.get(node, 'Unknown')
    color = defined_color_map.get(group, '#E0E0E0')
    net.add_node(node, label=node, color=color, shape='box', value=20, title=f"Category: {group}")
print(f"Added {len(thematic_nodes)} thematic nodes to the graph.")

# --- C. Add Edges ---
# Add citation edges
for _, row in filtered_cite_edges.iterrows():
    net.add_edge(row['source'], row['target'], color='black', width=1.5, arrows='to')
print(f"Added {len(filtered_cite_edges)} citation edges.")

# Add connecting edges (Theme -> Article)
connecting_edge_count = 0
for _, row in df_relations.iterrows():
    subject_articles = parse_article_string(row.get('subject_articles'))
    for num_id in subject_articles:
        alex_id = id_map.get(num_id)
        if alex_id in articles_to_keep and pd.notna(row['subject_Label2']):
            net.add_edge(row['subject_Label2'], alex_id, color='#fad29b', width=1, dashes=True, arrows='to')
            connecting_edge_count += 1
            
    object_articles = parse_article_string(row.get('object_articles'))
    for num_id in object_articles:
        alex_id = id_map.get(num_id)
        if alex_id in articles_to_keep and pd.notna(row['object_Label2']):
            net.add_edge(row['object_Label2'], alex_id, color='#fad29b', width=1, dashes=True, arrows='to')
            connecting_edge_count += 1
print(f"Added {connecting_edge_count} connecting edges between themes and articles.")

# --- 6. SET OPTIONS AND GENERATE HTML ---
print("\n--- Generating Interactive HTML File ---")
net.set_options("""
const options = {
  "configure": { "enabled": true, "filter": "physics" },
  "nodes": { "font": { "size": 20 } },
  "edges": { "smooth": { "type": "continuous" } },
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -80,
      "centralGravity": 0.01,
      "springLength": 200
    },
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based"
  },
  "interaction": { "hover": true, "navigationButtons": true, "tooltipDelay": 200 }
}
""")

output_filename = f'{multilayer_path}multilayer_thematic_citation_network_v2.html'
try:
    net.show(output_filename)
    print(f"\n✅ Success! Multilayer network saved as '{output_filename}'")
except Exception as e:
    print(f"\nAn error occurred while saving the file: {e}")
