## Co-citation Network Clustering and Analysis

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import community as community_louvain # python-louvain library
from collections import Counter
import re
from pathlib import Path
import numpy as np

# For text processing, if you want more advanced tokenization (optional)
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# Ensure you have the necessary NLTK data if you use it:
# nltk.download('punkt')
# nltk.download('stopwords')

### Configuration

In [None]:
# --- Configuration ---
# Path of the co-citation network to analyse (swap in your own GraphML file).
GRAPHML_FILE_PATH = "data/graphml/python_cocitation_network_filtered.graphml"

# How much of each ranking to report.
NUMBER_OF_TOP_CLUSTERS_TO_SHOW = 10
NUMBER_OF_TOP_PAPERS_PER_CLUSTER = 10
NUMBER_OF_TOP_KEYWORDS_PER_CLUSTER = 15

# Clusters with fewer nodes than this are skipped by the cluster analysis.
MIN_CLUSTER_SIZE_FOR_ANALYSIS = 5

# Stopwords (customize as needed). The words are kept in one whitespace-separated
# block so that adding or removing an entry is a one-word edit; duplicates are
# harmless because the result is a set.
STOPWORDS = set("""
    the a an is are was were of and to in it that this
    for on with as by at from about into onto through over
    under again further then once here there when where why how
    all any both each few more most other some such no nor
    not only own same so than too very s t can will just
    don should now d ll m o re ve y ain aren couldn
    didn doesn hadn hasn haven isn ma mightn mustn needn shan
    shouldn wasn weren won wouldn fig figure table abstract introduction
    results discussion conclusion references et al paper study method
    analysis based using system approach model data research
    urban computing city cities science review survey p pp vol v
    issue journal conference workshop chapter university department dept
    institute ieee acm elsevier springer wiley press proc int j ann
    soc sci technol res commun comput syst eur lect notes adv
    ser trans eng manag appl rev lett rep bull mem assoc symp
    inc ltd corp org co ed eds vol no pp chap art isbn
    doi http https www com org net pdf html gov edu
    dr prof phd ms mr mrs inc corp ltd
""".split())

# Common author initials/short names (can be expanded): every single letter a-z.
STOPWORDS |= set("abcdefghijklmnopqrstuvwxyz")

### 1. Load Co-citation Graph

In [None]:
# Resolve the configured path and load the network; G stays None when the file is missing.
graph_path = Path(GRAPHML_FILE_PATH)
G = None
if graph_path.exists():
    G = nx.read_graphml(graph_path)
    print(f"Graph loaded from {graph_path}")
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
else:
    print(f"ERROR: GraphML file not found at {graph_path}")
    print("Please ensure 'build_network.py' has been run and the file path is correct.")

# Normalise node attributes in place: 'freq' becomes an int (0 when missing or
# unparseable) and 'year' becomes a string ('Unknown' when missing). Later cells
# convert the year to int only where they need a number.
if G is not None:
    for _node_id, node_attrs in G.nodes(data=True):
        try:
            node_attrs['freq'] = int(node_attrs['freq'])
        except (KeyError, ValueError, TypeError):
            node_attrs['freq'] = 0

        node_attrs['year'] = str(node_attrs['year']) if 'year' in node_attrs else 'Unknown'

### 2. Graph Preprocessing (Largest Connected Component)

In [None]:
# Reduce the graph to its largest connected component (LCC); community
# detection algorithms often work best on connected graphs.
if G is None or G.number_of_nodes() <= 0:
    G_lcc = None
    print("Graph is not loaded or is empty. Skipping further analysis.")
else:
    # Louvain needs an undirected graph, so convert only when necessary.
    G_undirected = G
    if G.is_directed():
        G_undirected = G.to_undirected()
        print("Converted graph to undirected for community detection.")

    connected_components = list(nx.connected_components(G_undirected))
    if not connected_components:
        print("Graph has no connected components (it might be empty or all isolated nodes).")
        G_lcc = G_undirected  # Fallback to original graph if no components
    else:
        largest_component_nodes = max(connected_components, key=len)
        # .copy() detaches the LCC from G so later attribute writes stay local to it.
        G_lcc = G_undirected.subgraph(largest_component_nodes).copy()
        print(f"Largest Connected Component (LCC) selected for analysis:")
        print(f"  Nodes in LCC: {G_lcc.number_of_nodes()}")
        print(f"  Edges in LCC: {G_lcc.number_of_edges()}")

### 3. Community Detection (Louvain Algorithm)

In [None]:
# Partition the LCC with Louvain; downstream cells read `partition`,
# `num_communities` and (when communities exist) `modularity`.
if G_lcc is None or G_lcc.number_of_nodes() <= 0:
    partition = {}
    num_communities = 0
    print("Skipping community detection as graph is not suitable.")
else:
    print("\nPerforming community detection using Louvain algorithm...")
    # Edge weights are taken into account; the fixed random_state keeps runs repeatable.
    partition = community_louvain.best_partition(G_lcc, weight='weight', random_state=42)

    # Store each node's community ID on the graph itself.
    nx.set_node_attributes(G_lcc, partition, 'community')

    num_communities = len(set(partition.values()))
    print(f"Number of communities found: {num_communities}")

    if num_communities <= 0:
        print("No communities were found.")
    else:
        modularity = community_louvain.modularity(partition, G_lcc, weight='weight')
        print(f"Modularity of the partition: {modularity:.4f}")

### 4. Helper Functions for Cluster Analysis

In [None]:
def simple_tokenizer(text):
    """Split text into lower-case alphanumeric tokens, keeping intra-word hyphens.

    The text is lower-cased and scanned for runs of ASCII letters/digits that
    may be joined by single hyphens, so "Co-citation" yields "co-citation".
    Leading and trailing hyphens are never part of a token, and tokens shorter
    than two characters are discarded.

    Args:
        text: The string to tokenize. Any non-string value yields an empty list.

    Returns:
        A list of tokens in order of appearance.
    """
    if not isinstance(text, str):
        return []
    # One pattern does both jobs: it keeps hyphens that sit between alphanumeric
    # runs and ignores hyphens at a token's edge. The previous re.sub pass
    # removed every hyphen next to a letter, which fused hyphenated words
    # ("co-citation" -> "cocitation") contrary to the documented intent.
    tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text.lower())
    return [token for token in tokens if len(token) > 1]  # Min length 2

def extract_keywords_from_node_attributes(nodes_data_list, stopwords_set, top_n=10):
    """Return the most frequent keywords found in the 'source' attribute of the given nodes.

    Each node's 'source' text (journal/conference name) is tokenized with
    simple_tokenizer; stopwords and purely numeric tokens are dropped before
    counting. Author names are deliberately not used, as they tend to be less
    topical. Nodes whose 'source' is not a string are ignored.

    Args:
        nodes_data_list: Iterable of node-attribute mappings.
        stopwords_set: Tokens to exclude from the count.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of up to top_n keywords, most frequent first (empty when nothing qualifies).
    """
    keyword_tally = Counter()
    for attrs in nodes_data_list:
        source_text = attrs.get('source', '')
        if not isinstance(source_text, str):
            continue
        keyword_tally.update(
            token
            for token in simple_tokenizer(source_text)
            if token not in stopwords_set and not token.isdigit()
        )

    return [keyword for keyword, _count in keyword_tally.most_common(top_n)]

def get_cluster_details(graph, partition_map, min_size_for_analysis, num_top_papers, num_top_keywords, stopwords_set):
    """Summarise every sufficiently large community as one row of a DataFrame.

    Args:
        graph: Graph whose ``nodes`` view maps node IDs to attribute dicts
            ('label', 'freq', 'year', 'source' are read when present).
        partition_map: Mapping of node ID -> community ID.
        min_size_for_analysis: Communities with fewer nodes than this are skipped.
        num_top_papers: Number of highest-'freq' nodes to list per community.
        num_top_keywords: Number of keywords to list per community.
        stopwords_set: Tokens excluded from keyword extraction.

    Returns:
        A pandas DataFrame with one row per analysed community, ordered from
        largest to smallest, or an empty DataFrame when ``partition_map`` is
        empty or no community reaches the minimum size.
    """
    if not partition_map:
        return pd.DataFrame()

    # Group nodes by community ID. A plain dict with setdefault is used because
    # `defaultdict` is not among this notebook's imports, so the previous
    # `defaultdict(list)` raised NameError on the first real partition.
    communities = {}
    for node, comm_id in partition_map.items():
        communities.setdefault(comm_id, []).append(node)

    # Sort communities by size (number of nodes), largest first.
    sorted_communities = sorted(communities.items(), key=lambda item: len(item[1]), reverse=True)

    print(f"\nAnalyzing top communities (min size: {min_size_for_analysis})...")
    cluster_summaries = []
    for comm_id, nodes_in_comm in sorted_communities:
        if len(nodes_in_comm) < min_size_for_analysis:
            continue

        # Attribute dicts for the community's nodes; IDs absent from the graph are ignored.
        nodes_data_in_comm = [graph.nodes[node] for node in nodes_in_comm if node in graph.nodes]

        # Rank by 'freq' (overall citation count of the cited reference), descending.
        # The 'label' attribute is the "AUTHOR, YEAR, SOURCE" string.
        sorted_nodes_by_freq = sorted(nodes_data_in_comm, key=lambda x: x.get('freq', 0), reverse=True)
        top_papers_info = [
            f"{node_data.get('label', 'Unknown Label')} "
            f"(Freq: {node_data.get('freq', 0)}, Year: {node_data.get('year', 'Unknown')})"
            for node_data in sorted_nodes_by_freq[:num_top_papers]
        ]

        # Keywords are drawn from all nodes in the community, not just the top papers.
        keywords = extract_keywords_from_node_attributes(nodes_data_in_comm, stopwords_set, num_top_keywords)

        # Cited-reference years. str(...get()) tolerates nodes whose 'year' is
        # missing or not a string; only all-digit values count as a year.
        year_strings = (str(node_data.get('year', '')) for node_data in nodes_data_in_comm)
        years_in_comm = [int(year) for year in year_strings if year.isdigit()]
        if years_in_comm:
            avg_year_str = f"{float(np.mean(years_in_comm)):.1f}"
            year_range_str = f"{min(years_in_comm)}-{max(years_in_comm)}"
        else:
            avg_year_str = 'N/A'
            year_range_str = 'N/A'

        cluster_summaries.append({
            'Cluster ID': comm_id,
            'Size (Nodes)': len(nodes_in_comm),
            'Avg. Cited Year': avg_year_str,
            'Cited Year Range': year_range_str,
            f'Top {num_top_papers} Cited Refs': "\n".join(top_papers_info),
            f'Top {num_top_keywords} Keywords': ", ".join(keywords)
        })

    return pd.DataFrame(cluster_summaries)

### 5. Analyze and Display Clusters

In [None]:
# Build the per-cluster summary table and show its largest entries.
if G_lcc is None or num_communities <= 0:
    print("\nNo communities to analyze.")
else:
    cluster_summary_df = get_cluster_details(
        graph=G_lcc,
        partition_map=partition,
        min_size_for_analysis=MIN_CLUSTER_SIZE_FOR_ANALYSIS,
        num_top_papers=NUMBER_OF_TOP_PAPERS_PER_CLUSTER,
        num_top_keywords=NUMBER_OF_TOP_KEYWORDS_PER_CLUSTER,
        stopwords_set=STOPWORDS,
    )

    print(f"\n--- Summary of Top {min(NUMBER_OF_TOP_CLUSTERS_TO_SHOW, len(cluster_summary_df))} Largest Communities ---")
    if cluster_summary_df.empty:
        print("No clusters met the minimum size for analysis.")
    else:
        # Widen pandas' column limit so the long text fields are not truncated.
        pd.set_option('display.max_colwidth', 200)
        display(cluster_summary_df.head(NUMBER_OF_TOP_CLUSTERS_TO_SHOW))

### 6. Temporal Analysis of Nodes within Static Clusters

In [None]:
# Plot, for each of the largest clusters, a histogram of the publication years
# of its cited references (one subplot per cluster, shared year axis).
if G_lcc is not None and not cluster_summary_df.empty:
    print("\n--- Temporal Analysis of Cited References within Top Clusters ---")

    # Analyze a few top clusters for brevity. cluster_summary_df is non-empty
    # here, so this is always at least 1.
    num_clusters_for_temporal_plot = min(5, len(cluster_summary_df))

    # Validate the year data BEFORE creating the figure, so that a graph without
    # usable years does not leave an empty figure behind. str(...get()) tolerates
    # nodes whose 'year' is missing or not a string.
    all_node_years = [
        int(year_str)
        for year_str in (str(attrs.get('year', '')) for _node, attrs in G_lcc.nodes(data=True))
        if year_str.isdigit()
    ]
    if not all_node_years:
        print("No valid year data found in graph nodes for temporal analysis.")
    else:
        # One bin per calendar year across the whole graph so subplots are comparable.
        year_bins = np.arange(min(all_node_years), max(all_node_years) + 2)  # +2 for right edge of last bin

        # squeeze=False always returns a 2-D array of Axes, so a single subplot
        # needs no special case; take the only column.
        fig, axes_grid = plt.subplots(
            num_clusters_for_temporal_plot, 1,
            figsize=(12, 4 * num_clusters_for_temporal_plot),
            sharex=True, squeeze=False,
        )
        axes = axes_grid[:, 0]

        top_cluster_ids = cluster_summary_df['Cluster ID'].head(num_clusters_for_temporal_plot)
        for ax, cluster_id in zip(axes, top_cluster_ids):
            nodes_in_comm = [node for node, comm in partition.items() if comm == cluster_id]

            node_years_in_comm = [
                int(year_str)
                for year_str in (str(G_lcc.nodes[node_id].get('year', 'Unknown')) for node_id in nodes_in_comm)
                if year_str.isdigit()
            ]

            if node_years_in_comm:
                ax.hist(node_years_in_comm, bins=year_bins, edgecolor='black', alpha=0.7)
                ax.set_title(f"Cluster {cluster_id} (Size: {len(nodes_in_comm)}) - Cited Reference Year Distribution")
                ax.set_ylabel("Number of Cited Refs")
                avg_year_comm = np.mean(node_years_in_comm)
                ax.axvline(avg_year_comm, color='red', linestyle='dashed', linewidth=1, label=f'Avg Year: {avg_year_comm:.1f}')
                ax.legend()
            else:
                ax.set_title(f"Cluster {cluster_id} - No valid year data for nodes")

        axes[-1].set_xlabel("Publication Year of Cited Reference")
        plt.tight_layout()
        plt.show()

### 7. Visualization of Clustered Graph (LCC)

In [None]:
# Draw the LCC with nodes coloured by Louvain community and sized by citation frequency.
if G_lcc is not None and G_lcc.number_of_nodes() > 0 and num_communities > 0:
    print("\nVisualizing the largest connected component with communities...")
    plt.figure(figsize=(15, 15))

    # Use spring_layout or kamada_kawai_layout. Kamada-Kawai can be slow for large graphs.
    # For very large graphs, consider a simpler layout or visualizing subgraphs.
    if G_lcc.number_of_nodes() < 500:  # Kamada-Kawai for smaller graphs
        # NOTE(review): Kamada-Kawai appears to treat `weight` as an edge length
        # (shortest-path distance), which would place strongly co-cited pairs
        # farther apart — confirm against the NetworkX docs before relying on
        # this layout to show closeness.
        pos = nx.kamada_kawai_layout(G_lcc, weight='weight')
    else:  # Spring layout for larger graphs (can be adjusted)
        pos = nx.spring_layout(G_lcc, k=0.1, iterations=50, weight='weight', seed=42)

    # Node colors based on community
    community_ids = [partition[node] for node in G_lcc.nodes()]
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9; it accepts the same (name, lut) arguments.
    cmap = plt.get_cmap('viridis', max(community_ids) + 1)  # Or 'tab20', 'Set3'

    # Node sizes based on frequency ('freq' attribute)
    node_sizes = [G_lcc.nodes[node].get('freq', 1) * 20 + 10 for node in G_lcc.nodes()]  # Scale for visibility

    nx.draw_networkx_nodes(G_lcc, pos, node_color=community_ids, cmap=cmap, node_size=node_sizes, alpha=0.8)
    nx.draw_networkx_edges(G_lcc, pos, alpha=0.2, width=0.5)

    # Optional: Add labels to very important nodes (e.g., top N by frequency)
    # sorted_nodes_for_labels = sorted(G_lcc.nodes(data=True), key=lambda x: x[1].get('freq', 0), reverse=True)
    # labels_to_draw = {node: data.get('label', '')[:20]+"..." for node, data in sorted_nodes_for_labels[:10]} # Label top 10
    # nx.draw_networkx_labels(G_lcc, pos, labels=labels_to_draw, font_size=8)

    # `modularity` is set by the community-detection cell whenever num_communities > 0.
    plt.title(f"Co-citation Network (LCC) - Colored by Louvain Community (k={num_communities}, Modularity={modularity:.3f})")
    plt.axis('off')
    plt.show()
else:
    print("Skipping graph visualization as graph is not suitable or no communities found.")

### 8. Discussion on Time-Series Co-citation Analysis (Dynamic Communities)

Your coworker's notebook (`citation (1).ipynb`) includes a section for "Time-series Co-citation Network Analysis." This type of analysis typically involves creating graph snapshots for different time periods (e.g., yearly) and then detecting communities within each snapshot to see how topics evolve.

**Key Requirement for Dynamic Community Analysis:**
To perform this, the **edges** in your co-citation graph need a 'year' attribute, representing the publication year of the *citing paper* that created the co-citation link. Your current `build_network.py` script (as last revised) adds a 'year' attribute to *nodes* (the publication year of the cited reference itself), but not to edges.

**How to Adapt for Dynamic Analysis (if desired):**

1.  **Modify `build_network.py`:**
    * When parsing WoS files in `parse_wos_file`, ensure you capture the Publication Year (`PY`) of the *citing paper*.
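    * In `build_cocitation_network`, when you create co-citation edges (`cocitation_links[edge] += 1`), you would also need to store the `PY` of the *citing paper* that generated this specific co-citation instance. If an edge (a pair of co-cited references) is formed by multiple citing papers from different years, you might store a list of years or the earliest/latest year. Note that NetworkX's GraphML writer only accepts scalar attribute values, so a list of years must be serialized (e.g., as a comma-separated string such as `"2015,2017"`) and parsed again after loading; a single earliest/latest year can be stored directly.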
    * When saving to GraphML in `save_graph_to_graphml`, ensure this 'year' (or 'years') attribute is written for each edge.

2.  **Adapt Analysis Notebook:**
    * Once your GraphML file contains 'year' attributes on edges, you can create yearly subgraphs:
        ```python
        # Example:
        # G_with_edge_years = nx.read_graphml("path_to_your_new_graphml_with_edge_years.graphml")
        # unique_edge_years = sorted(list(set(d['year'] for u,v,d in G_with_edge_years.edges(data=True) if 'year' in d)))
        # subgraphs_by_year = {}
        # for year in unique_edge_years:
        #     edges_this_year = [(u,v) for u,v,d in G_with_edge_years.edges(data=True) if d.get('year') == year]
        #     subG = G_with_edge_years.edge_subgraph(edges_this_year).copy()
        #     # Remove isolated nodes from the subgraph for cleaner community detection
        #     subG.remove_nodes_from(list(nx.isolates(subG)))
        #     subgraphs_by_year[year] = subG
        ```
    * Then, you can run community detection on each `subG` in `subgraphs_by_year` and analyze the evolution of communities, similar to cells 16-19 in your coworker's notebook (e.g., tracking community size, top papers, Jaccard similarity between communities in consecutive years).

**Alternative Temporal Insight (with current graph):**
The "Temporal Analysis of Nodes within Static Clusters" section (Section 6) already provides some temporal insights by looking at the distribution of the publication years of the *cited references* within the clusters found in the overall static graph. This can indicate whether a cluster focuses on foundational (older) literature or more recent developments.