In [30]:
# Cell 2: Imports and Configuration
import re
import pandas as pd
import networkx as nx
from collections import Counter, defaultdict
import community as community_louvain # python-louvain library
import matplotlib.pyplot as plt
import numpy as np

# --- Configuration for later analysis ---
NUMBER_OF_TOP_ITEMS_TO_SHOW = 10 # << MODIFIED
STOPWORDS = set([
    "the", "a", "an", "is", "are", "was", "were", "of", "and", "to", "in", "it", "that", "this",
    "for", "on", "with", "as", "by", "at", "from", "about", "into", "onto", "through", "over",
    "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just",
    "don", "should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn",
    "didn", "doesn", "hadn", "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn", "shan",
    "shouldn", "wasn", "weren", "won", "wouldn", "fig", "figure", "table", "abstract", "introduction",
    "results", "discussion", "conclusion", "references", "et", "al", "paper", "study", "method",
    "analysis", "based", "using", "system", "approach", "model", "data", "research",
    "urban", "computing", "city", "cities", "science", "review", "survey", "p", "pp", "vol",
    "issue", "journal", "conference", "workshop", "chapter", "university", "department",
    "institute", "ieee", "acm", "elsevier", "springer"
])

In [31]:
# Cell 3: Parser and Initial DataFrame Processing (No significant change to logic, minor parsing robustness for authors)
def parse_wos_file(file_content):
    """
    Parses the content of a Web of Science plain text file.
    Records are separated by 'ER' and start with 'PT J' (usually, after initial FN/VR).
    Handles multi-line fields.
    """
    records_str = file_content.split('\nER\n') # ER is the End Record tag

    parsed_records = []
    current_field_tag = None

    for rec_str in records_str:
        if not rec_str.strip():
            continue

        record = defaultdict(list)
        lines = rec_str.strip().split('\n')
        field_tag = None
        for line in lines:
            if not line.strip():
                continue

            if re.match(r"^[A-Z0-9]{2} ", line[:3]):
                field_tag = line[:2]
                field_value = line[3:].strip()
            elif field_tag and line.startswith('   '):
                field_value = line[3:].strip()
            else:
                if field_tag is None and ':' in line :
                    parts = line.split(':',1)
                    if len(parts[0]) <= 3 :
                         field_tag = parts[0].strip()
                         field_value = parts[1].strip()
                    else:
                        continue
                else :
                    continue

            # Fields that can have multiple values and are typically semi-colon separated if on one line in source
            # but WoS plain text often lists them on new lines with the same tag or continued lines.
            if field_tag in ['AU', 'AF', 'DE', 'ID']: # Added 'AF' (Author Full Names)
                record[field_tag].append(field_value)
            elif field_tag == 'CR': # Cited References are appended individually
                 record[field_tag].append(field_value)
            elif field_tag in ['C1', 'RP']: # Address fields
                if field_tag not in record or not record[field_tag]: # First line for this tag
                    record[field_tag] = [field_value]
                elif line.startswith('   ') and record[field_tag]: # Continuation of current C1/RP item
                    record[field_tag][-1] += " " + field_value # Append to current line of the address
                else: # New author/address block within C1 (often separated by semicolons in the file or new tag line)
                    record[field_tag].append(field_value) # Start a new address item
            elif field_tag == 'AB':
                if record[field_tag]:
                    record[field_tag] = record[field_tag] + " " + field_value
                else:
                    record[field_tag] = field_value
            elif field_tag: # For single-value fields (or fields treated as single if not specified above)
                if record[field_tag] and isinstance(record[field_tag], list): # If it was accidentally made a list
                     record[field_tag][0] = field_value # Overwrite if already set (e.g. from FN on first line)
                else:
                    record[field_tag] = field_value


        # Clean up: join list items for fields that are actually single
        # For C1 and RP, if they are lists of strings, join them carefully if they represent a single address block.
        # However, C1 can also list multiple authors' affiliations.
        # For now, keeping C1/RP as lists of strings, where each string is a part of an address or a full one.
        # The previous logic might have over-appended. Let's refine C1/RP handling during parsing.
        # If C1/RP are address blocks that can legitimately be multi-part, we should join them appropriately.
        # The defaultdict(list) and append for AU, AF, DE, ID, CR is correct.
        # For other fields that become lists but should be strings:
        for tag in list(record.keys()):
            if tag not in ['AU', 'AF', 'DE', 'ID', 'CR', 'C1', 'RP'] and isinstance(record[tag], list):
                if len(record[tag]) == 1:
                    record[tag] = record[tag][0]
                else: # If multiple entries for a supposedly single field, join them (heuristic)
                    record[tag] = "; ".join(record[tag])


        if 'PT' in record:
            parsed_records.append(dict(record))

    return parsed_records

# --- Load and parse the file ---
file_path = "savedrecs.txt" # Ensure this file is in the correct path
try:
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        file_content = f.read()

    wos_records = parse_wos_file(file_content)

    if not wos_records:
        raise ValueError("No records were parsed. Check file format or content.")

    df_records = pd.DataFrame(wos_records)

    if 'UT' not in df_records.columns or df_records['UT'].isnull().any():
        print("Warning: 'UT' field (Unique ID) is missing or incomplete. Generating sequential IDs.")
        df_records['UT_generated'] = range(len(df_records))
        id_col = 'UT_generated'
    else:
        df_records['UT'] = df_records['UT'].astype(str).str.replace('WOS:', '', case=False)
        id_col = 'UT'

    df_records.set_index(id_col, inplace=True)

    print(f"Successfully parsed {len(df_records)} records from {file_path}.")
    # print(f"DataFrame columns: {df_records.columns.tolist()}")
    # print("\nSample of parsed data (first record C1, RP, AU, AF):")
    # if not df_records.empty:
    #     sample_record = df_records.head(1)
    #     for col_to_check in ['C1', 'RP', 'AU', 'AF', 'CR', 'DE', 'ID']:
    #         if col_to_check in sample_record:
    #             print(f"{col_to_check}: {sample_record[col_to_check].values[0]}")
    #         else:
    #             print(f"{col_to_check}: Not present")


    # Ensure essential columns for network building exist
    list_like_cols = ['CR', 'AU', 'AF', 'DE', 'ID'] # AF added
    for col in list_like_cols:
        if col not in df_records.columns:
            df_records[col] = pd.Series([[] for _ in range(len(df_records))], index=df_records.index)
        else:
            # Ensure these are lists, even if NaN (for papers with no CRs, DEs, etc.)
             df_records[col] = df_records[col].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else ([x] if not isinstance(x,list) else x) ))


    if 'TC' not in df_records.columns:
        df_records['TC'] = 0
    else:
        df_records['TC'] = pd.to_numeric(df_records['TC'], errors='coerce').fillna(0).astype(int)

    if 'PY' not in df_records.columns:
        df_records['PY'] = 'Unknown'
    else:
        df_records['PY'] = df_records['PY'].fillna('Unknown').astype(str)

    if 'TI' not in df_records.columns:
        df_records['TI'] = 'No Title Captured'
    else:
        df_records['TI'] = df_records['TI'].fillna('No Title Captured')

    if 'SO' not in df_records.columns:
        df_records['SO'] = 'Unknown Source'
    else:
        df_records['SO'] = df_records['SO'].fillna('Unknown Source')

    if 'AB' not in df_records.columns:
        df_records['AB'] = ''
    else:
        df_records['AB'] = df_records['AB'].fillna('')

except FileNotFoundError:
    print(f"ERROR: File '{file_path}' not found.")
    df_records = pd.DataFrame()
except Exception as e:
    print(f"An error occurred during parsing or initial DataFrame processing: {e}")
    df_records = pd.DataFrame()

# Display sample of critical fields for verification after parsing
if not df_records.empty:
    print("\nSample of key fields from the first parsed record:")
    sample_record_df = df_records.head(1)
    for col_key in ['TI', 'AU', 'AF', 'PY', 'SO', 'AB', 'DE', 'ID', 'CR', 'TC']:
        if col_key in sample_record_df.columns:
            val = sample_record_df[col_key].iloc[0]
            if isinstance(val, list):
                print(f"  {col_key}: {val[:3]} {'...' if len(val) > 3 else ''} (List of {len(val)})")
            else:
                print(f"  {col_key}: {str(val)[:70]} {'...' if len(str(val)) > 70 else ''}")
        else:
            print(f" {col_key}: Not found")
else:
    print("DataFrame is empty, cannot show sample.")

Successfully parsed 915 records from savedrecs.txt.

Sample of key fields from the first parsed record:
  TI: Urban Computing: Concepts, Methodologies, and Applications 
  AU: ['Zheng, Y', 'Capra, L', 'Wolfson, O'] ... (List of 4)
  AF: ['Zheng, Yu', 'Capra, Licia', 'Wolfson, Ouri'] ... (List of 4)
  PY: 2014 
  SO: ACM TRANSACTIONS ON INTELLIGENT SYSTEMS AND TECHNOLOGY 
  AB: Urbanization's rapid progress has modernized many people's lives but a ...
  DE: ['Algorithms; Measurement; Experimentation; Urban computing; urban', 'informatics; big data; human mobility; city dynamics; urban sensing;', 'knowledge fusion; computing with heterogeneous data; trajectories']  (List of 3)
  ID: ['PATTERNS; INFORMATION; DISCOVERY; TIME']  (List of 1)
  CR: ['Aggarwal C. C., 2007, DATA STREAMS MODELS', 'Andrienko G., 2010, P EUR IEEE CGTC S VI', 'Andrienko N, 2003, J VISUAL LANG COMPUT, V14, P503, DOI 10.1016/S1045-926X(03)00046-6'] ... (List of 163)
  TC: 918 


In [32]:
# Cell 4: Bibliographic Coupling Network Construction
if not df_records.empty:
    print("Building Bibliographic Coupling Network...")
    G_coupling = nx.Graph()

    for paper_id, data in df_records.iterrows():
        # Determine the first author's name for the label (Full Name if available)
        af_authors = data.get('AF', [])
        au_authors = data.get('AU', [])
        first_author_display_name = "Unknown Author"

        if af_authors and isinstance(af_authors, list) and af_authors[0]:
            first_author_display_name = af_authors[0]
        elif au_authors and isinstance(au_authors, list) and au_authors[0]:
            # Fallback to the first part of AU (before comma or semicolon) if AF is not available
            first_author_display_name = re.split(r'[;,]', au_authors[0])[0].strip()

        node_label = f"{first_author_display_name}, {data.get('PY', 'N/A')}, {str(data.get('TI', 'No Title'))[:30]}..."

        G_coupling.add_node(
            paper_id,
            title=data.get('TI', 'N/A'),
            authors_short=au_authors, # AU field
            authors_full=af_authors,  # AF field << MODIFIED to store full names
            year=str(data.get('PY', 'N/A')),
            source=data.get('SO', 'N/A'),
            times_cited=int(data.get('TC', 0)),
            keywords_de=data.get('DE', []),
            keywords_id=data.get('ID', []),
            abstract=data.get('AB', ''),
            cited_references_count=len(data.get('CR', [])),
            label=node_label # Label now uses full first author name if available
        )

    paper_ids = list(df_records.index)
    cited_ref_map = {pid: set(df_records.loc[pid, 'CR']) for pid in paper_ids if 'CR' in df_records.columns and isinstance(df_records.loc[pid, 'CR'], list) and df_records.loc[pid, 'CR']}


    for i in range(len(paper_ids)):
        for j in range(i + 1, len(paper_ids)):
            u_id = paper_ids[i]
            v_id = paper_ids[j]

            refs_u = cited_ref_map.get(u_id, set())
            refs_v = cited_ref_map.get(v_id, set())

            if not refs_u or not refs_v:
                continue

            common_refs = refs_u.intersection(refs_v)
            coupling_strength = len(common_refs)

            if coupling_strength > 0:
                G_coupling.add_edge(u_id, v_id, weight=coupling_strength)
        if (i + 1) % 100 == 0:
            print(f"Processed bibliographic coupling for {i+1}/{len(paper_ids)} papers...")


    print(f"\nBibliographic Coupling Network constructed:")
    print(f"Number of nodes: {G_coupling.number_of_nodes()}")
    print(f"Number of edges: {G_coupling.number_of_edges()}")

    if G_coupling.number_of_edges() == 0 and G_coupling.number_of_nodes() > 1:
        print("\nWarning: No edges were created. Check 'CR' field parsing and content.")

else:
    print("DataFrame `df_records` is empty. Skipping network construction.")
    G_coupling = nx.Graph()

Building Bibliographic Coupling Network...
Processed bibliographic coupling for 100/915 papers...
Processed bibliographic coupling for 200/915 papers...
Processed bibliographic coupling for 300/915 papers...
Processed bibliographic coupling for 400/915 papers...
Processed bibliographic coupling for 500/915 papers...
Processed bibliographic coupling for 600/915 papers...
Processed bibliographic coupling for 700/915 papers...
Processed bibliographic coupling for 800/915 papers...
Processed bibliographic coupling for 900/915 papers...

Bibliographic Coupling Network constructed:
Number of nodes: 915
Number of edges: 32780


In [33]:
# Cell 5: Largest Connected Component (No change to logic)
if G_coupling.number_of_nodes() > 0 and G_coupling.number_of_edges() > 0:
    connected_components = list(nx.connected_components(G_coupling))
    if connected_components:
        largest_cc_nodes = max(connected_components, key=len)
        G_main_cc_new = G_coupling.subgraph(largest_cc_nodes).copy()
        print(f"\nAnalysis will proceed on the largest connected component (LCC):")
        print(f"LCC - Number of nodes: {G_main_cc_new.number_of_nodes()}")
        print(f"LCC - Number of edges: {G_main_cc_new.number_of_edges()}")
    else:
        print("Graph has no connected components. Using the original graph.")
        G_main_cc_new = G_coupling.copy()
else:
    print("Graph G_coupling is empty or has no edges. Skipping LCC step.")
    G_main_cc_new = G_coupling.copy()


Analysis will proceed on the largest connected component (LCC):
LCC - Number of nodes: 832
LCC - Number of edges: 32779


In [34]:
# Cell 6: Centrality Calculations (No change to logic, label used for print will have full name)
if G_main_cc_new.number_of_nodes() > 0:
    print("\nCalculating centrality measures for the LCC...")

    degree_centrality = nx.degree_centrality(G_main_cc_new)
    nx.set_node_attributes(G_main_cc_new, degree_centrality, 'degree_centrality')
    print("Degree centrality calculated.")

    num_nodes_for_k = G_main_cc_new.number_of_nodes()
    k_betweenness = min(max(100, num_nodes_for_k // 20), 1000) if num_nodes_for_k > 100 else num_nodes_for_k
    if num_nodes_for_k > 1 :
        print(f"Calculating approximate betweenness centrality with k={k_betweenness}...")
        betweenness_centrality = nx.betweenness_centrality(G_main_cc_new, k=k_betweenness, normalized=True, seed=42)
        nx.set_node_attributes(G_main_cc_new, betweenness_centrality, 'betweenness_centrality')
        print("Approximate betweenness centrality calculated.")
    else:
        print("Skipping betweenness centrality due to very few nodes.")

    try:
        print("Calculating eigenvector centrality...")
        eigenvector_centrality = nx.eigenvector_centrality_numpy(G_main_cc_new, weight='weight')
        nx.set_node_attributes(G_main_cc_new, eigenvector_centrality, 'eigenvector_centrality')
        print("Eigenvector centrality calculated.")
    except Exception as e:
        print(f"Could not compute eigenvector centrality with weights: {e}. Trying without weights.")
        try:
            eigenvector_centrality = nx.eigenvector_centrality_numpy(G_main_cc_new)
            nx.set_node_attributes(G_main_cc_new, eigenvector_centrality, 'eigenvector_centrality')
            print("Eigenvector centrality calculated (unweighted).")
        except Exception as e2:
            print(f"Eigenvector centrality still failed: {e2}")
            nx.set_node_attributes(G_main_cc_new, {n: 0.0 for n in G_main_cc_new.nodes()}, 'eigenvector_centrality')

    if degree_centrality:
        sorted_degree = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)
        # Display top N (NUMBER_OF_TOP_ITEMS_TO_SHOW) nodes
        print(f"\nTop {NUMBER_OF_TOP_ITEMS_TO_SHOW} nodes in LCC by Degree Centrality:")
        for i, (node_id, score) in enumerate(sorted_degree[:NUMBER_OF_TOP_ITEMS_TO_SHOW]): # << MODIFIED to use global N
            node_label = G_main_cc_new.nodes[node_id].get('label', node_id)
            print(f"  {i+1}. Node {node_label}: {score:.4f}")
else:
    print("LCC is empty. Skipping centrality calculations.")


Calculating centrality measures for the LCC...
Degree centrality calculated.
Calculating approximate betweenness centrality with k=100...
Approximate betweenness centrality calculated.
Calculating eigenvector centrality...
Eigenvector centrality calculated.

Top 10 nodes in LCC by Degree Centrality:
  1. Node Amin, Modhurima Dey, 2021, Predicting access to healthful...: 0.3213
  2. Node Martin, Katie S., 2014, Case Study of Hartford-Connect...: 0.2972
  3. Node Wolf-Powers, Laura, 2017, Food Deserts and Real-Estate-L...: 0.2960
  4. Node Kelli, Heval M., 2019, With Cardiovascular Disease...: 0.2912
  5. Node Mishra, Sabyasachee, 2023, shopping travel patterns...: 0.2876
  6. Node Bastian, Elizabeth, 2016, Metropolitan Detroit...: 0.2864
  7. Node Zhang, Mingyang, 2022, Urban Anomaly Analytics: Descr...: 0.2780
  8. Node Colon-Ramos, Uriyoan, 2018, Children? A Photovoice Narrati...: 0.2720
  9. Node Jin, He, 2021, swamps...: 0.2720
  10. Node Cooksey-Stowers, Kristen, 2017, States...: 

In [35]:
# Cell 7: Louvain Community Detection (No change to logic)
if G_main_cc_new.number_of_nodes() > 0 and G_main_cc_new.number_of_edges() > 0:
    print("\nPerforming community detection using Louvain algorithm on LCC...")
    use_weights_for_louvain = False
    if G_main_cc_new.edges():
        sample_edge_data = next(iter(G_main_cc_new.edges(data=True)))[2]
        if 'weight' in sample_edge_data and isinstance(sample_edge_data['weight'], (int,float)):
            use_weights_for_louvain = True
            print("Using 'weight' attribute for Louvain community detection.")
        else:
            print("No suitable 'weight' attribute found on edges. Running Louvain unweighted.")

    if use_weights_for_louvain:
        partition_new = community_louvain.best_partition(G_main_cc_new, weight='weight', random_state=42)
    else:
        partition_new = community_louvain.best_partition(G_main_cc_new, random_state=42)

    nx.set_node_attributes(G_main_cc_new, partition_new, 'community_id')
    num_communities_found = len(set(partition_new.values()))
    print(f"Number of communities found (Louvain): {num_communities_found}")

    community_counts = Counter(partition_new.values())
    # Display sizes of the N largest communities (NUMBER_OF_TOP_ITEMS_TO_SHOW, e.g. 10)
    print(f"\nTop {NUMBER_OF_TOP_ITEMS_TO_SHOW} largest communities (Louvain):") # << MODIFIED to use global N for printout
    for i, (comm_id, count) in enumerate(community_counts.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW)):
        print(f"  Community {comm_id}: {count} nodes")
else:
    print("LCC is empty or has no edges. Skipping community detection.")
    partition_new = {}
    num_communities_found = 0


Performing community detection using Louvain algorithm on LCC...
Using 'weight' attribute for Louvain community detection.
Number of communities found (Louvain): 5

Top 10 largest communities (Louvain):
  Community 2: 338 nodes
  Community 0: 329 nodes
  Community 4: 137 nodes
  Community 3: 20 nodes
  Community 1: 8 nodes


In [36]:
# Cell 8: Initial Community Analysis Printout (Modified to use full author names and NUMBER_OF_TOP_ITEMS_TO_SHOW)
# This cell provides a basic printout. The more detailed one is in Cell 10.

def get_node_data_for_initial_analysis(graph, node_id): # Renamed to avoid conflict
    if node_id not in graph:
        return {"id": node_id, "error": "Node not found in graph"}
    node_attrs = graph.nodes[node_id]

    # Prioritize full author names, fallback to short names
    authors_list = node_attrs.get('authors_full', [])
    if not authors_list or not any(authors_list): # If AF is empty or list of empty/None
        authors_list = node_attrs.get('authors_short', [])

    title_text = node_attrs.get('title', '')
    abstract_text = node_attrs.get('abstract', '')
    keywords_de_list = node_attrs.get('keywords_de', [])
    keywords_id_list = node_attrs.get('keywords_id', [])
    keywords_de_str = " ".join(keywords_de_list if isinstance(keywords_de_list, list) else [str(keywords_de_list)])
    keywords_id_str = " ".join(keywords_id_list if isinstance(keywords_id_list, list) else [str(keywords_id_list)])
    text_for_keywords = f"{title_text} {abstract_text} {keywords_de_str} {keywords_id_str}"

    return {
        "id": node_id,
        "label": node_attrs.get('label', str(node_id)), # Label already updated for full first author
        "text_for_keywords": text_for_keywords,
        "year": node_attrs.get('year', 'N/A'),
        "times_cited": int(node_attrs.get('times_cited', 0)),
        "authors": authors_list, # << MODIFIED to use full names primarily
        "source": node_attrs.get('source', 'N/A'),
        "degree_centrality": float(node_attrs.get('degree_centrality', 0.0)),
        "betweenness_centrality": float(node_attrs.get('betweenness_centrality', 0.0)),
        "eigenvector_centrality": float(node_attrs.get('eigenvector_centrality', 0.0)),
        "community_id": node_attrs.get('community_id', -1)
    }

if 'partition_new' not in locals() or not isinstance(partition_new, dict) or not partition_new:
    print("ERROR: 'partition_new' dictionary not found or empty.")
elif 'G_main_cc_new' not in locals() or not hasattr(G_main_cc_new, 'nodes') or G_main_cc_new.number_of_nodes() == 0:
    print("ERROR: 'G_main_cc_new' graph not found or empty.")
else:
    community_sizes = Counter(partition_new.values())
    # Analyze top N (NUMBER_OF_TOP_ITEMS_TO_SHOW) largest communities
    top_communities_to_analyze = community_sizes.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW) # << MODIFIED

    if not top_communities_to_analyze:
        print("No communities found to analyze.")
    else:
        top_community_ids_to_analyze = [item[0] for item in top_communities_to_analyze]
        print(f"Brief Analysis of the Top {len(top_community_ids_to_analyze)} largest communities: {top_community_ids_to_analyze}\n")

        for comm_id in top_community_ids_to_analyze:
            community_size = community_sizes[comm_id]
            print(f"\n\n{'='*20} Brief Community Analysis: ID {comm_id} (Size: {community_size} papers) {'='*20}")
            community_node_ids = [node_id for node_id, c_id in partition_new.items() if c_id == comm_id]
            community_papers_details = []
            for node_id in community_node_ids:
                paper_data = get_node_data_for_initial_analysis(G_main_cc_new, node_id)
                if "error" not in paper_data:
                     community_papers_details.append(paper_data)

            if not community_papers_details:
                print("  No valid paper data collected for this community.")
                continue

            sorted_by_times_cited = sorted(community_papers_details, key=lambda x: x['times_cited'], reverse=True)
            print(f"\n  --- Top {NUMBER_OF_TOP_ITEMS_TO_SHOW} Papers by Times Cited ---")
            for i, paper in enumerate(sorted_by_times_cited[:NUMBER_OF_TOP_ITEMS_TO_SHOW]):
                print(f"    {i+1}. TC: {paper['times_cited']} | Year: {paper['year']} | {paper['label']}")

            # ... (other brief centrality rankings if desired, using NUMBER_OF_TOP_ITEMS_TO_SHOW)

            all_text_content_in_community = [paper['text_for_keywords'] for paper in community_papers_details if paper['text_for_keywords']]
            community_words = []
            for text_content in all_text_content_in_community:
                words = re.findall(r'\b[a-zA-Z]{3,}\b', text_content.lower())
                community_words.extend([word for word in words if word not in STOPWORDS and len(word) > 2])
            keyword_counts = Counter(community_words)
            print(f"\n  --- Top {NUMBER_OF_TOP_ITEMS_TO_SHOW * 2} Keywords ---") # Shows more keywords
            for keyword, count in keyword_counts.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW * 2): print(f"    '{keyword}': {count}")

            all_authors_in_community = []
            for paper in community_papers_details:
                if paper['authors']: all_authors_in_community.extend(paper['authors'])
            author_counts = Counter(all_authors_in_community)
            print(f"\n  --- Top {NUMBER_OF_TOP_ITEMS_TO_SHOW} Authors ---") # Uses full names now
            for author, count in author_counts.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW): print(f"    {author}: {count} appearance(s)")
        print(f"\n\n{'='*20} End of Brief Top {len(top_community_ids_to_analyze)} Community Analysis {'='*20}")

Brief Analysis of the Top 5 largest communities: [2, 0, 4, 3, 1]




  --- Top 10 Papers by Times Cited ---
    1. TC: 1108 | Year: 2010 | Walker, Renee E., 2010, food deserts literature...
    2. TC: 388 | Year: 2017 | Cooksey-Stowers, Kristen, 2017, States...
    3. TC: 356 | Year: 2014 | Cummins, Steven, 2014, Did Not Alter Dietary Habits O...
    4. TC: 305 | Year: 2004 | Song, Y, 2004, Measuring urban form - Is Port...
    5. TC: 288 | Year: 2003 | Wrigley, N, 2003, deserts' study...
    6. TC: 224 | Year: 2015 | Dubowitz, Tamara, 2015, Desert, But Not Because Of Sup...
    7. TC: 209 | Year: 2014 | Farber, Steven, 2014, Temporal variability in transi...
    8. TC: 189 | Year: 2008 | Hawkes, Corinna, 2008, Dietary Implications of Superm...
    9. TC: 185 | Year: 2011 | Gordon, Cynthia, 2011, Measuring food deserts in New ...
    10. TC: 184 | Year: 2020 | Turner, Christopher, 2020, Systematic Scoping Review...

  --- Top 20 Keywords ---
    'food': 3260
    'access': 819
    'dese

In [37]:
# Cell 9: Detailed Community Summaries (print output version - before HTML table)
# This cell is an adaptation of the previous cell 9 logic if you want a text-based detailed dump.
# The main HTML output is generated by the next cell (formerly cell 10).
# For brevity, I will keep this similar to the original cell 9's structure,
# but note that the real "enhanced" output is in the next cell.

# Helper function (can be merged with the one in the next cell if identical)
def get_node_data_for_detailed_print_analysis(graph, node_id):
    if node_id not in graph:
        return {"id": node_id, "error": "Node not found in graph"}
    node_attrs = graph.nodes[node_id]
    authors_list = node_attrs.get('authors_full', [])
    if not authors_list or not any(authors_list): authors_list = node_attrs.get('authors_short', [])

    title_text = node_attrs.get('title', '')
    abstract_text = node_attrs.get('abstract', '') # Ensure abstract is fetched
    keywords_de_list = node_attrs.get('keywords_de', [])
    keywords_id_list = node_attrs.get('keywords_id', [])
    keywords_de_str = " ".join(keywords_de_list if isinstance(keywords_de_list, list) else [str(keywords_de_list)])
    keywords_id_str = " ".join(keywords_id_list if isinstance(keywords_id_list, list) else [str(keywords_id_list)])
    text_for_keywords = f"{title_text} {abstract_text} {keywords_de_str} {keywords_id_str}"

    year_str_val = node_attrs.get('year', '0')
    try: year_int_val = int(float(year_str_val))
    except ValueError: year_int_val = 0

    return {
        "id": node_id,
        "label": node_attrs.get('label', str(node_id)),
        "title": title_text,
        "abstract": abstract_text, # Include abstract
        "text_for_keywords": text_for_keywords,
        "year_str": year_str_val, # Keep original year string
        "year_int": year_int_val, # Integer year
        "times_cited": int(node_attrs.get('times_cited', 0)),
        "authors": authors_list, # Full names
        "source": node_attrs.get('source', 'N/A'),
        "degree_centrality": float(node_attrs.get('degree_centrality', 0.0)),
        "betweenness_centrality": float(node_attrs.get('betweenness_centrality', 0.0)),
        "eigenvector_centrality": float(node_attrs.get('eigenvector_centrality', 0.0)),
        "community_id": node_attrs.get('community_id', -1)
    }


if 'partition_new' not in locals() or not isinstance(partition_new, dict) or not partition_new:
    print("ERROR: 'partition_new' dictionary not found or empty for detailed print analysis.")
elif 'G_main_cc_new' not in locals() or not hasattr(G_main_cc_new, 'nodes') or G_main_cc_new.number_of_nodes() == 0:
    print("ERROR: 'G_main_cc_new' graph not found or empty for detailed print analysis.")
else:
    community_sizes = Counter(partition_new.values())
    # Analyze top N (NUMBER_OF_TOP_ITEMS_TO_SHOW, now 10) largest communities
    top_communities_for_print = community_sizes.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW)

    if not top_communities_for_print:
        print("No communities found for detailed print analysis.")
    else:
        top_ids_for_print = [item[0] for item in top_communities_for_print]
        print(f"Generating Detailed Print Summaries for Top {len(top_ids_for_print)} Communities: {top_ids_for_print}\n")

        all_communities_print_summary_data = []

        for comm_id in top_ids_for_print:
            community_size = community_sizes[comm_id]
            community_data_print = { # For structured data, similar to HTML table version
                "id": comm_id, "size": community_size, "papers": [],
                "top_keywords": [], "suggested_theme_title": "N/A",
                "pillar_papers": [], "top_by_times_cited": [], "top_by_betweenness": [],
                "top_by_degree": [], "top_authors": [], "top_sources": [], "temporal_overview": {}
            }
            print(f"\n\n{'='*30} Detailed Print Summary: Community ID {comm_id} (Size: {community_size} papers) {'='*30}")

            community_node_ids = [node_id for node_id, c_id in partition_new.items() if c_id == comm_id]
            community_papers_details_print = []
            paper_years_in_community_print = []

            for node_id in community_node_ids:
                paper_data = get_node_data_for_detailed_print_analysis(G_main_cc_new, node_id)
                if "error" not in paper_data:
                     community_papers_details_print.append(paper_data)
                     if paper_data['year_int'] > 0:
                         paper_years_in_community_print.append(paper_data['year_int'])

            if not community_papers_details_print:
                print("  No valid paper data collected for this community.")
                all_communities_print_summary_data.append(community_data_print)
                continue

            community_data_print["papers"] = community_papers_details_print

            # --- Temporal Overview ---
            if paper_years_in_community_print:
                to = {
                    "min_year": int(np.min(paper_years_in_community_print)) if paper_years_in_community_print else 'N/A',
                    "max_year": int(np.max(paper_years_in_community_print)) if paper_years_in_community_print else 'N/A',
                    "median_year": int(np.median(paper_years_in_community_print)) if paper_years_in_community_print else 'N/A',
                    "num_papers_with_year": len(paper_years_in_community_print)
                }
                community_data_print["temporal_overview"] = to
                print("\n   **Temporal Overview:**")
                print(f"     - Years Covered: {to.get('min_year', 'N/A')} - {to.get('max_year', 'N/A')}")
                print(f"     - Median Year: {to.get('median_year', 'N/A')}")

            # --- Top Papers by Times Cited (with Abstracts) ---
            community_data_print["top_by_times_cited"] = sorted(community_papers_details_print, key=lambda x: x['times_cited'], reverse=True)[:NUMBER_OF_TOP_ITEMS_TO_SHOW]
            print(f"\n   **Top {NUMBER_OF_TOP_ITEMS_TO_SHOW} Papers by Times Cited (with Abstracts):**")
            if community_data_print["top_by_times_cited"]:
                for i, paper in enumerate(community_data_print["top_by_times_cited"]):
                    print(f"    {i+1}. **Paper:** {paper['label']}")
                    authors_display = ", ".join(paper.get('authors', ['N/A'])[:3])
                    if len(paper.get('authors', [])) > 3: authors_display += " et al."
                    print(f"       **Authors:** {authors_display}")
                    print(f"       **Year:** {paper['year_str']}, **Times Cited:** {paper['times_cited']}")
                    print(f"       **Source:** {paper['source']}")
                    abstract_content = paper.get('abstract', 'No abstract available.')
                    if not abstract_content or not abstract_content.strip(): abstract_content = "No abstract available."
                    print(f"       **Abstract:** {abstract_content[:500]}...\n")
            else:
                print("     - N/A")

            # --- Keyword Extraction ---
            all_text_content = [p['text_for_keywords'] for p in community_papers_details_print if p['text_for_keywords']]
            comm_words = []
            for text in all_text_content:
                words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
                comm_words.extend([w for w in words if w not in STOPWORDS and len(w) > 2])
            keyword_counts_print = Counter(comm_words)
            community_data_print["top_keywords"] = keyword_counts_print.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW * 2)
            if community_data_print["top_keywords"]:
                num_kw_title = min(3, len(community_data_print["top_keywords"]))
                community_data_print["suggested_theme_title"] = ", ".join([kw[0].capitalize() for kw in community_data_print["top_keywords"][:num_kw_title]])
            print(f"\n   **Suggested Theme:** {community_data_print['suggested_theme_title']}")
            print(f"\n   **Top {NUMBER_OF_TOP_ITEMS_TO_SHOW * 2} Keywords:**")
            for keyword, count in community_data_print["top_keywords"]: print(f"     - '{keyword}': {count}")

            # --- Top Authors ---
            all_auth = []
            for p in community_papers_details_print:
                if p.get('authors'): all_auth.extend(p['authors'])
            author_counts_print = Counter(all_auth)
            community_data_print["top_authors"] = author_counts_print.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW)
            print(f"\n   **Top {NUMBER_OF_TOP_ITEMS_TO_SHOW} Authors:**") # Now shows 10
            for author, count in community_data_print["top_authors"]: print(f"     - {author}: {count} paper(s)/appearance(s)")

            # Add other rankings (betweenness, degree, sources) if desired for print summary
            # community_data_print["top_by_betweenness"] = sorted(community_papers_details_print, key=lambda x: x['betweenness_centrality'], reverse=True)[:NUMBER_OF_TOP_ITEMS_TO_SHOW]
            # print(f"\n   **Top {NUMBER_OF_TOP_ITEMS_TO_SHOW} Papers by Betweenness Centrality:**")
            # for i, paper in enumerate(community_data_print["top_by_betweenness"]): print(f"    {i+1}. Betw: {paper['betweenness_centrality']:.4f} | {paper['label']}")

            all_communities_print_summary_data.append(community_data_print)
        print(f"\n\n{'='*30} End of Detailed Print Summaries {'='*30}")

Generating Detailed Print Summaries for Top 5 Communities: [2, 0, 4, 3, 1]




   **Temporal Overview:**
     - Years Covered: 2002 - 2025
     - Median Year: 2019

   **Top 10 Papers by Times Cited (with Abstracts):**
    1. **Paper:** Walker, Renee E., 2010, food deserts literature...
       **Authors:** Walker, Renee E., Keane, Christopher R., Burke, Jessica G.
       **Year:** 2010, **Times Cited:** 1108
       **Source:** HEALTH & PLACE
       **Abstract:** Increasingly, studies are focusing on the role the local food environment plays in residents' ability to purchase affordable, healthy and nutritious foods. In a food desert, an area devoid of a supermarket, access to healthy food is limited. We conducted a systematic review of studies that focused on food access and food desert research in the United States. The 31 studies identified utilized 9 measures to assess food access. Results from these studies can be summarized primarily into four major...

    2. **Paper:** Cooksey-St

In [38]:
# Cell 10: Deeper Analysis & HTML Summary Table (Main Output Cell, adapted from original cell 10)
import collections # Already imported but good for clarity
import re
import numpy as np
import pandas as pd
from IPython.display import display, HTML

# Helper function (ensure it's consistent or use the one from Cell 9 if identical)
def get_node_data_for_html_summary(graph, node_id_from_graph):
    if node_id_from_graph not in graph:
        # print(f"Error: Node {node_id_from_graph} not found in graph for HTML summary.")
        return {"id": node_id_from_graph, "error": "Node not found in graph"}
    node_attrs = graph.nodes[node_id_from_graph]

    # Prioritize full author names from 'authors_full', fallback to 'authors_short'
    authors_list_final = node_attrs.get('authors_full', [])
    if not authors_list_final or not any(authors_list_final): # Check if empty or list of Nones/empty strings
        authors_list_final = node_attrs.get('authors_short', [])

    title_text = node_attrs.get('title', 'N/A')
    abstract_text = node_attrs.get('abstract', '')
    keywords_de_list = node_attrs.get('keywords_de', [])
    keywords_id_list = node_attrs.get('keywords_id', [])
    keywords_de_str = " ".join(keywords_de_list if isinstance(keywords_de_list, list) else [str(keywords_de_list)])
    keywords_id_str = " ".join(keywords_id_list if isinstance(keywords_id_list, list) else [str(keywords_id_list)])
    text_for_keywords = f"{title_text} {abstract_text} {keywords_de_str} {keywords_id_str}"

    year_str_val = node_attrs.get('year', '0')
    try: year_int_val = int(float(year_str_val))
    except ValueError: year_int_val = 0

    return {
        "id": node_id_from_graph,
        "label": node_attrs.get('label', str(node_id_from_graph)), # Label already updated
        "title": title_text,
        "abstract": abstract_text,
        "text_for_keywords": text_for_keywords.strip(),
        "year_str": year_str_val,
        "year_int": year_int_val,
        "times_cited": int(node_attrs.get('times_cited', 0)),
        "authors": authors_list_final, # << ENSURED FULL NAMES
        "source": node_attrs.get('source', 'N/A'),
        "degree_centrality": float(node_attrs.get('degree_centrality', 0.0)),
        "betweenness_centrality": float(node_attrs.get('betweenness_centrality', 0.0)),
        "eigenvector_centrality": float(node_attrs.get('eigenvector_centrality', 0.0)),
        "community_id": node_attrs.get('community_id', -1)
    }

def generate_ngrams_for_summary(text, n, stopwords_list): # Renamed to avoid conflict
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    filtered_words = [word for word in words if word not in stopwords_list]
    if not filtered_words or len(filtered_words) < n: return []
    ngrams = zip(*[filtered_words[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

# --- Main Analysis Loop for Top N Communities ---
if 'partition_new' not in locals() or not isinstance(partition_new, dict) or not partition_new:
    print("ERROR: 'partition_new' not found for HTML summary. Run community detection.")
elif 'G_main_cc_new' not in locals() or not hasattr(G_main_cc_new, 'nodes') or G_main_cc_new.number_of_nodes() == 0:
    print("ERROR: 'G_main_cc_new' graph not found for HTML summary.")
else:
    community_sizes = Counter(partition_new.values())
    if 'NUMBER_OF_TOP_ITEMS_TO_SHOW' not in locals(): NUMBER_OF_TOP_ITEMS_TO_SHOW = 10 # Default
    if 'STOPWORDS' not in locals(): STOPWORDS = set(["the", "a", "is"]) # Minimal default

    # Analyze Top N (e.g., 10) communities
    top_N_communities_list = community_sizes.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW) # << MODIFIED to take N

    if not top_N_communities_list:
        print("No communities found to analyze for HTML summary.")
    else:
        top_N_community_ids = [item[0] for item in top_N_communities_list]
        # print(f"Performing HTML summary analysis on Top {len(top_N_community_ids)} communities: {top_N_community_ids}\n") # Verbose print

        all_communities_html_summary_data = []

        for comm_id in top_N_community_ids:
            community_size = community_sizes[comm_id]
            # Verbose print for each community being processed for the table
            # print(f"\n--- Processing Community ID: {comm_id} (Size: {community_size} papers) for HTML Table ---")

            current_community_papers_details_html = []
            paper_years_in_community_html = []
            community_node_ids_html = [node_id for node_id, c_id in partition_new.items() if c_id == comm_id]

            for node_id in community_node_ids_html:
                paper_data = get_node_data_for_html_summary(G_main_cc_new, node_id)
                if "error" not in paper_data:
                     current_community_papers_details_html.append(paper_data)
                     if paper_data['year_int'] > 0:
                         paper_years_in_community_html.append(paper_data['year_int'])

            community_summary_for_table = {
                "Community ID": comm_id, "Size (Papers)": community_size,
                "Suggested Theme": "N/A", "Top Keywords (Uni)": "N/A",
                "Top Bi-grams": "N/A", "Pillar Paper(s)": "N/A",
                "Top Author(s)": "N/A", "Median Year": "N/A"
            }

            if not current_community_papers_details_html:
                # print(f"  No valid paper data for Community {comm_id} for HTML table.")
                all_communities_html_summary_data.append(community_summary_for_table) # Add placeholder
                continue

            if paper_years_in_community_html:
                community_summary_for_table["Median Year"] = int(np.median(paper_years_in_community_html))

            full_text_for_comm_html = " ".join([p['text_for_keywords'] for p in current_community_papers_details_html if p['text_for_keywords']])
            if full_text_for_comm_html.strip():
                unigrams_html = generate_ngrams_for_summary(full_text_for_comm_html, 1, STOPWORDS)
                unigram_counts_html = Counter(unigrams_html)
                top_unigrams_html = unigram_counts_html.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW) # Use N for basis
                community_summary_for_table["Top Keywords (Uni)"] = "; ".join([f"{ug[0]} ({ug[1]})" for ug in top_unigrams_html[:5]]) # Show top 5 in table
                if top_unigrams_html: community_summary_for_table["Suggested Theme"] = ", ".join([kw[0].capitalize() for kw in top_unigrams_html[:3]])

                bigrams_html = generate_ngrams_for_summary(full_text_for_comm_html, 2, STOPWORDS)
                bigram_counts_html = Counter(bigrams_html)
                top_bigrams_html = bigram_counts_html.most_common(NUMBER_OF_TOP_ITEMS_TO_SHOW)
                community_summary_for_table["Top Bi-grams"] = "; ".join([f"{bg[0]} ({bg[1]})" for bg in top_bigrams_html[:3]]) # Show top 3 in table

            all_authors_comm_html = []
            for p in current_community_papers_details_html:
                if p.get('authors'): all_authors_comm_html.extend(p['authors']) # authors are now full names
            author_counts_comm_html = Counter(all_authors_comm_html)
            # Show top 3 authors in the table summary, derived from top N
            top_authors_for_table = author_counts_comm_html.most_common(min(3, len(author_counts_comm_html)))
            community_summary_for_table["Top Author(s)"] = "; ".join([f"{auth[0]} ({auth[1]})" for auth in top_authors_for_table])


            # Pillar Paper Identification for table
            top_cited_html = sorted(current_community_papers_details_html, key=lambda x: x['times_cited'], reverse=True)
            top_betweenness_html = sorted(current_community_papers_details_html, key=lambda x: x['betweenness_centrality'], reverse=True)

            # Consider broader pool for pillar candidacy, e.g. top 2*N
            pool_size = NUMBER_OF_TOP_ITEMS_TO_SHOW * 2
            top_cited_ids_html = {p['id'] for p in top_cited_html[:pool_size]}
            top_betweenness_ids_html = {p['id'] for p in top_betweenness_html[:pool_size]}
            pillar_ids_html = top_cited_ids_html.intersection(top_betweenness_ids_html)

            pillar_papers_for_table_list = []
            if pillar_ids_html:
                pillar_papers_for_table_list = sorted(
                    [p for p in current_community_papers_details_html if p['id'] in pillar_ids_html],
                    key=lambda x: x['times_cited'], reverse=True
                )
            else: # Fallback to top cited if no intersection
                pillar_papers_for_table_list = top_cited_html[:2] # Show top 2 cited as fallback "pillar"

            # Show 1 or 2 pillar papers in the table using their labels (labels now have full first author name)
            community_summary_for_table["Pillar Paper(s)"] = "; ".join([p['label'][:70]+"..." for p in pillar_papers_for_table_list[:2]])


            all_communities_html_summary_data.append(community_summary_for_table)
            # print(f"--- Done processing Community ID: {comm_id} for HTML Table ---")


        # --- Create and Display Summary Table ---
        print(f"\n\n{'='*30} HTML Summary Table for Top {len(top_N_community_ids)} Communities {'='*30}")
        summary_df_html = pd.DataFrame(all_communities_html_summary_data)

        columns_for_html_table = [
            "Community ID", "Size (Papers)", "Suggested Theme",
            "Median Year", "Top Keywords (Uni)", "Top Bi-grams",
            "Pillar Paper(s)", "Top Author(s)"
        ]
        display_cols_html = [col for col in columns_for_html_table if col in summary_df_html.columns]

        if not summary_df_html.empty:
            pd.set_option('display.max_colwidth', 200)
            pd.set_option('display.width', 1000)
            display(HTML(summary_df_html[display_cols_html].to_html(index=False, escape=False))) # escape=False to render HTML if any
        else:
            print("No summary data to display in HTML table.")

        print(f"\n\n{'='*30} End of All Community HTML Summary Analysis {'='*30}")





Community ID,Size (Papers),Suggested Theme,Median Year,Top Keywords (Uni),Top Bi-grams,Pillar Paper(s),Top Author(s)
2,338,"Food, Access, Desert",2019,food (3260); access (819); desert (698); deserts (555); health (484),food desert (674); food deserts (512); food access (278),"Farber, Steven, 2014, Temporal variability in transi......","Dubowitz, Tamara (27); Beckman, Robin (15); Collins, Rebecca L. (13)"
0,329,"Smart, Time, Which",2020,smart (313); time (246); which (243); our (211); network (191),internet things (77); real world (61); human mobility (55),"Zheng, Yu, 2014, Urban Computing: Concepts, Met......; Castro, Pablo Samuel, 2013, From Taxi GPS Traces to Social......","Zheng, Yu (14); Li, Tianrui (6); Li, Yanhua (6)"
4,137,"Prediction, Temporal, Traffic",2023,prediction (332); temporal (322); traffic (319); learning (239); network (228),spatio temporal (128); spatial temporal (96); deep learning (78),"Ali, Ahmad, 2022, for citywide traffic flows pre......; Peng, Hao, 2020, flow forecasting......","Zheng, Yu (12); Zhang, Junbo (11); Li, Tianrui (8)"
3,20,"Land, Areas, Surface",2018,land (32); areas (31); surface (30); lst (27); heat (26),heat island (17); surface temperature (11); land cover (11),"Fan, Chao, 2017, Longitudinal Analysis of the O......; Chakraborty, Tc, 2021, islands: A global analysis......","Bocher, Erwan (1); Petit, Gwendall (1); Bernard, Jeremy (1)"
1,8,"Runoff, Rational, Proposed",2018,runoff (16); rational (15); proposed (13); hydrograph (13); time (12),rational hydrograph (13); real time (10); improved rational (9),"Zeng, Zhiqiang, 2021, provide decision support for r......; Hua, Lizhong, 2017, application associated with ra......","Bennis, S. (2); Crobeddu, E. (2); Li, Donglai (1)"




