# SetUp

In [None]:
import os
import pandas as pd
import ast
from collections import Counter
import re
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from ast import literal_eval
from typing import List, Union, Optional, Dict, Tuple
import plotly.express as px
from sklearn.decomposition import PCA
from collections import Counter
import random
import warnings
import networkx as nx
from itertools import combinations

In [None]:
# Root folders used by the rest of the notebook: one for the CSV datasets and one
# for the analysis outputs (plots are saved under subject_main_dir by save_plot).
# NOTE(review): 'your-project-path' looks like a placeholder — replace it with the
# real project root before running.
dataset_dir = 'your-project-path/datasets'
subject_main_dir = 'your-project-path/analyze_dataset'
# exist_ok=True makes re-running this cell harmless when the folders already exist.
os.makedirs(dataset_dir, exist_ok=True)
os.makedirs(subject_main_dir, exist_ok=True)

In [None]:
# CSV file locations, all inside dataset_dir. Only unified_cleaned_df_path is read
# and written in the cells visible here; the others are presumably used elsewhere.
raw_pyalex_df_path = os.path.join(dataset_dir, 'pyalex_df.csv')
clean_pyalex_df_path = os.path.join(dataset_dir, 'cleaned_pyalex_df.csv')
unified_df_path=os.path.join(dataset_dir, 'unified_dataset.csv')
unified_cleaned_df_path=os.path.join(dataset_dir, 'cleaned_unified_dataset.csv')
filtered_titles_df_path=os.path.join(dataset_dir, 'filtered_titles_dataset.csv')
filtered_abstracts_df_path=os.path.join(dataset_dir, 'filtered_abstracts_dataset.csv')

In [None]:
# Load the working dataset into the module-level `df` and print its column summary.
# NOTE(review): the Pre-Process cell writes its result to this same path
# (unified_cleaned_df_path), so this read presumably relies on a file produced by
# an earlier run — confirm whether unified_df_path was intended for a first run.
df = pd.read_csv(unified_cleaned_df_path)
df.info()

# Utils

In [None]:
def detect_language(text):
    """
    Guess the language of a piece of text with langdetect.

    Parameters:
    text (str): The text to analyze

    Returns:
    str or None: The detected language code, or None when detection fails
    (langdetect error, ValueError or TypeError, e.g. for non-string input)
    """
    try:
        language_code = langdetect.detect(text)
    except (langdetect.lang_detect_exception.LangDetectException, ValueError, TypeError):
        # Detection is best-effort: signal failure with None instead of raising
        language_code = None
    return language_code


In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def save_plot(plt, filename, subject_main_dir, subject_type):
    """
    Write the current matplotlib figure to <subject_main_dir>/<subject_type>/<filename>
    and close it.

    Parameters:
    plt: matplotlib.pyplot instance
    filename (str): Name of the file to save
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Sub-directory name for this analysis (e.g., 'domains', 'fields', 'topics')
    """
    # The sub-directory is created on demand, including any missing parents
    output_dir = Path(subject_main_dir, subject_type)
    output_dir.mkdir(parents=True, exist_ok=True)

    plt.savefig(output_dir / filename, bbox_inches='tight', dpi=300)
    # Close the figure so it does not leak into the next plot
    plt.close()

# Pre-Process

In [None]:
def drop_empty_columns(df):
    """
    Remove every column whose values are all missing.

    Parameters:
    df (pandas.DataFrame): Input DataFrame (left untouched; a copy is processed)

    Returns:
    tuple: (processed DataFrame, list of dropped column names)
    """
    result = df.copy()

    # Share of missing values per column, expressed as a percentage
    pct_missing = result.isna().mean() * 100
    empty_columns = pct_missing[pct_missing == 100].index.tolist()

    if not empty_columns:
        print("No completely empty columns found.")
        return result, empty_columns

    print(f"Dropping {len(empty_columns)} completely empty columns: {', '.join(empty_columns)}")
    result = result.drop(columns=empty_columns)
    return result, empty_columns

In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import langdetect
from langdetect import DetectorFactory
import re

# Pin langdetect's seed so repeated runs give reproducible language detection results
DetectorFactory.seed = 0

def clean_author_name(author_name):
    """
    Strip parenthesised fragments from an author name and tidy the whitespace.

    Parameters:
    author_name (str): Author name that may contain text in parentheses

    Returns:
    str: Cleaned author name; non-string input is returned unchanged
    """
    if isinstance(author_name, str):
        # Replace each "(...)" group (plus surrounding whitespace) with one space,
        # then collapse whitespace runs and trim both ends
        without_parens = re.sub(r'\s*\([^)]*\)\s*', ' ', author_name)
        return re.sub(r'\s+', ' ', without_parens).strip()

    return author_name

def _parse_wrapped_literal(value, opener, closer):
    """
    Parse *value* with ast.literal_eval when it is a string wrapped in
    opener/closer (e.g. '[' and ']'); return it unchanged otherwise or when
    the text is not a valid Python literal.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if not (stripped.startswith(opener) and stripped.endswith(closer)):
        return value
    try:
        return ast.literal_eval(stripped)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Bracketed free text such as "[Untitled]" is not a literal: keep it as-is
        return value


def preprocess_dataframe(df):
    """
    Preprocess the DataFrame to convert string representations back to their original types

    List-like columns are detected by sampling the first 10 non-null values of
    each column; cells that look like lists but cannot be parsed are left as
    strings instead of aborting the whole conversion.

    Parameters:
    df (pandas.DataFrame): The input DataFrame (not modified; a copy is processed)

    Returns:
    pandas.DataFrame: Processed DataFrame with correct data types
    """
    df = df.copy()

    # Identify all columns that might contain lists
    potential_list_columns = []
    for col in df.columns:
        # Sample some non-null values to check if they look like lists
        sample = df[col].dropna().head(10)
        looks_like_list = any(
            isinstance(val, str) and val.strip().startswith('[') and val.strip().endswith(']')
            for val in sample
        )
        if looks_like_list:
            potential_list_columns.append(col)

    print(f"Detected potential list columns: {potential_list_columns}")

    # Convert string representations of lists back to actual lists for all detected columns
    for col in potential_list_columns:
        print(f"Converting column: {col}")
        df[col] = df[col].apply(lambda x: _parse_wrapped_literal(x, '[', ']'))

    # Convert concept_dict from string to dictionary (special handling)
    if 'concept_dict' in df.columns:
        print("Converting concept_dict column")
        df['concept_dict'] = df['concept_dict'].apply(
            lambda x: _parse_wrapped_literal(x, '{', '}')
        )

    # Clean author names by removing text in parentheses
    if 'Authors' in df.columns:
        print("Cleaning author names")
        # Handle both list and plain-string cells; anything else passes through
        df['Authors'] = df['Authors'].apply(
            lambda x: [clean_author_name(author) for author in x] if isinstance(x, list)
                     else clean_author_name(x) if isinstance(x, str)
                     else x
        )

    # Ensure numeric columns are correct type (unparsable values become NaN)
    numeric_columns = ['Publication Year', 'Citation Count']
    for col in numeric_columns:
        if col in df.columns:
            print(f"Converting numeric column: {col}")
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df
def clean_titles(df, remove_missing=True, english_only=True):
    """
    Clean the dataset by removing rows with missing titles
    and optionally filtering to keep only English titles

    Rows whose language cannot be detected are kept. Language detection is done
    on a separate Series, so no temporary column is added to the data and an
    existing 'detected_language' column is left intact.

    Parameters:
    df (pandas.DataFrame): The input DataFrame (not modified)
    remove_missing (bool): Whether to remove rows with missing (NaN or '') titles
    english_only (bool): Whether to keep only English titles

    Returns:
    tuple: (cleaned DataFrame, stats dictionary with cleaning information)
    """
    # Work on a copy to avoid modifying the original
    df_clean = df.copy()

    stats = {
        'original_row_count': len(df),
        'missing_titles': 0,
        'non_english_titles': 0,
        'remaining_row_count': 0
    }

    # Without a Title column both filters are skipped
    has_title = 'Title' in df.columns

    # Remove rows with missing titles
    if remove_missing and has_title:
        missing_mask = df_clean['Title'].isna() | (df_clean['Title'] == '')
        stats['missing_titles'] = int(missing_mask.sum())
        # Explicit copy so later operations never act on a view of a slice
        df_clean = df_clean[~missing_mask].copy()

    # Filter to keep only English titles
    if english_only and has_title:
        print("Detecting languages for titles... (this might take a while)")
        detected = df_clean['Title'].apply(detect_language)

        # Only rows with a positively detected non-English language are dropped
        non_english_mask = (detected != 'en') & detected.notna()
        stats['non_english_titles'] = int(non_english_mask.sum())
        df_clean = df_clean[~non_english_mask].copy()

    stats['remaining_row_count'] = len(df_clean)

    return df_clean, stats

def print_cleaning_stats(stats):
    """
    Print statistics about the cleaning process

    Parameters:
    stats (dict): Dictionary containing cleaning statistics with the keys
        'original_row_count', 'missing_titles', 'non_english_titles' and
        'remaining_row_count'
    """
    original_rows = stats['original_row_count']
    remaining_rows = stats['remaining_row_count']

    print(f"Original number of rows: {original_rows}")
    print(f"Rows with missing titles: {stats['missing_titles']}")
    print(f"Rows with non-English titles: {stats['non_english_titles']}")
    print(f"Remaining rows after cleaning: {remaining_rows}")

    if original_rows:
        print(f"Percentage of data retained: {remaining_rows / original_rows * 100:.2f}%")
    else:
        # An empty input has no meaningful retention ratio; avoid dividing by zero
        print("Percentage of data retained: N/A (original dataset was empty)")
    
def _print_dataset_analysis(df, analysis):
    """Print the human-readable report for the results collected by analyze_dataset."""
    print("\n=== Dataset Analysis ===")
    print(f"Number of rows: {analysis['row_count']}")
    print(f"Number of columns: {analysis['column_count']}")

    print("\n--- Missing Values ---")
    for col, count in sorted(analysis['missing_values'].items(), key=lambda x: x[1], reverse=True):
        if count > 0:
            print(f"{col}: {count} missing values ({analysis['missing_percentage'][col]}%)")

    print("\n--- Data Types ---")
    for col, dtype in analysis['dtypes'].items():
        print(f"{col}: {dtype}")

    print("\n--- List Columns ---")
    for col in analysis['list_columns']:
        # Get sample lengths of lists in this column
        sample_lengths = df[col].dropna().head(5).map(lambda x: len(x) if isinstance(x, list) else 0)
        print(f"{col}: (sample lengths: {list(sample_lengths)})")

    if 'publication_years' in analysis:
        print("\n--- Publication Years ---")
        print(f"Range: {analysis['min_year']} to {analysis['max_year']}")

    if 'document_types' in analysis:
        print("\n--- Document Types ---")
        for doc_type, count in sorted(analysis['document_types'].items(), key=lambda x: x[1], reverse=True):
            print(f"{doc_type}: {count} ({count/analysis['row_count']*100:.2f}%)")

    if 'citation_stats' in analysis:
        print("\n--- Citation Statistics ---")
        for stat, value in analysis['citation_stats'].items():
            print(f"{stat.capitalize()}: {value:.2f}")

    if 'language_counts' in analysis:
        print("\n--- Detected Languages ---")
        total = sum(analysis['language_counts'].values())
        # Only the ten most frequent languages are listed
        for lang, count in sorted(analysis['language_counts'].items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"{lang}: {count} ({count/total*100:.2f}%)")


def analyze_dataset(df, verbose=True):
    """
    Perform analysis on the dataset to understand its structure and quality

    Side effect: when a 'Publication Year' column is present, a nullable-integer
    'decade' column is added to ``df`` in place (other cells of this notebook
    read that column from the same DataFrame).

    Parameters:
    df (pandas.DataFrame): Input DataFrame
    verbose (bool): Whether to print detailed information

    Returns:
    dict: Analysis results
    """
    analysis = {}

    # Basic information (captured before the 'decade' column is added)
    analysis['row_count'] = len(df)
    analysis['column_count'] = len(df.columns)

    # Missing values
    missing_values = df.isna().sum()
    missing_percentage = (missing_values / len(df) * 100).round(2)
    analysis['missing_values'] = dict(zip(df.columns, missing_values))
    analysis['missing_percentage'] = dict(zip(df.columns, missing_percentage))

    # Data types
    analysis['dtypes'] = dict(zip(df.columns, df.dtypes.astype(str)))

    # Identify columns with list values (judged by the first non-null value)
    analysis['list_columns'] = [
        col for col in df.columns
        if df[col].dropna().head(1).map(lambda x: isinstance(x, list)).any()
    ]

    # Publication years distribution
    if 'Publication Year' in df.columns:
        year_counts = df['Publication Year'].value_counts().sort_index()
        analysis['publication_years'] = dict(zip(year_counts.index, year_counts.values))
        analysis['min_year'] = df['Publication Year'].min()
        analysis['max_year'] = df['Publication Year'].max()

        print("Calculating Decade column")
        # Floor division by 10 and then multiply by 10 to get the decade;
        # pandas nullable Int64 keeps missing years as <NA>
        df['decade'] = ((df['Publication Year'] // 10) * 10).astype('Int64')

    # Document types distribution
    if 'Type' in df.columns:
        type_counts = df['Type'].value_counts()
        analysis['document_types'] = dict(zip(type_counts.index, type_counts.values))

    # Citation count statistics
    if 'Citation Count' in df.columns:
        citations = df['Citation Count']
        analysis['citation_stats'] = {
            'min': citations.min(),
            'max': citations.max(),
            'mean': citations.mean(),
            'median': citations.median(),
            'std': citations.std()
        }

    # Language detection summary (if available)
    if 'detected_language' in df.columns:
        lang_counts = df['detected_language'].value_counts()
        analysis['language_counts'] = dict(zip(lang_counts.index, lang_counts.values))

    # Print detailed analysis if requested
    if verbose:
        _print_dataset_analysis(df, analysis)

    return analysis

# Example usage:
if __name__ == "__main__":
    # End-to-end cleaning pipeline for the module-level `df`: analyze, convert
    # types, drop rows without a title, drop empty columns, then save the result.
    # The step order matters: each step consumes the previous step's output.

    # Analyze the raw dataset (this also adds a 'decade' column to `df` in place
    # when 'Publication Year' is present)
    print("\n==== Raw Dataset Analysis ====")
    raw_analysis = analyze_dataset(df)
    
    # Preprocess the dataframe to ensure correct data types
    print("\nPreprocessing data types...")
    df = preprocess_dataframe(df)
    
    # Analyze after type conversion
    print("\n==== Dataset Analysis After Type Conversion ====")
    preprocessed_analysis = analyze_dataset(df)
    
    # Clean titles: only rows with missing titles are removed here, because
    # english_only=False disables the language filter
    print("\nCleaning titles...")
    df_clean, stats = clean_titles(df, remove_missing=True, english_only=False)


    
    # Print cleaning statistics
    print("\nCleaning statistics:")
    print_cleaning_stats(stats)

    # Drop columns that are entirely empty (runs on the title-cleaned data,
    # i.e. after preprocessing and title cleaning)
    print("\nChecking for empty columns...")
    df_clean, dropped_cols = drop_empty_columns(df_clean)
    print(f"DataFrame shape after dropping empty columns: {df_clean.shape}")
    
    # Analyze the cleaned dataset
    print("\n==== Cleaned Dataset Analysis ====")
    cleaned_analysis = analyze_dataset(df_clean)


    
    # Show some sample data after cleaning
    print("\nSample of cleaned data:")
    print(df_clean.head())
    
    # Save the cleaned dataset; `df` is rebound so later cells work on the cleaned data.
    # NOTE(review): this overwrites the same file that `df` was loaded from — confirm intended.
    df=df_clean
    df_clean.to_csv(unified_cleaned_df_path, index=False)

    print(f"\nCleaned dataset saved to: {unified_cleaned_df_path}")

# General analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

def check_decade_column(df):
    """
    Validate that the DataFrame carries a 'decade' column.

    Parameters:
    df (pandas.DataFrame): DataFrame to check

    Returns:
    None

    Raises:
    ValueError: If the 'decade' column is absent
    """
    has_decade = 'decade' in df.columns
    if has_decade:
        return
    raise ValueError("DataFrame must contain 'decade' column")

def count_papers_by_decade(df):
    """
    Tally how many papers fall into each decade.

    Parameters:
    df (pandas.DataFrame): DataFrame with 'decade' column

    Returns:
    pandas.Series: Paper counts indexed by decade, ordered by decade

    Raises:
    ValueError: If the 'decade' column is missing (via check_decade_column)
    """
    check_decade_column(df)

    # value_counts orders by frequency; re-order chronologically
    return df['decade'].value_counts().sort_index()

def plot_papers_per_decade_bar(df, subject_main_dir):
    """
    Draw and save a bar chart of paper counts per decade, with an 'n=...' label
    above each bar and darker blue shades for larger counts.

    Parameters:
    df (pandas.DataFrame): DataFrame with publication data
    subject_main_dir (str/Path): Main directory for saving plots

    Returns:
    pandas.Series: Paper counts indexed by decade
    """
    decade_counts = count_papers_by_decade(df)
    counts = decade_counts.values
    n_bars = len(decade_counts)

    # Figure setup
    plt.figure(figsize=(14, 8))
    sns.set_style("whitegrid")

    # Rank of every bar by its count (0 = smallest); the rank selects the shade
    ascending_order = np.argsort(counts)
    ranks = np.zeros(n_bars, dtype=int)
    ranks[ascending_order] = np.arange(n_bars)

    shades = sns.color_palette("Blues", n_bars)
    bar_colors = [shades[rank] for rank in ranks]

    ax = sns.barplot(x=decade_counts.index, y=counts,
                     palette=bar_colors)

    # Count labels sit slightly above each bar (1% of the tallest bar)
    for position, count in enumerate(counts):
        label_height = count + (max(counts) * 0.01)
        ax.text(position, label_height, f'n={count:,}',
                ha='center', va='bottom', color='black', fontsize=9)

    # Axis labels, title and horizontal tick labels
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Number of Papers', fontsize=12)
    plt.title('Number of Papers Published by Decade', fontsize=14)
    plt.xticks(rotation=0)

    save_plot(plt, 'papers_per_decade_bar.png', subject_main_dir, 'general_analysis')

    return decade_counts
    
def plot_papers_per_decade_line(df, subject_main_dir):
    """
    Draw and save a line chart of paper counts per decade, overlaid with a
    linear trend line and an R² annotation.

    Parameters:
    df (pandas.DataFrame): DataFrame with publication data
    subject_main_dir (str/Path): Main directory for saving plots

    Returns:
    pandas.Series: Paper counts indexed by decade
    """
    decade_counts = count_papers_by_decade(df)
    decades = decade_counts.index.astype(int)
    counts = decade_counts.values

    # Figure setup
    plt.figure(figsize=(12, 7))
    sns.set_style("whitegrid")

    # Observed counts
    plt.plot(decades, counts, 'o-', linewidth=2, markersize=10, color='#1f77b4')

    # Least-squares straight line through the counts
    z = np.polyfit(decades, counts, 1)
    trend = np.poly1d(z)
    trend_label = f'Trend line (y = {z[0]:.2f}x + {z[1]:.2f})'
    plt.plot(decades, trend(decades), "r--", linewidth=2,
             label=trend_label)

    # R² is the squared Pearson correlation between decade and count
    r_squared = np.corrcoef(decades, counts)[0, 1] ** 2
    annotation_box = dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8)
    plt.annotate(f'R² = {r_squared:.4f}',
                 xy=(0.05, 0.95),
                 xycoords='axes fraction',
                 fontsize=10,
                 bbox=annotation_box)

    # Axis labels, title, legend; one tick per decade
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Number of Papers', fontsize=12)
    plt.title('Growth of Publication Volume by Decade', fontsize=14)
    plt.legend()
    plt.xticks(decades)

    save_plot(plt, 'papers_per_decade_line.png', subject_main_dir, 'general_analysis')

    return decade_counts

def analyze_decades(df, subject_main_dir):
    """
    Run the decade analysis: save the bar and line charts, then print per-decade
    paper counts and decade-over-decade growth rates.

    Parameters:
    df (pandas.DataFrame): DataFrame with publication data
    subject_main_dir (str/Path): Main directory for saving plots

    Returns:
    pandas.Series: Series with decade counts

    Raises:
    ValueError: If the 'decade' column is missing
    """
    check_decade_column(df)

    # Both plotting helpers return the same counts; the bar chart's copy is reported
    counts_by_decade = plot_papers_per_decade_bar(df, subject_main_dir)
    plot_papers_per_decade_line(df, subject_main_dir)

    print("\nDecade Publication Statistics:")
    for decade, count in counts_by_decade.items():
        print(f"{decade}s: {count:,} papers")

    # Relative change between each pair of consecutive decades
    ordered_decades = counts_by_decade.index.sort_values()
    print("\nDecade-over-Decade Growth Rates:")
    for earlier, later in zip(ordered_decades[:-1], ordered_decades[1:]):
        previous_count = counts_by_decade[earlier]
        current_count = counts_by_decade[later]
        growth_rate = ((current_count - previous_count) / previous_count) * 100
        print(f"{earlier}s to {later}s: {growth_rate:.2f}%")

    return counts_by_decade

# Run the decade analysis on the module-level `df`; the charts are saved under
# subject_main_dir/general_analysis. Raises ValueError if `df` has no 'decade' column.
decade_counts = analyze_decades(df, subject_main_dir)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import langdetect
from collections import Counter

# Utility function for horizontal bar charts
def plot_horizontal_bar_chart(data, title, xlabel, ylabel, filename, subject_main_dir, subject_type, 
                             figsize=(10, 8), color='skyblue', edgecolor='navy', min_count=None, 
                             top_n=None, sort_by_value=True, show_count=True):
    """
    Render a horizontal bar chart of category counts and save it via save_plot.

    Parameters:
    data (dict or Counter): Dictionary with categories as keys and counts as values
    title (str): Chart title
    xlabel (str): X-axis label (typically count or frequency)
    ylabel (str): Y-axis label (typically category name)
    filename (str): Name of the file to save
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis (used as the output sub-directory)
    figsize (tuple): Figure size in inches (width, height)
    color (str): Bar color
    edgecolor (str): Bar edge color
    min_count (int, optional): Minimum count to include in visualization
    top_n (int, optional): Only show top N categories by count
    sort_by_value (bool): Whether to sort by count (True) or alphabetically (False)
    show_count (bool): Whether to display count values on bars

    Returns:
    The matplotlib.pyplot module (the figure itself is closed by save_plot)
    """
    # Drop categories below the threshold, if one was given
    if min_count is not None:
        data = {k: v for k, v in data.items() if v >= min_count}

    chart_data = pd.DataFrame(list(data.items()), columns=['Category', 'Count'])

    # Ascending order, by count or by category name
    sort_column = 'Count' if sort_by_value else 'Category'
    chart_data = chart_data.sort_values(sort_column, ascending=True)

    # Optional truncation to top_n rows
    if top_n is not None and len(chart_data) > top_n:
        if sort_by_value:
            # Keep the highest counts, still ordered ascending for plotting
            chart_data = chart_data.nlargest(top_n, 'Count').sort_values('Count', ascending=True)
        else:
            # Alphabetical mode keeps the first top_n names
            chart_data = chart_data.head(top_n)

    plt.figure(figsize=figsize)
    bars = plt.barh(chart_data['Category'], chart_data['Count'], color=color, edgecolor=edgecolor)

    if show_count:
        for bar in bars:
            bar_width = bar.get_width()
            label_x = bar_width + (bar_width * 0.02)  # small gap after the bar end
            label_y = bar.get_y() + bar.get_height()/2
            plt.text(label_x, label_y,
                     f'{bar_width:,}',  # thousands separators
                     va='center')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()

    save_plot(plt, filename, subject_main_dir, subject_type)

    return plt

# Task-specific function for language distribution visualization
def visualize_language_distribution(df, text_column, subject_main_dir, 
                                   top_n=15, min_count=5):
    """
    Visualize the distribution of languages in the dataset

    Side effect: when ``df`` has no 'detected_language' column, one is added to
    ``df`` in place (filled by detect_language) so later calls can reuse it.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    text_column (str): The column name containing text to analyze for language
    subject_main_dir (str/Path): Main directory for all subject analyses
    top_n (int): Number of top languages to display
    min_count (int): Minimum count to include in visualization

    Returns:
    dict: Counts keyed by language display name (or by the raw code when the
    code has no display name)
    """
    print(f"Analyzing language distribution in {len(df)} texts...")

    # Check if language column already exists
    lang_column = 'detected_language'
    if lang_column not in df.columns:
        # Apply language detection
        print("Detecting languages...")
        df[lang_column] = df[text_column].apply(detect_language)
        print("Language detection complete.")

    # Count languages (rows where detection failed are skipped)
    language_counts = Counter(df[lang_column].dropna())

    # Map language codes to full names for better readability
    language_names = {
        'en': 'English',
        'es': 'Spanish',
        'fr': 'French',
        'de': 'German',
        'it': 'Italian',
        'pt': 'Portuguese',
        'nl': 'Dutch',
        'ru': 'Russian',
        'zh': 'Chinese',
        # langdetect reports Chinese as 'zh-cn' / 'zh-tw' rather than plain 'zh'
        'zh-cn': 'Chinese (Simplified)',
        'zh-tw': 'Chinese (Traditional)',
        'ja': 'Japanese',
        'ko': 'Korean',
        'ar': 'Arabic',
        'hi': 'Hindi',
        'sv': 'Swedish',
        'fi': 'Finnish',
        'no': 'Norwegian',
        'da': 'Danish',
        'cs': 'Czech',
        'pl': 'Polish',
        'tr': 'Turkish',
        # Add more mappings as needed
    }

    # Replace codes with full names where possible; counts are summed so that
    # two codes sharing one display name cannot overwrite each other
    named_counts = {}
    for code, count in language_counts.items():
        name = language_names.get(code, code)  # Use code as fallback if not in mapping
        named_counts[name] = named_counts.get(name, 0) + count

    # Create visualization
    plot_horizontal_bar_chart(
        data=named_counts,
        title='Distribution of Languages',
        xlabel='Count',
        ylabel='Language',
        filename='language_distribution.png',
        subject_main_dir=subject_main_dir,
        subject_type='languages',
        figsize=(12, 10),
        color='lightseagreen',
        edgecolor='teal',
        min_count=min_count,
        top_n=top_n,
        sort_by_value=True,
        show_count=True
    )

    # Print summary
    total_documents = sum(language_counts.values())
    print(f"Total documents with detected language: {total_documents}")
    print(f"Number of unique languages detected: {len(language_counts)}")

    # Return the counts for further analysis if needed
    return named_counts

# Plot the language distribution of the 'Title' column of the module-level `df`.
# The call adds a 'detected_language' column to `df` when it is not already present;
# the returned counts are not kept here.
visualize_language_distribution(df, 'Title', subject_main_dir)

## Data statistics

In [None]:
import pandas as pd
import numpy as np
import re
import ast

def _as_list(value):
    """
    Coerce one cell to a list: real lists pass through, strings are parsed with
    parse_string_list, and anything else (NaN, None, unparsable or non-list
    literals) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    parsed = parse_string_list(value)
    return parsed if isinstance(parsed, list) else []


def _unique_normalized_labels(parsed_column, flatten=False, strip_score=False):
    """
    Collect the distinct labels of a column of lists, compared case-insensitively
    and without punctuation. With flatten=True one level of nested lists is
    unpacked first; with strip_score=True a trailing "(score: ...)" is removed.
    """
    unique_values = set()
    for items in parsed_column:
        if not isinstance(items, list):
            continue
        if flatten:
            flattened = []
            for subitem in items:
                if isinstance(subitem, list):
                    flattened.extend(subitem)
                else:
                    flattened.append(subitem)
            items = flattened
        for item in items:
            if not isinstance(item, str) or not item.strip():
                continue
            if strip_score and '(score:' in item:
                item = re.sub(r'\s*\(score:.*?\)', '', item)
            normalized = re.sub(r'[^\w\s]', '', item.lower()).strip()
            if normalized:
                unique_values.add(normalized)
    return unique_values


def analyze_academic_dataset(df):
    """
    Compute missing-value percentages and summary statistics for the academic dataset.

    List-valued fields may be stored either as string representations of lists
    (as read from CSV) or as real Python lists (after preprocessing); both are
    handled.

    Parameters:
    df (pandas.DataFrame): Dataset with the columns Title, Abstract,
        Citation Count, Publication Year, DOI, Authors, Journal, Publisher,
        Type, Institutions, Countries, Concepts, Sub-fields, Topics, Domains
        and Fields

    Returns:
    dict: Keys 'title_na_percentage', 'abstract_na_percentage', 'numeric_stats',
    'doi_na_percentage', 'authors_stats', 'categorical_stats' and 'list_stats'
    """
    results = {}

    # 1. Title and Abstract Analysis
    results['title_na_percentage'] = (df['Title'].isna().sum() / len(df)) * 100

    # Consider "No abstract available" as missing for Abstract
    abstract_na = df['Abstract'].isna() | df['Abstract'].eq('No abstract available')
    results['abstract_na_percentage'] = (abstract_na.sum() / len(df)) * 100

    # 2. Citation Count and Publication Year Statistics
    numeric_stats = {}
    for col in ['Citation Count', 'Publication Year']:
        numeric_stats[col] = {
            'median': df[col].median(),
            'std_dev': df[col].std(),
            'min': df[col].min(),
            'max': df[col].max()
        }
    results['numeric_stats'] = numeric_stats

    # 3. DOI Analysis
    results['doi_na_percentage'] = (df['DOI'].isna().sum() / len(df)) * 100

    # 4. AUTHORS ANALYSIS
    parsed_authors = df['Authors'].apply(_as_list)

    # Count NA values (empty lists or NaN)
    authors_na = df['Authors'].isna() | parsed_authors.apply(lambda x: len(x) == 0)
    authors_na_percentage = (authors_na.sum() / len(df)) * 100

    # Extract unique authors (removing URLs); non-string entries are ignored
    unique_authors = set()
    for authors_list in parsed_authors:
        for author in authors_list:
            if not isinstance(author, str):
                continue
            # Remove the URL part if present
            clean_author = re.sub(r'\s*\(https://.*?\)', '', author).strip()
            if clean_author:
                unique_authors.add(clean_author)

    results['authors_stats'] = {
        'unique_authors': len(unique_authors),
        'na_percentage': authors_na_percentage
    }

    # 5. Categorical fields
    categorical_stats = {}
    for col in ['Journal', 'Publisher', 'Type']:
        categorical_stats[col] = {
            'unique_values': df[col].dropna().nunique(),
            'na_percentage': (df[col].isna().sum() / len(df)) * 100
        }

    # 6. Special case: INSTITUTIONS and COUNTRIES (nested lists)
    for col in ['Institutions', 'Countries']:
        parsed_col = df[col].apply(_as_list)

        # Count NA/empty: [] and [[]] both mean "no value"
        is_empty = df[col].isna() | parsed_col.apply(lambda x: x == [] or x == [[]])

        categorical_stats[col] = {
            'unique_values': len(_unique_normalized_labels(parsed_col, flatten=True)),
            'na_percentage': (is_empty.sum() / len(df)) * 100
        }

    results['categorical_stats'] = categorical_stats

    # 7. List columns analysis (Concepts, Sub-fields, Topics, Domains, Fields)
    list_stats = {}
    for col in ['Concepts', 'Sub-fields', 'Topics', 'Domains', 'Fields']:
        parsed_col = df[col].apply(_as_list)

        # Count NA/empty
        is_empty = df[col].isna() | parsed_col.apply(lambda x: len(x) == 0)

        # For concepts, the "(score: ...)" suffix is removed before comparing
        unique_values = _unique_normalized_labels(parsed_col, strip_score=(col == 'Concepts'))

        list_stats[col] = {
            'unique_values': len(unique_values),
            'na_percentage': (is_empty.sum() / len(df)) * 100
        }

    results['list_stats'] = list_stats

    return results

def parse_string_list(s):
    """
    Safely convert a string representation of a list to an actual list

    Parameters:
    s: Value to parse; anything that is not a str yields an empty list

    Returns:
    list: The parsed list. Tuples and sets are converted to lists. Any other
    literal (number, quoted string, dict, ...) and any input that cannot be
    parsed yields an empty list, so callers can always use len() and iterate.
    """
    if not isinstance(s, str):
        return []

    try:
        # literal_eval only evaluates Python literals, it never executes code
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # If parsing fails, return empty list
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)

    # A valid literal that is not a collection of items (e.g. "123")
    return []

def generate_latex_table(analysis_results):
    """Render the dataset analysis results as a LaTeX table and return it as a string"""
    # Table preamble and header row
    parts = [r"""\begin{table}[t]
  \caption{Dataset Features Description}
  \label{tab:dataset_description}
  \begin{tabular}{llcl}\toprule
    \textbf{Feature} & \textbf{Type} & \textbf{Missing (\%)} & \textbf{Statistics} \\ \midrule
"""]

    # Text features
    parts.append(f"    Title & Text & {analysis_results['title_na_percentage']:.1f}\\% & - \\\\\n")
    parts.append(f"    Abstract & Text & {analysis_results['abstract_na_percentage']:.1f}\\% & Includes 'No abstract available' as missing \\\\\n")

    # Numeric features: one row each, no missing-value figure
    for feature in ('Citation Count', 'Publication Year'):
        summary = analysis_results['numeric_stats'][feature]
        parts.append(
            f"    {feature} & Numeric & - & Median: {summary['median']:.1f}, Std: {summary['std_dev']:.1f}, "
            f"Min: {summary['min']}, Max: {summary['max']} \\\\\n"
        )

    # DOI
    parts.append(f"    DOI & String & {analysis_results['doi_na_percentage']:.1f}\\% & - \\\\\n")

    # Authors
    author_summary = analysis_results['authors_stats']
    parts.append(f"    Authors & List & {author_summary['na_percentage']:.1f}\\% & Unique authors: {author_summary['unique_authors']} \\\\\n")

    # Categorical features, in the order stored in the results
    for feature, summary in analysis_results['categorical_stats'].items():
        parts.append(f"    {feature} & Categorical & {summary['na_percentage']:.1f}\\% & Unique values: {summary['unique_values']} \\\\\n")

    # List-valued features, in the order stored in the results
    for feature, summary in analysis_results['list_stats'].items():
        parts.append(f"    {feature} & List & {summary['na_percentage']:.1f}\\% & Unique values: {summary['unique_values']} \\\\\n")

    # Table footer (no trailing newline after \end{table})
    parts.append(r"  \bottomrule" + "\n")
    parts.append(r"  \end{tabular}" + "\n")
    parts.append(r"\end{table}")

    return "".join(parts)

# Usage: compute the dataset-level statistics for the loaded dataframe and
# print the resulting LaTeX table.
results = analyze_academic_dataset(df)
latex = generate_latex_table(results)
print(latex)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def analyze_decade_column(df):
    """
    Analyze the 'decade' column in the academic papers dataset.

    Values that cannot be converted to a number are counted as missing.

    Parameters:
    df (DataFrame): DataFrame containing a 'decade' column

    Returns:
    dict: 'min', 'max', 'mode', 'count', 'missing', 'missing_percentage',
    'counts_by_decade', 'most_papers_decade', 'most_papers_count',
    'growth_rates' and, only when at least one growth rate exists,
    'avg_growth_rate'. When the column holds no numeric value, 'mode' and
    'most_papers_decade' are None and 'most_papers_count' is 0.
    """
    # Make sure we're working with a clean numeric series
    decade_series = pd.to_numeric(df['decade'], errors='coerce')

    total_rows = len(df)
    missing = decade_series.isna().sum()

    # mode() is empty when every value is missing, so guard the lookup
    modes = decade_series.mode()

    # Basic statistics
    decade_stats = {
        'min': decade_series.min(),
        'max': decade_series.max(),
        'mode': modes.iloc[0] if not modes.empty else None,  # Most common decade
        'count': decade_series.count(),  # Number of non-NA values
        'missing': missing,
        'missing_percentage': (missing / total_rows) * 100 if total_rows else 0.0
    }

    # Papers per decade (for distribution analysis)
    decade_counts = decade_series.value_counts().sort_index()
    decade_stats['counts_by_decade'] = decade_counts.to_dict()

    # Decade with the highest number of papers (idxmax fails on an empty series)
    if decade_counts.empty:
        decade_stats['most_papers_decade'] = None
        decade_stats['most_papers_count'] = 0
    else:
        max_decade = decade_counts.idxmax()
        decade_stats['most_papers_decade'] = max_decade
        decade_stats['most_papers_count'] = decade_counts[max_decade]

    # Percentage change in paper count from each decade to the next one present
    decade_growth = {}
    previous_count = None
    for decade, count in decade_counts.items():
        if previous_count is not None and previous_count > 0:
            decade_growth[decade] = ((count - previous_count) / previous_count) * 100
        previous_count = count

    decade_stats['growth_rates'] = decade_growth

    # Average growth rate; the key is omitted when fewer than two decades exist
    if decade_growth:
        decade_stats['avg_growth_rate'] = sum(decade_growth.values()) / len(decade_growth)

    return decade_stats

def _format_decade(value):
    """Format a decade for display, dropping a trailing '.0' from whole-number floats."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value)
    if as_float.is_integer():
        return str(int(as_float))
    # NaN, infinities and fractional values are shown unchanged
    return str(value)


def generate_decade_table_row(decade_stats):
    """
    Generate a LaTeX table row for the decade column.

    Parameters:
    decade_stats (dict): Must provide 'min', 'max', 'most_papers_decade',
        'most_papers_count' and 'missing_percentage'

    Returns:
    str: One table row terminated by a LaTeX line break and a newline.
        Decades stored as whole-number floats (e.g. 1990.0) are written
        without the decimal part.
    """
    min_decade = _format_decade(decade_stats['min'])
    max_decade = _format_decade(decade_stats['max'])
    most_papers_decade = _format_decade(decade_stats['most_papers_decade'])
    most_papers_count = decade_stats['most_papers_count']

    latex_row = (
        f"    Decade & Numeric & {decade_stats['missing_percentage']:.1f}\\% & "
        f"Range: {min_decade}-{max_decade}, "
        f"Most papers in {most_papers_decade}s ({most_papers_count})"
        " \\\\\n"
    )

    return latex_row

# Example usage:
# decade_stats = analyze_decade_column(raw_df)
# decade_row = generate_decade_table_row(decade_stats)
# print(decade_row)

# Optional: Create a visualization of papers per decade
def plot_papers_per_decade(decade_stats, output_path='analyze_dataset/decade/papers_per_decade.png'):
    """
    Create a bar chart of papers per decade and save it as an image.

    Parameters:
    decade_stats (dict): Statistics dictionary; only 'counts_by_decade'
        (mapping decade -> number of papers) is read
    output_path (str): File the chart is written to. The default is the
        location that used to be hard-coded.
    """
    import os

    decades = decade_stats['counts_by_decade']

    # Create the target directory up front so saving does not fail on a fresh checkout
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    plt.figure(figsize=(12, 6))
    try:
        plt.bar(list(decades.keys()), list(decades.values()))
        plt.xlabel('Decade')
        plt.ylabel('Number of Papers')
        plt.title('Distribution of Papers by Decade')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(output_path, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close()
# Run the decade analysis on the loaded dataframe: print the LaTeX row and
# save the papers-per-decade bar chart.
decade_stats = analyze_decade_column(df)
decade_row = generate_decade_table_row(decade_stats)
print(decade_row)
plot_papers_per_decade(decade_stats)

In [None]:
# Percentage of rows whose 'OpenAlex ID' value is missing
oa_na=(df['OpenAlex ID'].isna().sum() / len(df)) * 100
print(oa_na)

# Text column analysis

## Text column utils

### Preprocess utils

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import langdetect

# Ensure NLTK resources are available. Each resource is downloaded only when it
# is missing, so re-running this cell does not trigger a download for data that
# is already installed.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# Stopword sets already loaded, keyed by language, so the corpus is read once
# per language instead of once per text
_STOPWORD_CACHE = {}

# Extra tokens removed together with the standard stopwords. All entries are
# lowercase and punctuation-free because preprocess_text lowercases the text and
# strips punctuation before filtering, so any other form could never match.
_CUSTOM_ACADEMIC_STOPWORDS = frozenset([
    'pp', 'vol', 'eds', 'ed', 'new', 'review', 'content', 'access', 'image',
    'sizeclick', 'decrease', 'click', 'size', 'increase', 'informationnotes',
    'additional',
])


def _get_stop_words(language):
    """Return the stopword set for a language, loading it on first use."""
    if language not in _STOPWORD_CACHE:
        words = set(stopwords.words(language))
        words.update(_CUSTOM_ACADEMIC_STOPWORDS)
        _STOPWORD_CACHE[language] = frozenset(words)
    return _STOPWORD_CACHE[language]


def preprocess_text(text, remove_stopwords=True, language='english', custom_stopwords=None):
    """
    Text preprocessing: lowercase, strip punctuation and digits, tokenize and
    optionally remove stopwords

    Parameters:
    text (str): Text to preprocess; any non-string value yields ""
    remove_stopwords (bool): Whether to remove stopwords
    language (str): Language for stopwords removal (e.g., 'english')
    custom_stopwords (iterable, optional): Additional tokens to remove

    Returns:
    str: Remaining tokens joined by single spaces. If the stopword list for
    the language cannot be loaded, stopword removal is skipped.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    if remove_stopwords:
        try:
            stop_words = _get_stop_words(language)
        except (OSError, LookupError):
            # Unsupported language or missing corpus: skip stopword removal
            stop_words = None

        if stop_words is not None:
            # Add custom stopwords without touching the cached set
            if custom_stopwords:
                stop_words = stop_words | set(custom_stopwords)
            tokens = [word for word in tokens if word not in stop_words]

    # Rejoin tokens
    return ' '.join(tokens)

def filter_by_language(df, column_name, target_language='en', drop_other_languages=True,
                       filtered_titles_path=filtered_titles_df_path, filtered_abstracts_path=filtered_abstracts_df_path):
    """
    Filter a dataframe based on the detected language of a text column

    For the "Title" and "Abstract" columns a previously filtered CSV is loaded
    from filtered_titles_path / filtered_abstracts_path when it can be read.
    NOTE: in that case df, target_language and drop_other_languages are not
    used and the returned statistics are an empty dict. This function only
    reads those files, it never writes them.

    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    target_language (str): Target language code to keep
    drop_other_languages (bool): Whether to drop rows with non-target languages
    filtered_titles_path (str): CSV with an already filtered dataset for "Title"
    filtered_abstracts_path (str): CSV with an already filtered dataset for "Abstract"

    Returns:
    tuple: (Filtered DataFrame, language statistics)
    """
    print("filtering languages...")

    # Reuse a previously filtered dataset when one exists for this column
    cached_paths = {"Title": filtered_titles_path, "Abstract": filtered_abstracts_path}
    cached_path = cached_paths.get(column_name)
    if cached_path is not None:
        try:
            cached_df = pd.read_csv(cached_path)
        except (OSError, ValueError):
            # Missing, unreadable or malformed file (pandas parser errors are
            # ValueError subclasses): fall through and detect languages again
            pass
        else:
            print("Loaded filtered dataset...")
            return cached_df, {}

    # Copy dataframe to avoid modifying original
    filtered_df = df.copy()

    # Add language column
    filtered_df['detected_language'] = filtered_df[column_name].apply(detect_language)
    detected = filtered_df['detected_language']

    # Get language statistics before filtering
    language_counts = detected.value_counts().to_dict()

    # Count rows before filtering
    total_rows_before = len(filtered_df)
    total_valid_texts = int(filtered_df[column_name].apply(lambda x: isinstance(x, str)).sum())

    # Language statistics before filtering
    language_stats = {
        'language_counts': language_counts,
        'total_texts': total_valid_texts,
        'language_detected': int(detected.notna().sum()),
        'language_not_detected': int(detected.isna().sum())
    }

    # Filter by language if requested
    if drop_other_languages:
        # Copy so callers can add columns to the result without pandas warnings
        filtered_df = filtered_df[detected == target_language].copy()

        # Count rows after filtering
        total_rows_after = len(filtered_df)
        rows_removed = total_rows_before - total_rows_after

        # Add filtering stats to language stats
        language_stats.update({
            'rows_before_filtering': total_rows_before,
            'rows_after_filtering': total_rows_after,
            'rows_removed': rows_removed,
            'percentage_removed': (rows_removed / total_rows_before) * 100 if total_rows_before > 0 else 0
        })

        # Print summary of language filtering
        print(f"\nLanguage Filtering Summary for column '{column_name}':")
        print(f"Total records analyzed: {total_valid_texts}")
        print("Languages detected:")

        for lang, count in language_counts.items():
            lang_name = lang if lang is not None else "Unknown"
            print(f"  - {lang_name}: {count} records")

        print(f"Filtering applied for language: {target_language}")
        print(f"Records before filtering: {total_rows_before}")
        print(f"Records after filtering: {total_rows_after}")
        print(f"Records removed: {rows_removed} ({language_stats['percentage_removed']:.2f}%)")

    return filtered_df, language_stats

def preprocess_dataframe_text_col(df, text_column, remove_stopwords=True, language='english', 
                      filter_language=True, target_language='en', drop_other_languages=True,
                                 output_df_path=None):
    """
    Preprocess a dataframe with text column - includes language filtering, text preprocessing,
    and special handling for terms 'anti-semitism' and 'anti-zionism'

    The cleaned text replaces the original column, and the fully preprocessed
    version is stored in a new '<text_column>_processed' column. For the
    "Abstract" column, rows whose value is "No abstract available" are dropped.

    Parameters:
    df (DataFrame): DataFrame containing the text column
    text_column (str): Name of the column to preprocess
    remove_stopwords (bool): Whether to remove stopwords
    language (str): Language for stopwords removal (e.g., 'english')
    filter_language (bool): Whether to filter by language
    target_language (str): Target language code to keep
    drop_other_languages (bool): Whether to drop rows with non-target languages
    output_df_path (str, optional): Path to save processed DataFrame

    Returns:
    tuple: (Processed DataFrame, language statistics); the statistics are None
    when filter_language is False
    """
    language_stats = None

    # First apply language filtering if requested
    if filter_language:
        processed_df, language_stats = filter_by_language(
            df, 
            text_column, 
            target_language=target_language, 
            drop_other_languages=drop_other_languages
        )
    else:
        processed_df = df

    if text_column == "Abstract":
        processed_df = processed_df[processed_df["Abstract"] != "No abstract available"]

    # Work on an independent copy: the caller's frame stays untouched and the
    # column assignments below do not operate on a filtered slice
    processed_df = processed_df.copy()

    # Replace hyphens in specific terms before general preprocessing
    processed_df[text_column] = processed_df[text_column].str.replace(
        r'anti-semitism', 'antisemitism', case=False, regex=True
    )
    processed_df[text_column] = processed_df[text_column].str.replace(
        r'anti-zionism', 'antizionism', case=False, regex=True
    )

    # Remove bibliography and academic paper-related terms
    bibliography_patterns = [
        r'\b[Pp][Pp]\.\s*\d+(-\d+)?',  # pp. 123 or pp. 123-145
        r'\b[Pp][Pp]\b',               # pp or Pp
        r'\bvol\.\s*\d+',              # vol. 123
        r'\bVol\.\s*\d+',              # Vol. 123
        r'\bissue\s*\d+',              # issue 123
        r'\bIssue\s*\d+',              # Issue 123
        r'\beditor[s]?\b',             # editor or editors
        r'\bet\s+al\.',                # et al.
        r'\bibid\.',                   # ibid.
        r'\bop\.\s*cit\.',             # op. cit.
        r'\bcf\.',                     # cf.
        r'\bvide\b',                   # vide
        r'\b[Ii]n\s+press\b',          # in press or In press
        r'\b[Ff]orthcoming\b',         # forthcoming or Forthcoming
        # The identifier patterns consume the whole following token; a lazy
        # '.*?\b' would stop right after the label and leave the identifier
        r'\bDOI:\s*\S+',               # DOI: followed by the identifier
        r'\bISBN:\s*\S+',              # ISBN: followed by the number
        r'\bISSN:\s*\S+',              # ISSN: followed by the number
        r'\bAccessed\s+on\b.*?\d{4}',  # Accessed on date
        r'\bretrieved\s+from\b'        # retrieved from
    ]

    for pattern in bibliography_patterns:
        processed_df[text_column] = processed_df[text_column].str.replace(
            pattern, '', regex=True
        )

    # Apply text preprocessing to the column
    processed_df[f'{text_column}_processed'] = processed_df[text_column].apply(
        lambda x: preprocess_text(x, remove_stopwords=remove_stopwords, language=language)
    )

    if output_df_path:
        processed_df.to_csv(output_df_path)

    return processed_df, language_stats

### Analysis utils

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path
import nltk
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import langdetect
from pathlib import Path

def count_words(text):
    """
    Return the number of tokens word_tokenize finds in a text

    Parameters:
    text (str): Text to analyze

    Returns:
    int: Token count; 0 when text is not a string
    """
    if isinstance(text, str) and not pd.isna(text):
        return len(word_tokenize(text))
    return 0

def get_most_common_words(texts, n=10, preprocess=True, remove_stopwords=True, language='english'):
    """
    Count tokens over a collection of texts and return the n most frequent

    Parameters:
    texts (list/Series): List or Series of texts to analyze; non-string entries are skipped
    n (int): Number of most common words to return
    preprocess (bool): Whether to run preprocess_text on each text before tokenizing
    remove_stopwords (bool): Whether to remove stopwords
    language (str): Language for stopwords removal

    Returns:
    dict: Mapping word -> count, most frequent first
    """
    tally = Counter()

    for entry in texts:
        # Skip anything that is not a usable string
        if not isinstance(entry, str) or pd.isna(entry):
            continue

        cleaned = preprocess_text(entry, remove_stopwords, language) if preprocess else entry
        if isinstance(cleaned, str) and not pd.isna(cleaned):
            tally.update(word_tokenize(cleaned))

    return dict(tally.most_common(n))

def create_wordcloud(texts, preprocess=True, remove_stopwords=True, language='english', max_words=100):
    """
    Build a word cloud from all string entries of a collection of texts

    Parameters:
    texts (list/Series): List or Series of texts; non-string entries are skipped
    preprocess (bool): Whether to run preprocess_text on each text first
    remove_stopwords (bool): Whether to remove stopwords
    language (str): Language for stopwords removal
    max_words (int): Maximum number of words in wordcloud

    Returns:
    WordCloud: WordCloud object
    """
    pieces = []

    for entry in texts:
        if not isinstance(entry, str) or pd.isna(entry):
            continue
        if preprocess:
            entry = preprocess_text(entry, remove_stopwords, language)
        # Every text is preceded by one space, as with plain concatenation
        pieces.append(" " + entry)

    cloud = WordCloud(
        width=800,
        height=400,
        max_words=max_words,
        background_color='white',
        collocations=False
    )

    return cloud.generate("".join(pieces))



## General text analysis

In [None]:
def text_length_stats(texts):
    """
    Summarize character and word lengths of the string entries in a collection

    Parameters:
    texts (list/Series): List or Series of texts; non-string entries are ignored

    Returns:
    dict: mean/median/min/max of character lengths ('char_*') and word counts
    ('word_*'), plus 'total_texts' with the number of string entries
    """
    # Lists are wrapped in a Series; anything else is used as given
    series = pd.Series(texts) if isinstance(texts, list) else texts

    # Keep only real strings
    is_text = series.apply(lambda value: isinstance(value, str) and not pd.isna(value))
    valid_texts = series[is_text]

    # Per-text measurements
    char_lengths = valid_texts.apply(len)
    token_counts = valid_texts.apply(count_words)

    summary = {
        'char_mean': char_lengths.mean(),
        'char_median': char_lengths.median(),
        'char_min': char_lengths.min(),
        'char_max': char_lengths.max(),
    }
    summary.update({
        'word_mean': token_counts.mean(),
        'word_median': token_counts.median(),
        'word_min': token_counts.min(),
        'word_max': token_counts.max(),
    })
    summary['total_texts'] = len(valid_texts)
    return summary

def visualize_text_length_trends_by_decade(df, text_column, subject_main_dir, subject_type, decade_column="decade"):
    """
    Visualize trends of text lengths (both characters and words) over time by decade

    Parameters:
    df (DataFrame): DataFrame containing the text and decade columns
    text_column (str): Name of the column with text to analyze
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    decade_column (str): Name of the column with decade information

    Returns:
    dict: 'column_name', 'decade_stats' (per-decade mean lengths) and
    'visualizations' on success, or a dict with a single 'error' key when the
    column holds no usable text
    """
    # Keep rows that have both a decade and a string text
    valid_df = df.dropna(subset=[text_column, decade_column])
    if len(valid_df) > 0:
        is_text = valid_df[text_column].apply(lambda x: isinstance(x, str))
        valid_df = valid_df[is_text]

    if len(valid_df) == 0:
        return {"error": f"No valid text data found in column {text_column}"}

    # Independent copy so the new columns are not assigned on a filtered slice
    valid_df = valid_df.copy()

    # Calculate character and word lengths
    valid_df['char_length'] = valid_df[text_column].str.len()
    valid_df['word_count'] = valid_df[text_column].apply(count_words)

    # Group by decade and calculate average lengths
    decade_stats = valid_df.groupby(decade_column).agg({
        'char_length': 'mean',
        'word_count': 'mean'
    }).reset_index()

    # Sort by decade
    decade_stats = decade_stats.sort_values(by=decade_column)

    # Create figure with two subplots: characters on the left, words on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    panels = (
        (ax1, 'char_length', f'Average Character Length by Decade for {text_column}', 'Average Number of Characters'),
        (ax2, 'word_count', f'Average Word Count by Decade for {text_column}', 'Average Number of Words'),
    )
    for axis, metric, title, y_label in panels:
        sns.lineplot(data=decade_stats, x=decade_column, y=metric, marker='o', ax=axis)
        axis.set_title(title)
        axis.set_xlabel('Decade')
        axis.set_ylabel(y_label)

    plt.tight_layout()

    # Save the plot
    save_plot(plt, f"{text_column}_length_trends_by_decade.png", subject_main_dir, subject_type)

    return {
        "column_name": text_column,
        "decade_stats": decade_stats.to_dict(),
        "visualizations": [
            f"{subject_type}/{text_column}_length_trends_by_decade.png"
        ]
    }

def visualize_word_frequencies(df, column_name, subject_main_dir, subject_type, n=20,
                             preprocess=True, remove_stopwords=True, language='english'):
    """
    Plot the most frequent words of a text column as a bar chart and a word cloud

    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    n (int): Number of most common words to show
    preprocess (bool): Whether to preprocess texts
    remove_stopwords (bool): Whether to remove stopwords
    language (str): Language for stopwords removal

    Returns:
    dict: 'column_name', 'top_words' and 'visualizations', or a dict with a
    single 'error' key when the column holds no string values
    """
    # Keep only the string entries of the column
    column_values = df[column_name].dropna()
    text_data = column_values[column_values.apply(lambda value: isinstance(value, str))]

    if len(text_data) == 0:
        return {"error": f"No valid text data found in column {column_name}"}

    # Most common words, most frequent first
    common_words = get_most_common_words(
        text_data,
        n=n,
        preprocess=preprocess,
        remove_stopwords=remove_stopwords,
        language=language
    )

    # Wording used in titles and file names depends on the stopword setting
    if remove_stopwords:
        stopwords_text = "without stopwords"
        stopwords_flag = "no_stopwords"
    else:
        stopwords_text = "with stopwords"
        stopwords_flag = "with_stopwords"

    # --- Bar chart -------------------------------------------------------
    plt.figure(figsize=(12, 8))

    labels = list(common_words.keys())
    frequencies = list(common_words.values())
    positions = range(len(labels))

    plt.barh(positions, frequencies, align='center')
    plt.yticks(positions, labels)

    plt.title(f'Top {n} Most Common Words in {column_name} ({stopwords_text})')
    plt.xlabel('Frequency')
    plt.ylabel('Words')

    # Write each count next to the end of its bar
    for position, frequency in enumerate(frequencies):
        plt.text(frequency + 0.1, position, str(frequency), va='center')

    plt.tight_layout()

    filename = f"{column_name}_top{n}_words_{stopwords_flag}.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # --- Word cloud ------------------------------------------------------
    wordcloud = create_wordcloud(
        text_data,
        preprocess=preprocess,
        remove_stopwords=remove_stopwords,
        language=language
    )

    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {column_name} ({stopwords_text})')

    wc_filename = f"{column_name}_wordcloud_{stopwords_flag}.png"
    save_plot(plt, wc_filename, subject_main_dir, subject_type)

    return {
        "column_name": column_name,
        "top_words": common_words,
        "visualizations": [
            f"{subject_type}/{filename}",
            f"{subject_type}/{wc_filename}"
        ]
    }

In [None]:
import pandas as pd
from pathlib import Path

def analyze_text_column(df, column_name, subject_main_dir, filter_language=True, 
                      target_language='en', remove_stopwords=True, language='english', 
                       proc_df_out_path=None, subject_type=None):
    """
    Run general text analysis on a string column

    Parameters:
    df (DataFrame): DataFrame containing the column to analyze
    column_name (str): Name of the column to analyze
    subject_main_dir (str/Path): Main directory for all subject analyses
    filter_language (bool): Whether to filter by language
    target_language (str): Target language code to keep
    remove_stopwords (bool): Whether to remove stopwords in preprocessing
    language (str): Language for stopwords removal
    proc_df_out_path (str, optional): Path to save the preprocessed DataFrame
    subject_type (str, optional): Sub-directory name for the outputs; defaults
        to the lowercased column name

    Returns:
    tuple: (results dict, processed DataFrame) on success.
    NOTE: when the column is missing or holds no string values, a dict with a
    single 'error' key is returned instead of a tuple.
    """
    # Validate that the column exists
    if column_name not in df.columns:
        return {"error": f"Column '{column_name}' not found in DataFrame"}

    # Check if column contains string values
    text_data = df[column_name].dropna()
    string_values = text_data[text_data.apply(lambda x: isinstance(x, str))]

    if len(string_values) == 0:
        return {"error": f"Column '{column_name}' does not contain any string values"}

    if not subject_type:
        # Create subject type from column name (lowercase)
        subject_type = column_name.lower()

    print(f"\nAnalyzing text column: '{column_name}'")
    print(f"Using subject type: '{subject_type}'")

    # Step 1: Preprocess the data with optional language filtering
    print("\nStep 1: Preprocessing data...")
    processed_df, language_stats = preprocess_dataframe_text_col(
        df=df,
        text_column=column_name,
        remove_stopwords=remove_stopwords,
        language=language,
        filter_language=filter_language,
        target_language=target_language,
        drop_other_languages=filter_language,
        output_df_path=proc_df_out_path
    )

    # Step 2: Analyze text length distribution
    print("\nStep 2: Analyzing text length distribution...")
    length_results = visualize_text_length_trends_by_decade(
        df=processed_df,
        text_column=column_name,
        subject_main_dir=subject_main_dir,
        subject_type=subject_type
    )

    # Step 3: Analyze word frequencies
    print("\nStep 3: Analyzing word frequencies...")
    word_freq_results = visualize_word_frequencies(
        df=processed_df,
        column_name=column_name,
        subject_main_dir=subject_main_dir,
        subject_type=subject_type,
        n=20,
        preprocess=True,
        remove_stopwords=remove_stopwords,
        language=language
    )

    # Compile all results. The length step reports its numbers under
    # "decade_stats"; "stats" is still honoured first if it is ever present.
    results = {
        "column_name": column_name,
        "subject_type": subject_type,
        "language_stats": language_stats,
        "length_stats": length_results.get("stats", length_results.get("decade_stats")),
        "top_words": word_freq_results.get("top_words"),
        "visualizations": (
            length_results.get("visualizations", []) + 
            word_freq_results.get("visualizations", [])
        )
    }

    print(f"\nAnalysis complete for column '{column_name}'")
    print(f"Generated {len(results['visualizations'])} visualizations in {subject_main_dir}/{subject_type}/")

    return results, processed_df



## Entity recognition

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import spacy
from tqdm import tqdm

def extract_entities(df, column_name, model='en_core_web_sm', entity_types=None, min_count=1):
    """
    Extract named entities from a text column

    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    model (str): spaCy model to use for NER
    entity_types (list): List of entity types to extract (None for all)
    min_count (int): Minimum count for an entity to be included

    Returns:
    dict: 'entity_counts', 'entity_types_count', 'entity_details',
    'total_entities' and 'unique_entities'; or a dict with a single 'error'
    key when the column is missing, holds no strings, or the model cannot be
    loaded
    """
    # Validate that the column exists
    if column_name not in df.columns:
        return {"error": f"Column '{column_name}' not found in DataFrame"}

    # Extract text data, filtering out non-string and NaN values. This runs
    # before the model is loaded so an unusable column never triggers a load
    # or download.
    text_data = df[column_name].dropna()
    text_data = text_data[text_data.apply(lambda x: isinstance(x, str))]

    if len(text_data) == 0:
        return {"error": f"Column '{column_name}' does not contain any string values"}

    # Load spaCy model
    try:
        nlp = spacy.load(model)
    except OSError:
        # If model not installed, attempt to download it
        print(f"Model {model} not found. Installing...")
        import subprocess
        import sys
        # Argument list instead of a shell string: the model name is never
        # parsed by a shell, and the download runs with the interpreter that
        # is executing this code
        subprocess.call([sys.executable, "-m", "spacy", "download", model])
        try:
            nlp = spacy.load(model)
        except (OSError, ImportError):
            return {"error": f"Failed to load spaCy model {model}. Please install it manually."}

    print(f"Extracting entities from {len(text_data)} text entries...")

    # Process texts with spaCy, collecting (text, label) pairs
    all_entities = []
    for text in tqdm(text_data):
        doc = nlp(text)
        for ent in doc.ents:
            # Filter by entity type if specified
            if entity_types is None or ent.label_ in entity_types:
                all_entities.append((ent.text, ent.label_))

    # Count entities by text (the same text under different labels shares one count)
    entity_counts = Counter(entity_text for entity_text, _ in all_entities)

    # Filter by minimum count
    filtered_entities = {k: v for k, v in entity_counts.items() if v >= min_count}

    # Count by entity type
    entity_types_count = Counter(entity_type for _, entity_type in all_entities)

    # One entry per distinct (text, label) pair that reaches the minimum count
    entity_details = [
        {'text': entity_text, 'type': entity_type, 'count': entity_counts[entity_text]}
        for entity_text, entity_type in set(all_entities)
        if entity_counts[entity_text] >= min_count
    ]

    # Sort by count
    entity_details = sorted(entity_details, key=lambda x: x['count'], reverse=True)

    return {
        "entity_counts": filtered_entities,
        "entity_types_count": dict(entity_types_count),
        "entity_details": entity_details,
        "total_entities": len(all_entities),
        "unique_entities": len(filtered_entities)
    }


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def visualize_entity_wordcloud(entity_results, subject_main_dir, subject_type, max_words=100, colormap='viridis'):
    """
    Create a word cloud visualization of entities

    Parameters:
    entity_results (dict): Results from extract_entities function
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    max_words (int): Maximum number of words to include in word cloud
    colormap (str): Matplotlib colormap for word cloud

    Returns:
    str: Relative path "<subject_type>/entity_wordcloud.png" of the saved plot
    dict: {"error": message} when entity_results carries an error or contains
    no entities to draw
    """
    if "error" in entity_results:
        return {"error": entity_results["error"]}

    # Create frequency dictionary for wordcloud
    entity_freq = entity_results["entity_counts"]
    if not entity_freq:
        # A word cloud cannot be generated from an empty frequency table
        return {"error": "No entities available to build a word cloud"}

    # Create wordcloud
    wordcloud = WordCloud(
        width=800, 
        height=500, 
        max_words=max_words, 
        background_color='white',
        colormap=colormap,
        collocations=False
    ).generate_from_frequencies(entity_freq)

    # Display wordcloud
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Entity Word Cloud')
    plt.tight_layout()

    # Save plot
    filename = "entity_wordcloud.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    return f"{subject_type}/{filename}"

In [None]:
def visualize_entity_trends(entity_results, subject_main_dir, subject_type, max_entities=20):
    """
    Build the visualizations for a set of extracted entities
    
    Parameters:
    entity_results (dict): Results from extract_entities function
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    max_entities (int): Maximum number of entities to show in visualizations
        (accepted for API compatibility; not used by the code below)
    
    Returns:
    dict: {"visualizations": [...], "total_entities": ..., "unique_entities": ...},
          or {"error": message} when entity_results carries an error
    """
    # Nothing to draw if the extraction step already failed
    if "error" in entity_results:
        return {"error": entity_results["error"]}
    
    # The entity word cloud is the only visualization produced here
    wordcloud_result = visualize_entity_wordcloud(entity_results, subject_main_dir, subject_type)
    
    return {
        "visualizations": [wordcloud_result],
        "total_entities": entity_results["total_entities"],
        "unique_entities": entity_results["unique_entities"]
    }

In [None]:
def analyze_entities_in_column(df, column_name, subject_main_dir, min_count=2, max_entities=20, model='en_core_web_sm', preprocess=True,
                              proc_df_out_path=None, subject_type=None):
    """
    Extract entities from a text column, print a summary and build the visualizations
    
    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    subject_main_dir (str/Path): Main directory for all subject analyses
    min_count (int): Minimum count for an entity to be included
    max_entities (int): Maximum number of entities to show in visualizations
    model (str): spaCy model to use for NER
    preprocess (bool): When True, df is first passed through
        preprocess_dataframe_text_col and the returned DataFrame is analyzed
    proc_df_out_path (str/Path, optional): Forwarded to the preprocessing step
        as output_df_path
    subject_type (str, optional): Output sub-directory name; defaults to the
        lower-cased column name
    
    Returns:
    dict: Combined results from entity extraction and visualization, or the
          extraction result itself when it contains an "error" key
    """
    # Optional preprocessing pass; the language statistics it also returns are not used here
    if preprocess:
        df, _ = preprocess_dataframe_text_col(df=df, text_column=column_name, output_df_path=proc_df_out_path)
    
    # Fall back to the lower-cased column name as the subject type
    subject_type = subject_type or column_name.lower()
    
    print(f"Running entity recognition on '{column_name}' column...")
    
    entity_results = extract_entities(
        df=df,
        column_name=column_name,
        model=model,
        entity_types=None,
        min_count=min_count
    )
    
    # Stop early and hand the failure back to the caller
    if "error" in entity_results:
        print(f"Error: {entity_results['error']}")
        return entity_results
    
    # Headline numbers
    print(f"Found {entity_results['total_entities']} total entities")
    print(f"Found {entity_results['unique_entities']} unique entities")
    
    # Five most frequent entity types
    print("\nTop entity types:")
    type_ranking = sorted(
        entity_results['entity_types_count'].items(),
        key=lambda pair: pair[1],
        reverse=True
    )
    for entity_type, count in type_ranking[:5]:
        print(f"  - {entity_type}: {count}")

    # First ten entries of entity_details
    print("\nTop entities:")
    for entity in entity_results['entity_details'][:10]:
        print(f"  - {entity['text']} ({entity['type']}): {entity['count']} occurrences")
    
    viz_results = visualize_entity_trends(
        entity_results=entity_results,
        subject_main_dir=subject_main_dir,
        subject_type=subject_type,
        max_entities=max_entities
    )
    
    generated = viz_results['visualizations']
    print(f"\nGenerated {len(generated)} visualizations:")
    for viz in generated:
        print(f"  - {viz}")
    
    return {
        "column_name": column_name,
        "subject_type": subject_type,
        "entity_results": entity_results,
        "visualization_results": viz_results
    }

## BERTopic

In [None]:
def perform_topic_modeling(df, column_name, model=None, language="english", 
                          nr_topics="auto", min_topic_size=10):
    """
    Perform topic modeling on text data using BERTopic
    
    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    model (BERTopic, optional): Pre-configured BERTopic model; when omitted one
        is created with setup_bertopic
    language (str): Language for stopwords removal and preprocessing
    nr_topics (int or str): Number of topics to find ('auto' or specific number)
    min_topic_size (int): Minimum size of topics
    
    Returns:
    tuple: (model, topics, probabilities, processed_df) on success, where
           processed_df holds only the rows that were modelled plus the
           "bertopic_topic" and "bertopic_probability" columns
    dict: {"error": message} when no usable text remains or the model cannot
          be created
    """
    # Keep only genuine strings (this also drops NaN). A positional boolean
    # array avoids any index alignment on frames with duplicate labels.
    is_text = pd.Series(
        [isinstance(value, str) for value in df[column_name]], dtype=bool
    ).to_numpy()
    processed_df = df.loc[is_text]
    
    # Placeholder abstracts carry no content. Remove them BEFORE the emptiness
    # check so a column made only of placeholders is reported as an error
    # instead of reaching fit_transform with zero documents.
    if column_name == "Abstract":
        processed_df = processed_df[processed_df["Abstract"] != "No abstract available"]
    
    if processed_df.empty:
        return {"error": f"No valid text data found in column {column_name}"}
    
    # Work on an independent copy so the new columns never touch the caller's frame
    processed_df = processed_df.copy()
    documents = processed_df[column_name].tolist()
    
    # Create model if not provided
    if model is None:
        model = setup_bertopic(language, nr_topics, min_topic_size)
        if model is None:
            return {"error": "Failed to initialize BERTopic model"}
    
    # Fit the model on our text data
    topics, probabilities = model.fit_transform(documents)
    
    # Add topics to the dataframe
    processed_df["bertopic_topic"] = topics
    
    # Reduce whatever probability structure the model returned to one value per document
    ndim = getattr(probabilities, 'ndim', None)
    if probabilities is None:
        # Some model configurations return no probabilities at all
        processed_df["bertopic_probability"] = float("nan")
    elif ndim == 2:
        # Document x topic matrix: keep the strongest topic probability per row
        processed_df["bertopic_probability"] = probabilities.max(axis=1)
    elif ndim == 1:
        # Already one probability per document
        processed_df["bertopic_probability"] = probabilities
    else:
        # List-like of arrays, sequences or plain numbers
        processed_df["bertopic_probability"] = [
            prob.max() if hasattr(prob, 'max')
            else max(prob) if hasattr(prob, '__len__') and len(prob) > 0
            else prob if isinstance(prob, (int, float))
            else 0
            for prob in probabilities
        ]
    
    return model, topics, probabilities, processed_df


def setup_bertopic(language="english", nr_topics="auto", min_topic_size=10):
    """
    Set up and configure a BERTopic model
    
    Parameters:
    language (str): Language passed to BERTopic and used to look up the NLTK
        stopword list. Values without an NLTK list (for example
        "multilingual") are accepted; stopword removal is then skipped.
    nr_topics (int or str): Number of topics to find ('auto' or specific number)
    min_topic_size (int): Minimum size of topics
    
    Returns:
    BERTopic model: Configured (unfitted) model instance, or None when a
    required package cannot be imported
    """
    try:
        from bertopic import BERTopic
        from sklearn.feature_extraction.text import CountVectorizer
        from nltk.corpus import stopwords
        import nltk
        import os
        
        # Download stopwords if needed
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        
        # Get stopwords for the specified language. NLTK raises OSError for a
        # language it has no list for and LookupError when the corpus is still
        # missing (e.g. the download failed); neither should prevent modelling.
        try:
            stop_words = stopwords.words(language)
        except (LookupError, OSError):
            print(f"No NLTK stopword list available for '{language}'; continuing without stopword removal")
            stop_words = None
        
        # Set up the vectorizer with stopwords
        vectorizer = CountVectorizer(stop_words=stop_words)
        
        # Set environment variable to avoid the tokenizers parallelism warning
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        
        # Create and return the BERTopic model
        return BERTopic(
            language=language,
            nr_topics=nr_topics,
            min_topic_size=min_topic_size,
            vectorizer_model=vectorizer
        )
    
    except ImportError:
        print("Please install required packages with: pip install bertopic nltk")
        return None


def visualize_topic_wordclouds(model, subject_main_dir, subject_type, 
                              column_name, top_n_topics=10):
    """
    Save the figure returned by model.visualize_topics for the leading topics
    
    Parameters:
    model (BERTopic): Fitted BERTopic model
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Number of top topics to visualize
    
    Returns:
    dict: {"visualization", "type", "topics"} on success, {"error": message}
          (message includes the traceback) on any failure
    """
    try:
        # Topic ids in the order reported by the model
        candidate_ids = model.get_topic_info()['Topic'].tolist()
        
        # Leave out the -1 outlier topic when the model has one
        if -1 in model.get_topics():
            candidate_ids = [topic_id for topic_id in candidate_ids if topic_id != -1]
        top_topics = candidate_ids[:top_n_topics]
        
        fig = model.visualize_topics(topics=top_topics)
        
        # Make sure the output directory exists, then write the HTML file
        from pathlib import Path
        output_dir = f"{subject_main_dir}/{subject_type}"
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        filename = f"{column_name}_topic_wordclouds.html"
        fig.write_html(f"{output_dir}/{filename}")
        
        return {
            "visualization": f"{subject_type}/{filename}",
            "type": "topic_wordclouds",
            "topics": top_topics
        }
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic wordclouds: {str(e)}\n{traceback.format_exc()}"}


def visualize_topic_barchart(model, subject_main_dir, subject_type, column_name, n_topics=10):
    """
    Save the bar-chart figure of top terms per topic as an HTML file
    
    Parameters:
    model (BERTopic): Fitted BERTopic model
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    n_topics (int): Number of topics to show
    
    Returns:
    dict: {"visualization", "type", "topics"} on success, {"error": message}
          (message includes the traceback) on any failure
    """
    try:
        # Collect topic ids in reported order, skipping the -1 outlier topic
        non_outlier_ids = []
        for topic_id in model.get_topic_info()['Topic'].tolist():
            if topic_id == -1:
                continue
            non_outlier_ids.append(topic_id)
        topics = non_outlier_ids[:n_topics]
        
        fig = model.visualize_barchart(topics=topics, top_n_topics=n_topics)
        
        # Make sure the output directory exists, then write the HTML file
        from pathlib import Path
        output_dir = f"{subject_main_dir}/{subject_type}"
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        filename = f"{column_name}_topic_barchart.html"
        fig.write_html(f"{output_dir}/{filename}")
        
        return {
            "visualization": f"{subject_type}/{filename}",
            "type": "topic_barchart",
            "topics": topics
        }
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic barchart: {str(e)}\n{traceback.format_exc()}"}


def _decade_labels(values):
    """
    Convert a column of dates or years to the first year of each value's decade
    
    Parameters:
    values (Series): Datetime values, numeric years, or date/year strings
    
    Returns:
    Series: Decade start years (e.g. 1990) aligned with `values`; entries that
            cannot be interpreted are NaN
    """
    if pd.api.types.is_datetime64_any_dtype(values):
        years = values.dt.year
    elif pd.api.types.is_numeric_dtype(values):
        # Plain numbers are years (1990, 2005, ...). pd.to_datetime would read
        # them as nanoseconds since the epoch and put every row in 1970.
        years = values
    else:
        try:
            years = pd.to_datetime(values).dt.year
        except (ValueError, TypeError, AttributeError):
            # Not parseable as dates (e.g. "1990s"): use the first 4-digit run
            years = values.astype(str).str.extract(r"(\d{4})", expand=False)
    years = pd.to_numeric(years, errors="coerce")
    return (years // 10) * 10


def visualize_topics_over_time(model, processed_df, decade_column, subject_main_dir, 
                              subject_type, column_name, top_n_topics=10):
    """
    Visualize topic trends over decades
    
    Draws one line per top topic showing its share (in percent, relative to
    the plotted topics) of the documents in each decade, saves the chart as a
    PNG and writes a small HTML page that embeds the chart and lists the top
    words of each topic.
    
    Parameters:
    model (BERTopic): Fitted BERTopic model
    processed_df (DataFrame): DataFrame with assigned topics (needs a
        "bertopic_topic" column)
    decade_column (str): Name of the column with decade information; may hold
        datetimes, numeric years or date/year strings
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Number of top topics to visualize
    
    Returns:
    dict: {"visualizations": [html_path, png_path], "type", "topics"} on
          success, {"error": message} on any failure
    """
    try:
        # Cheap validation first, before any plotting library is imported
        if decade_column not in processed_df.columns:
            return {"error": f"Decade column '{decade_column}' not found in DataFrame"}
        
        import matplotlib.pyplot as plt
        from pathlib import Path
        
        # Work on a copy to avoid modifying the caller's frame
        df_time = processed_df.copy()
        df_time['decade_label'] = _decade_labels(df_time[decade_column])
        
        # Rows without a usable decade cannot be placed on the time axis
        df_time = df_time.dropna(subset=['decade_label']).copy()
        if df_time.empty:
            return {"error": f"Could not convert '{decade_column}' to datetime format"}
        df_time['decade_label'] = df_time['decade_label'].astype(int)
        
        # Create directories if they don't exist
        output_dir = Path(f"{subject_main_dir}/{subject_type}")
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Top topics in the model's reported order, excluding the -1 outlier topic
        topic_info = model.get_topic_info()
        top_topics = [topic for topic in topic_info['Topic'].tolist()
                      if topic != -1][:top_n_topics]
        
        # Documents per (decade, topic)
        decade_topic_counts = (
            df_time.groupby(['decade_label', 'bertopic_topic']).size().unstack().fillna(0)
        )
        
        # Restrict to the top topics when at least one of them is present
        if set(decade_topic_counts.columns).intersection(top_topics):
            decade_topic_counts = decade_topic_counts[
                [col for col in decade_topic_counts.columns if col in top_topics]
            ]
        
        # Convert to percentages of the retained topics within each decade
        totals = decade_topic_counts.sum(axis=1)
        decade_topic_pcts = decade_topic_counts.div(totals, axis=0) * 100
        
        # Legend labels ("Topic N: w1, w2, w3") and full term lists for the HTML page
        known_topics = model.get_topics()
        topic_names = {}
        topic_terms = {}
        for topic in top_topics:
            terms = model.get_topic(topic) if topic in known_topics else None
            if terms:
                topic_terms[topic] = terms
                topic_names[topic] = f"Topic {topic}: {', '.join(word for word, _ in terms[:3])}"
            else:
                topic_names[topic] = f"Topic {topic}"
        
        plt_filename = f"{column_name}_decade_topic_trends.png"
        fig = plt.figure(figsize=(14, 8))
        try:
            for topic in decade_topic_counts.columns:
                if topic in topic_names:  # Only plot topics we have a label for
                    plt.plot(decade_topic_pcts.index, decade_topic_pcts[topic],
                             marker='o', linewidth=2, label=topic_names[topic])
            
            plt.title(f'Topic Trends by Decade for {column_name}')
            plt.xlabel('Decade')
            plt.ylabel('Topic Prevalence (%)')
            plt.xticks(rotation=45)
            plt.legend(loc='best')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(output_dir / plt_filename)
        finally:
            # Always release the figure, even when plotting or saving fails
            plt.close(fig)
        
        # Create a simple HTML page that embeds the chart and describes the topics
        html_parts = [f"""
        <html>
        <head>
            <meta charset="utf-8">
            <title>Topic Trends Over Time - {column_name}</title>
        </head>
        <body>
            <h1>Topic Trends Over Time - {column_name}</h1>
            <img src="{plt_filename}" alt="Topic Trends" style="width:100%;">
            <h2>Topic Descriptions</h2>
            <ul>
        """]
        for topic, terms in topic_terms.items():
            word_list = ", ".join(f"{word} ({weight:.3f})" for word, weight in terms[:10])
            html_parts.append(f"<li><strong>{topic_names[topic]}</strong>: {word_list}</li>\n")
        html_parts.append("""
            </ul>
        </body>
        </html>
        """)
        
        # Save HTML; explicit UTF-8 because topic words may be non-ASCII
        html_filename = f"{column_name}_topics_over_time.html"
        with open(output_dir / html_filename, "w", encoding="utf-8") as f:
            f.write("".join(html_parts))
        
        return {
            "visualizations": [
                f"{subject_type}/{html_filename}",
                f"{subject_type}/{plt_filename}"
            ],
            "type": "topics_over_time",
            "topics": top_topics
        }
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topics over time visualization: {str(e)}\n{traceback.format_exc()}"}


def run_bertopic_analysis(df, column_name, decade_column="decade", subject_main_dir="analyze_dataset", 
                        language="english", nr_topics="auto", min_topic_size=10, top_n_topics=10):
    """
    Run comprehensive BERTopic analysis on text data with all visualizations
    
    The text column is filtered to usable documents, a model built by
    setup_bertopic is fitted, and the topic map, bar chart, similarity heatmap,
    hierarchy and (when the decade column exists) decade trends are written
    under <subject_main_dir>/<column_name lower-cased>/. A failing
    visualization is reported on stdout and skipped; it does not abort the run.
    
    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    decade_column (str, optional): Name of the column with decade information for trend analysis
    subject_main_dir (str/Path): Main directory for all subject analyses
    language (str): Language for stopwords removal and preprocessing
    nr_topics (int or str): Number of topics to find ('auto' or specific number)
    min_topic_size (int): Minimum size of topics
    top_n_topics (int): Number of top topics to visualize
    
    Returns:
    dict: Analysis results with keys "column_analyzed", "topic_count"
          (len(model.get_topics()), which counts the outlier topic when
          present), "visualizations", "topic_info_path" (when subject_main_dir
          is set), "document_topics", "document_topic_probs", "processed_df"
          and "model"; or {"error": message} on failure
    """
    # Print status
    print(f"Running BERTopic analysis on '{column_name}'...")
    subject_type = column_name.lower()
    
    try:
        # --- Validate the input before any heavy import or model construction ---
        # Keep only genuine strings (this also drops NaN); positional mask so
        # duplicate index labels cannot interfere.
        is_text = pd.Series(
            [isinstance(value, str) for value in df[column_name]], dtype=bool
        ).to_numpy()
        processed_df = df.loc[is_text]
        
        # Drop placeholder abstracts BEFORE the emptiness check so a column of
        # placeholders yields a clear error instead of fitting on zero documents
        if column_name == "Abstract":
            processed_df = processed_df[processed_df["Abstract"] != "No abstract available"]
        
        if processed_df.empty:
            return {"error": f"No valid text data found in column {column_name}"}
        
        # Independent copy: the topic columns are added to it below
        processed_df = processed_df.copy()
        documents = processed_df[column_name].tolist()
        
        # Fail fast with an install hint when BERTopic itself is unavailable
        import bertopic  # noqa: F401
        from pathlib import Path
        
        # Create subject directory if it doesn't exist
        if subject_main_dir:
            subject_dir = Path(subject_main_dir) / subject_type
            subject_dir.mkdir(parents=True, exist_ok=True)
        
        # Create the model with the shared module-level factory
        model = setup_bertopic(language, nr_topics, min_topic_size)
        if model is None:
            return {"error": "Failed to initialize BERTopic model"}
        
        # Fit the model on our text data
        topics, probabilities = model.fit_transform(documents)
        
        # Add topics to the dataframe
        processed_df["bertopic_topic"] = topics
        
        # Reduce whatever probability structure was returned to one value per document
        ndim = getattr(probabilities, 'ndim', None)
        if probabilities is None:
            processed_df["bertopic_probability"] = float("nan")
        elif ndim == 2:
            # Document x topic matrix: keep the strongest topic probability per row
            processed_df["bertopic_probability"] = probabilities.max(axis=1)
        elif ndim == 1:
            processed_df["bertopic_probability"] = probabilities
        else:
            # List-like of arrays, sequences or plain numbers
            processed_df["bertopic_probability"] = [
                prob.max() if hasattr(prob, 'max')
                else max(prob) if hasattr(prob, '__len__') and len(prob) > 0
                else prob if isinstance(prob, (int, float))
                else 0
                for prob in probabilities
            ]
        
        # Count real topics explicitly; the -1 outlier row is not always present
        topic_info = model.get_topic_info()
        real_topic_count = int((topic_info["Topic"] != -1).sum())
        print(f"Identified {real_topic_count} topics (excluding outlier topic)")
        
        # Generate visualizations
        results = {
            "column_analyzed": column_name,
            "topic_count": len(model.get_topics()),
            "visualizations": []
        }
        
        # 1. Topic Word Clouds
        print("Generating topic word clouds...")
        wc_result = visualize_topic_wordclouds(
            model=model,
            subject_main_dir=subject_main_dir,
            subject_type=subject_type,
            column_name=column_name,
            top_n_topics=top_n_topics
        )
        if "error" not in wc_result:
            results["visualizations"].append(wc_result["visualization"])
        else:
            print(f"Error generating word clouds: {wc_result['error']}")
        
        # 2. Topic Bar Charts
        print("Generating topic bar charts...")
        bar_result = visualize_topic_barchart(
            model=model,
            subject_main_dir=subject_main_dir,
            subject_type=subject_type,
            column_name=column_name,
            n_topics=top_n_topics
        )
        if "error" not in bar_result:
            results["visualizations"].append(bar_result["visualization"])
        else:
            print(f"Error generating bar charts: {bar_result['error']}")
        
        # 3. Topic Similarity
        print("Generating topic similarity heatmap...")
        try:
            fig = model.visualize_heatmap()
            filename = f"{column_name}_topic_similarity.html"
            fig.write_html(f"{subject_main_dir}/{subject_type}/{filename}")
            results["visualizations"].append(f"{subject_type}/{filename}")
        except Exception as e:
            # Best effort: one failed chart must not abort the analysis
            print(f"Error generating similarity heatmap: {str(e)}")
        
        # 4. Topic Hierarchy
        print("Generating topic hierarchy visualization...")
        try:
            fig = model.visualize_hierarchy()
            filename = f"{column_name}_topic_hierarchy.html"
            fig.write_html(f"{subject_main_dir}/{subject_type}/{filename}")
            results["visualizations"].append(f"{subject_type}/{filename}")
        except Exception as e:
            print(f"Error generating hierarchy visualization: {str(e)}")
        
        # 5. Topics over Time (if decade column exists)
        if decade_column and decade_column in df.columns:
            print(f"Generating topic trends over decades using '{decade_column}'...")
            time_result = visualize_topics_over_time(
                model=model,
                processed_df=processed_df,
                decade_column=decade_column,
                subject_main_dir=subject_main_dir,
                subject_type=subject_type,
                column_name=column_name,
                top_n_topics=top_n_topics
            )
            if "error" not in time_result:
                results["visualizations"].extend(time_result["visualizations"])
            else:
                print(f"Error generating topics over time: {time_result['error']}")
        
        # Save topic info as CSV
        if subject_main_dir:
            topic_info_path = f"{subject_main_dir}/{subject_type}/{column_name}_topic_info.csv"
            topic_info.to_csv(topic_info_path, index=False)
            results["topic_info_path"] = topic_info_path
            print(f"Topic information saved to {topic_info_path}")
        
        # Add topic assignments to results
        results["document_topics"] = topics
        results["document_topic_probs"] = probabilities
        results["processed_df"] = processed_df
        results["model"] = model
        
        print("BERTopic analysis completed successfully!")
        return results
        
    except ImportError as e:
        return {"error": f"Required package missing: {str(e)}. Please install with: pip install bertopic nltk scikit-learn plotly matplotlib pandas"}
    except Exception as e:
        import traceback
        return {"error": f"BERTopic analysis failed: {str(e)}\n{traceback.format_exc()}"}

### BERTopic utils

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import plotly.graph_objects as go
from typing import List, Dict, Tuple, Union, Optional, Any

def save_plot(plt, filename, subject_main_dir, subject_type):
    """
    Write the current figure to <subject_main_dir>/<subject_type>/<filename> and close it
    
    Parameters:
    plt: matplotlib.pyplot instance (any object exposing savefig and close)
    filename (str): Name of the file to save
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis (e.g., 'domains', 'fields', 'topics')
    """
    # Ensure the subject-specific output directory exists
    target_dir = Path(subject_main_dir, subject_type)
    target_dir.mkdir(parents=True, exist_ok=True)
    
    # Tight bounding box at 300 dpi, then release the figure
    plt.savefig(target_dir.joinpath(filename), dpi=300, bbox_inches='tight')
    plt.close()

def save_plotly_figure(fig, filename, subject_main_dir, subject_type):
    """
    Save a Plotly figure as an interactive HTML file and as a static PNG
    
    Parameters:
    fig: Plotly figure object (must provide write_html and write_image)
    filename (str): Name of the file to save (without extension)
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    
    Returns:
    tuple: Paths (as strings) to the HTML and PNG files
    """
    # Ensure the subject-specific output directory exists
    target_dir = Path(subject_main_dir, subject_type)
    target_dir.mkdir(parents=True, exist_ok=True)
    
    # Both outputs share the base name and differ only in extension
    html_path, png_path = (
        str(target_dir / f"{filename}{extension}") for extension in (".html", ".png")
    )
    
    fig.write_html(html_path)   # interactive version
    fig.write_image(png_path)   # static version for viewing/embedding
    
    return html_path, png_path

def get_topic_labels(model, topics, include_top_n_words=3):
    """
    Build a human-readable label for each topic from its leading words
    
    Parameters:
    model: BERTopic model
    topics (list): List of topic IDs
    include_top_n_words (int): Number of top words to include in label
    
    Returns:
    dict: Dictionary mapping topic IDs to descriptive labels
          ("Topic <id>: w1, w2, ..." or just "Topic <id>" when the model
          returns no words for that topic)
    """
    def describe(topic_id):
        # Falls back to the bare id when the model has nothing for this topic
        terms = model.get_topic(topic_id)
        if not terms:
            return f"Topic {topic_id}"
        leading = ", ".join(term for term, _ in terms[:include_top_n_words])
        return f"Topic {topic_id}: {leading}"
    
    return {topic_id: describe(topic_id) for topic_id in topics}

def get_top_topics(model, n=10, exclude_outliers=True):
    """
    Return the IDs of the N topics with the highest document count
    
    Parameters:
    model: BERTopic model
    n (int): Number of topics to return
    exclude_outliers (bool): Whether to exclude the outlier topic (-1)
    
    Returns:
    list: List of top topic IDs, largest count first
    """
    info = model.get_topic_info()
    
    # Optionally drop the -1 outlier row before ranking
    candidates = info[info['Topic'] != -1] if exclude_outliers else info
    
    largest = candidates.nlargest(n, 'Count')
    return largest['Topic'].tolist()

### Basic BERTopic visualizations

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from pathlib import Path

def create_topic_wordcloud(model, topic_id, topic_label=None, colormap='viridis'):
    """
    Create a wordcloud visualization for a specific topic with topic number in the title

    Each word is coloured from ``colormap`` according to its weight relative
    to the heaviest word of the topic, so the whole colour range is used
    whatever the absolute scale of the weights is.

    Parameters:
    model: BERTopic model
    topic_id (int): ID of the topic to visualize
    topic_label (str, optional): Custom label for the topic
    colormap (str): Matplotlib colormap to use for the wordcloud

    Returns:
    matplotlib.figure.Figure: The figure containing the wordcloud, or None
        when the model returns no words for ``topic_id``
    """
    # Get the words and their weights for the topic
    words = model.get_topic(topic_id)
    if not words:
        return None

    # Create a dictionary of word:weight for the wordcloud
    word_dict = {word: weight for word, weight in words}

    # Sample the colormap once instead of once per word.
    # plt.get_cmap is used because plt.cm.get_cmap no longer exists in
    # recent matplotlib releases.
    palette = plt.get_cmap(colormap)(np.linspace(0, 1, 100))
    max_weight = max(word_dict.values())

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        weight = word_dict.get(word)
        if weight is None or max_weight <= 0:
            # Unknown word or no usable scale: fall back to the middle colour
            relative = 0.5
        else:
            # Clamp so that the palette index always stays within 0-99
            relative = min(max(weight / max_weight, 0.0), 1.0)
        red, green, blue = palette[int(relative * 99)][:3]
        return (int(255 * red), int(255 * green), int(255 * blue))

    # Create the wordcloud
    wc = WordCloud(
        background_color='white',
        max_words=50,
        width=800,
        height=400,
        prefer_horizontal=1.0,
        color_func=color_func,
        max_font_size=100,
        random_state=42
    ).generate_from_frequencies(word_dict)

    # Create figure and add the wordcloud
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wc, interpolation='bilinear')

    # Title is the custom label (or "Topic <id>") followed by the top 5 words
    base_title = topic_label if topic_label else f"Topic {topic_id}"
    top_words = ", ".join([word for word, _ in words[:5]])

    ax.set_title(f"{base_title}: {top_words}", fontsize=16)
    ax.axis('off')

    return fig

def create_topic_wordclouds_grid(model, topics=None, rows=3, cols=3, title="Topic Wordclouds", subject_main_dir=None, subject_type=None):
    """
    Draw one wordcloud per topic on a rows x cols grid of subplots

    Parameters:
    model: BERTopic model
    topics (list, optional): Topics to draw; when omitted, the rows*cols
        non-outlier topics with the highest 'Count' are used
    rows (int): Number of rows in the grid
    cols (int): Number of columns in the grid
    title (str): Figure title; also used to derive the PNG file name
    subject_main_dir (str/Path, optional): Directory to save the plot
    subject_type (str, optional): Subdirectory type for saving

    Returns:
    matplotlib.figure.Figure: The figure containing the wordcloud grid
    """
    capacity = rows * cols

    if topics is None:
        # Default selection: largest topics, outlier topic (-1) excluded
        info = model.get_topic_info()
        without_outliers = info[info['Topic'] != -1]
        topics = without_outliers.nlargest(capacity, 'Count')['Topic'].tolist()
    else:
        # Never draw more topics than there are cells
        topics = topics[:capacity]

    fig, axes = plt.subplots(rows, cols, figsize=(cols*5, rows*4))
    fig.suptitle(title, fontsize=20, y=0.98)

    # plt.subplots returns a bare Axes for a 1x1 grid and an array otherwise
    panels = axes.flatten() if (rows > 1 or cols > 1) else [axes]

    # Same rendering options for every cell
    cloud_options = dict(
        background_color='white',
        max_words=50,
        width=400,
        height=400,
        prefer_horizontal=1.0,
        colormap='viridis',
        max_font_size=100,
        random_state=42,
    )

    for panel, topic_id in zip(panels, topics):
        term_weights = model.get_topic(topic_id)

        if term_weights:
            frequencies = dict(term_weights)
            cloud = WordCloud(**cloud_options).generate_from_frequencies(frequencies)
            panel.imshow(cloud, interpolation='bilinear')

            # Cell title: topic number plus its three leading words
            headline = ", ".join(term for term, _ in term_weights[:3])
            panel.set_title(f"Topic {topic_id}: {headline}", fontsize=12)
        else:
            # Placeholder text for a topic without any words
            panel.text(0.5, 0.5, f"No words for Topic {topic_id}",
                       horizontalalignment='center', verticalalignment='center')

        panel.axis('off')

    # Blank out the cells that did not receive a topic
    for spare in panels[len(topics):]:
        spare.axis('off')

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Leave room for the main title

    # Saving is optional and needs both path components
    if subject_main_dir and subject_type:
        file_stem = title.replace(' ', '_').lower()
        save_path = Path(subject_main_dir) / subject_type / f"{file_stem}.png"
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, bbox_inches='tight', dpi=300)

    return fig

def visualize_topic_wordclouds(model, subject_main_dir, subject_type, 
                             column_name, top_n_topics=10):
    """
    Visualize topics as wordclouds with topic numbers as labels

    Writes "<column_name>_topic_wordclouds_grid.png" into
    <subject_main_dir>/<subject_type> and, on a best-effort basis,
    "<column_name>_topic_wordclouds.html" produced by the model's
    ``visualize_topics`` method.

    Parameters:
    model: BERTopic model
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Maximum number of topics to visualize

    Returns:
    dict: Visualization details ("visualization", "type", "topics"), or a
        dict with a single "error" key when anything fails
    """
    try:
        # Get topic info dataframe
        topic_info = model.get_topic_info()

        # Keep the first top_n_topics entries, skipping the -1 outlier topic
        top_topics = [topic for topic in topic_info['Topic'].tolist()
                      if topic != -1][:top_n_topics]

        # Size the grid from the topics actually found so that a model with
        # fewer topics than top_n_topics does not produce blank rows
        cols = 3
        rows = max(1, -(-len(top_topics) // cols))  # Ceiling division
        grid_fig = create_topic_wordclouds_grid(
            model=model,
            topics=top_topics,
            rows=rows,
            cols=cols,
            title=f"{column_name} Topic Wordclouds",
            subject_main_dir=subject_main_dir,
            subject_type=subject_type
        )

        # Save the grid through the figure object itself, so the output does
        # not depend on which figure pyplot considers current
        grid_filename = f"{column_name}_topic_wordclouds_grid.png"
        save_path = Path(subject_main_dir) / subject_type / grid_filename
        save_path.parent.mkdir(parents=True, exist_ok=True)
        grid_fig.savefig(save_path, bbox_inches='tight', dpi=300)
        plt.close(grid_fig)

        # Also create an interactive view using BERTopic's built-in method.
        # A failure here only produces a warning; the PNG result still stands.
        try:
            interactive_fig = model.visualize_topics(topics=top_topics)
            html_filename = f"{column_name}_topic_wordclouds.html"
            html_path = Path(subject_main_dir) / subject_type / html_filename
            interactive_fig.write_html(str(html_path))
        except Exception as e:
            print(f"Warning: Could not create interactive wordcloud: {str(e)}")

        return {
            "visualization": f"{subject_type}/{grid_filename}",
            "type": "topic_wordclouds",
            "topics": top_topics
        }
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic wordclouds: {str(e)}\n{traceback.format_exc()}"}

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
from scipy.cluster.hierarchy import linkage, dendrogram
import networkx as nx
import plotly.express as px
from typing import List, Dict, Tuple, Union, Optional, Any

def create_topic_hierarchy(model, topics=None, title="Topic Hierarchy"):
    """
    Create a hierarchical visualization of topics with topic numbers in labels

    The model's own ``visualize_hierarchy`` is tried first. If it raises, a
    dendrogram is drawn with scipy/matplotlib instead.

    Parameters:
    model: BERTopic model 
    topics (list, optional): List of topics to include
    title (str): Title for the visualization

    Returns:
    Whatever ``model.visualize_hierarchy`` returns on the built-in path; a
    matplotlib.figure.Figure on the fallback path; None when the fallback
    has fewer than two topics to cluster
    """
    try:
        # Use built-in BERTopic visualization if available
        return model.visualize_hierarchy(topics=topics, title=title)
    except Exception as e:
        print(f"Warning: Built-in hierarchy visualization failed: {str(e)}")
        print("Falling back to custom implementation...")

    # Local import: only the fallback path needs it
    from scipy.spatial.distance import squareform

    # If topics not specified, use all non-outlier topics
    if topics is None:
        topic_info = model.get_topic_info()
        topics = [t for t in topic_info['Topic'].tolist() if t != -1]

    # Hierarchical clustering needs at least two items
    if len(topics) < 2:
        print("Warning: At least two topics are required to build a hierarchy")
        return None

    # Get topic labels with top words
    topic_labels = {}
    for topic in topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"Topic {topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"

    # NOTE(review): `_calculate_topic_similarities` is a private attribute of
    # the model; this assumes it returns a square matrix of pairwise
    # similarities (higher = more alike) - confirm it exists on the model in use.
    similarities = np.asarray(model._calculate_topic_similarities(topics, topics), dtype=float)

    # Turn similarities into a clean distance matrix: symmetric,
    # non-negative and with a zero diagonal
    distances = 1 - similarities
    distances = np.clip((distances + distances.T) / 2, 0, None)
    np.fill_diagonal(distances, 0)

    # linkage() reads a 2-D array as raw observation vectors, so the square
    # distance matrix must be handed over in condensed form
    Z = linkage(squareform(distances, checks=False), 'ward')

    # Create dendrogram
    fig, ax = plt.subplots(figsize=(15, 10))
    dendrogram(
        Z,
        labels=[topic_labels[t] for t in topics],
        orientation='right',
        leaf_font_size=10,
        ax=ax
    )
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Distance', fontsize=12)

    return fig

def create_topic_tree(model, topics=None, title="Topic Hierarchical Tree"):
    """
    Ask the model for its hierarchy figure, returning None if that fails

    Parameters:
    model: BERTopic model
    topics (list, optional): List of topics to include
    title (str): Title for the visualization

    Returns:
    The object returned by ``model.visualize_hierarchy``, or None when that
    call raises (a warning is printed in that case)
    """
    try:
        tree_fig = model.visualize_hierarchy(topics=topics, title=title)
    except Exception as err:
        print(f"Warning: Could not create hierarchical tree: {str(err)}")
        return None

    return tree_fig

def create_topic_network(model, topics=None, threshold=0.5, title="Topic Similarity Network"):
    """
    Create a network visualization of topic relationships with topic numbers in labels

    Every topic becomes a node sized by its document count. Two topics are
    linked when their similarity is strictly greater than ``threshold``.

    Parameters:
    model: BERTopic model
    topics (list, optional): List of topics to include
    threshold (float): Similarity threshold for displaying connections
    title (str): Title for the visualization

    Returns:
    plotly.graph_objects.Figure: Interactive network figure
    """
    # Fetch the topic table once; it provides both the default topic list
    # and the per-topic document counts
    topic_info = model.get_topic_info()

    # If topics not specified, use all non-outlier topics
    if topics is None:
        topics = [t for t in topic_info['Topic'].tolist() if t != -1]

    topic_counts = dict(zip(topic_info['Topic'].tolist(), topic_info['Count'].tolist()))

    # NOTE(review): `_calculate_topic_similarities` is a private attribute of
    # the model; this assumes it returns a square matrix of pairwise
    # similarities (higher = more alike) - confirm it exists on the model in use.
    # The values are used as-is: subtracting them from 1 would turn them
    # into distances and link the least similar topics instead.
    similarity_matrix = np.asarray(model._calculate_topic_similarities(topics, topics))

    # Create graph
    G = nx.Graph()

    # Add nodes
    for topic in topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            # Size based on topic prevalence; 10 when the count is unknown
            size = topic_counts.get(topic, 10)
            G.add_node(topic, label=f"Topic {topic}", top_words=top_words, size=size)
        else:
            G.add_node(topic, label=f"Topic {topic}", top_words="", size=10)

    # Add edges for topics that are similar above the threshold
    for i, topic1 in enumerate(topics):
        for j in range(i + 1, len(topics)):  # Only process each pair once
            sim = float(similarity_matrix[i, j])
            if sim > threshold:
                G.add_edge(topic1, topics[j], weight=sim)

    # Get positions using a spring layout (fixed seed for reproducible plots)
    pos = nx.spring_layout(G, k=0.5, seed=42)

    # Get node sizes based on topic prevalence (normalized to the largest topic)
    sizes = [G.nodes[node]['size'] for node in G.nodes()]
    max_size = max(sizes, default=0) or 1  # Avoid dividing by zero
    sizes = [50 + (s / max_size) * 500 for s in sizes]

    # One trace per edge so that each line can have its own width
    edge_traces = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        weight = G.edges[edge]['weight']

        edge_traces.append(go.Scatter(
            x=[x0, x1, None], y=[y0, y1, None],
            line=dict(width=weight*5, color='rgba(50,50,50,0.5)'),
            hoverinfo='none',
            mode='lines'))

    # Node coordinates, in graph order
    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]

    # Nodes are coloured by their topic ID on a continuous scale
    node_colors = list(G.nodes())

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=[f"T{node}" for node in G.nodes()],
        textposition="top center",
        textfont=dict(size=10, color='black'),
        marker=dict(
            showscale=True,
            colorscale='Viridis',
            color=node_colors,
            size=sizes,
            line=dict(color='black', width=2)
        ),
        hoverinfo='text',
        hovertext=[f"Topic {node}<br>{G.nodes[node]['top_words']}<br>Size: {G.nodes[node]['size']}" 
                  for node in G.nodes()]
    )

    # Create figure
    fig = go.Figure(data=edge_traces + [node_trace],
                   layout=go.Layout(
                       title=title,
                       showlegend=False,
                       hovermode='closest',
                       margin=dict(b=20, l=5, r=5, t=40),
                       xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                       yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                       height=800,
                       width=800
                   ))

    return fig

def create_topic_heatmap(model, topics=None, title="Topic Similarity Heatmap"):
    """
    Create a custom heatmap for topic similarity with topic numbers in labels

    The model's own ``visualize_heatmap`` is tried first. If it raises, a
    plotly heatmap is built from the pairwise topic similarities instead.

    Parameters:
    model: BERTopic model
    topics (list, optional): List of topics to include in the heatmap
    title (str): Title for the heatmap

    Returns:
    plotly.graph_objects.Figure: Interactive heatmap figure
    """
    try:
        # Use built-in BERTopic visualization if available
        fig = model.visualize_heatmap(topics=topics)
        fig.update_layout(title=title)
        return fig
    except Exception as e:
        print(f"Warning: Built-in heatmap visualization failed: {str(e)}")
        print("Falling back to custom implementation...")

    # If topics not specified, use all non-outlier topics
    if topics is None:
        topic_info = model.get_topic_info()
        topics = [t for t in topic_info['Topic'].tolist() if t != -1]

    # NOTE(review): `_calculate_topic_similarities` is a private attribute of
    # the model; this assumes it returns a square matrix of pairwise
    # similarities (higher = more alike) - confirm it exists on the model in use.
    # The values are plotted as-is: subtracting them from 1 would show
    # distances under a "similarity" title.
    similarity_matrix = model._calculate_topic_similarities(topics, topics)

    # Create topic labels with top words
    topic_labels = {}
    for topic in topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"Topic {topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"

    # Convert to list for plotly
    labels = [topic_labels[t] for t in topics]

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=labels,
        y=labels,
        colorscale='Viridis',
        showscale=True,
        zmin=0, zmax=1
    ))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Topics",
        yaxis_title="Topics",
        height=800,
        width=800,
        xaxis={'tickangle': 45},
        font=dict(size=10)
    )

    return fig

def visualize_hierarchical_documents(model, docs_embeddings=None, docs=None, topics=None,
                                    title="Hierarchical Document Clustering"):
    """
    Visualize documents hierarchically with topic coloring

    Parameters:
    model: BERTopic model
    docs_embeddings (array, optional): Document embeddings; forwarded to the
        model when given
    docs (list, optional): List of documents
    topics (DataFrame, optional): Topic hierarchy table as produced by
        ``model.hierarchical_topics``. Any other value (including None) is
        ignored and the hierarchy is computed from ``docs`` instead
    title (str): Title for the visualization

    Returns:
    plotly.graph_objects.Figure: Interactive hierarchy visualization, or
        None when it could not be created (a warning is printed)
    """
    try:
        # The built-in plot needs a hierarchy table, not a list of topic IDs;
        # build it here unless the caller already supplied one
        if isinstance(topics, pd.DataFrame):
            hierarchy = topics
        else:
            hierarchy = model.hierarchical_topics(docs)

        # Forward the embeddings only when the caller actually provided them
        extra_args = {}
        if docs_embeddings is not None:
            extra_args['embeddings'] = docs_embeddings

        fig = model.visualize_hierarchical_documents(docs=docs, hierarchical_topics=hierarchy,
                                                   title=title, **extra_args)
        return fig
    except Exception as e:
        print(f"Warning: Could not create hierarchical document visualization: {str(e)}")
        return None
    
def visualize_document_datamap(model, docs=None, topics=None, embeddings=None, 
                              title="Document Datamap"):
    """
    Ask the model for its document map, returning None if that fails

    All arguments are handed unchanged to ``model.visualize_documents``.

    Parameters:
    model: BERTopic model
    docs (list, optional): List of documents
    topics (list, optional): Passed as the ``topics`` argument of the model call
    embeddings (array, optional): Document embeddings
    title (str): Title for the visualization

    Returns:
    The object returned by ``model.visualize_documents``, or None when that
    call raises (a warning is printed in that case)
    """
    try:
        datamap = model.visualize_documents(
            docs=docs,
            topics=topics,
            embeddings=embeddings,
            title=title,
        )
    except Exception as err:
        print(f"Warning: Could not create document datamap: {str(err)}")
        return None

    return datamap

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple, Union, Optional, Any

def create_topic_barchart(model, topics=None, n_terms=10, title="Topic Term Importance"):
    """
    Plot the highest-scoring terms of each topic as bars

    The model's own ``visualize_barchart`` is tried first. If anything in
    that attempt raises, a grouped horizontal bar chart is assembled by hand.

    Parameters:
    model: BERTopic model
    topics (list, optional): Topics to plot; defaults to five non-outlier topics
    n_terms (int): Number of terms to show per topic
    title (str): Title for the visualization

    Returns:
    plotly.graph_objects.Figure: Interactive bar chart
    """
    try:
        if topics is None:
            # Default: first five entries of the topic table, outliers skipped
            info = model.get_topic_info()
            non_outliers = [tid for tid in info['Topic'].tolist() if tid != -1]
            topics = non_outliers[:5]  # Limit to 5 topics by default

        builtin_fig = model.visualize_barchart(topics=topics, top_n_topics=len(topics), n_words=n_terms)
        builtin_fig.update_layout(title=title)
        return builtin_fig
    except Exception as err:
        print(f"Warning: Built-in barchart visualization failed: {str(err)}")
        print("Falling back to custom implementation...")

        # Only reached with topics still unset when the lookup above failed
        if topics is None:
            info = model.get_topic_info()
            topics = info[info['Topic'] != -1].nlargest(5, 'Count')['Topic'].tolist()

        chart = go.Figure()

        for tid in topics:
            term_weights = model.get_topic(tid)
            if not term_weights:
                continue

            # Reversed so that the strongest term ends up at the top of the axis
            bottom_up = list(reversed(term_weights[:n_terms]))

            chart.add_trace(go.Bar(
                y=[term for term, _ in bottom_up],
                x=[weight for _, weight in bottom_up],
                name=f"Topic {tid}",
                orientation='h'
            ))

        layout_options = dict(
            title=title,
            xaxis_title="Term Score",
            yaxis_title="Terms",
            height=600,
            width=1000,
            font=dict(size=12),
            barmode='group',
        )
        chart.update_layout(**layout_options)

        return chart

def visualize_topic_barchart(model, subject_main_dir, subject_type, column_name, n_topics=10):
    """
    Build the per-topic term bar chart and write it out as HTML and PNG

    Files are written to <subject_main_dir>/<subject_type> as
    "<column_name>_topic_barchart.html" and "<column_name>_topic_barchart.png".

    Parameters:
    model: BERTopic model
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    n_topics (int): Number of topics to show

    Returns:
    dict: Visualization details ("visualization", "static_image", "type",
        "topics"), or a dict with a single "error" key when anything fails
    """
    try:
        info = model.get_topic_info()

        # Leave out the outlier topic (-1), then cap the list at n_topics
        non_outliers = [tid for tid in info['Topic'].tolist() if tid != -1]
        topics = non_outliers[:n_topics]

        chart = create_topic_barchart(
            model=model,
            topics=topics,
            n_terms=15,  # Show more terms than the helper's default
            title=f"{column_name} Top Terms by Topic"
        )

        # Make sure the output directory exists before writing into it
        from pathlib import Path
        output_dir = f"{subject_main_dir}/{subject_type}"
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Interactive version
        filename = f"{column_name}_topic_barchart.html"
        chart.write_html(f"{output_dir}/{filename}")

        # Static version for embedding
        png_filename = f"{column_name}_topic_barchart.png"
        chart.write_image(f"{output_dir}/{png_filename}")

        details = {
            "visualization": f"{subject_type}/{filename}",
            "static_image": f"{subject_type}/{png_filename}",
            "type": "topic_barchart",
            "topics": topics
        }
        return details
    except Exception as err:
        import traceback
        return {"error": f"Failed to create topic barchart: {str(err)}\n{traceback.format_exc()}"}

def create_topic_term_scores(model, topics=None, n_terms=10, title="Topic Term Scores"):
    """
    Plot term scores for each topic as grouped horizontal bars

    Parameters:
    model: BERTopic model
    topics (list, optional): Topics to plot; defaults to the five non-outlier
        topics with the highest 'Count'
    n_terms (int): Number of terms to show per topic
    title (str): Title for the visualization

    Returns:
    plotly.graph_objects.Figure: Interactive bar chart figure
    """
    if topics is None:
        # Default selection: five largest topics, outlier topic (-1) excluded
        info = model.get_topic_info()
        topics = info[info['Topic'] != -1].nlargest(5, 'Count')['Topic'].tolist()

    chart = go.Figure()

    # One bar trace per topic; topics without words are skipped
    for topic_id in topics:
        term_weights = model.get_topic(topic_id)
        if not term_weights:
            continue

        # Reversed so that the strongest term ends up at the top of the axis
        bottom_up = list(reversed(term_weights[:n_terms]))

        chart.add_trace(go.Bar(
            y=[term for term, _ in bottom_up],
            x=[weight for _, weight in bottom_up],
            name=f"Topic {topic_id}",
            orientation='h'
        ))

    layout_options = dict(
        title=title,
        xaxis_title="Term Score",
        yaxis_title="Terms",
        height=600,
        width=1000,
        font=dict(size=12),
        barmode='group',
    )
    chart.update_layout(**layout_options)

    return chart

def visualize_term_rank(model, subject_main_dir, subject_type, column_name, n_topics=5, n_terms=10):
    """
    Plot how term scores fall off with rank for each topic and save the chart

    Files are written to <subject_main_dir>/<subject_type> as
    "<column_name>_term_rank.html" and "<column_name>_term_rank.png".

    Parameters:
    model: BERTopic model
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    n_topics (int): Number of topics to visualize
    n_terms (int): Number of terms to show per topic

    Returns:
    dict: Visualization details ("visualization", "static_image", "type",
        "topics"), or a dict with a single "error" key when anything fails
    """
    try:
        info = model.get_topic_info()

        # Leave out the outlier topic (-1), then cap the list at n_topics
        non_outliers = [tid for tid in info['Topic'].tolist() if tid != -1]
        topics = non_outliers[:n_topics]

        decline_fig = go.Figure()

        # One line per topic; topics without words are skipped
        for tid in topics:
            term_weights = model.get_topic(tid)
            if not term_weights:
                continue

            weights = [weight for _, weight in term_weights[:n_terms]]
            ranks = list(range(1, len(weights) + 1))  # Ranks start at 1

            decline_fig.add_trace(go.Scatter(
                x=ranks,
                y=weights,
                mode='lines+markers',
                name=f"Topic {tid}"
            ))

        layout_options = dict(
            title=f"{column_name} Term Score Decline by Topic",
            xaxis_title="Term Rank",
            yaxis_title="Term Score",
            height=600,
            width=900,
            font=dict(size=12),
            legend_title="Topics",
        )
        decline_fig.update_layout(**layout_options)

        # Make sure the output directory exists before writing into it
        output_dir = f"{subject_main_dir}/{subject_type}"
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Interactive version
        filename = f"{column_name}_term_rank.html"
        decline_fig.write_html(f"{output_dir}/{filename}")

        # Static version for embedding
        png_filename = f"{column_name}_term_rank.png"
        decline_fig.write_image(f"{output_dir}/{png_filename}")

        details = {
            "visualization": f"{subject_type}/{filename}",
            "static_image": f"{subject_type}/{png_filename}",
            "type": "term_rank",
            "topics": topics
        }
        return details
    except Exception as err:
        import traceback
        return {"error": f"Failed to create term rank visualization: {str(err)}\n{traceback.format_exc()}"}

### Advanced analysis

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple, Union, Optional, Any

def _coerce_to_datetime(values, column_name):
    """Return ``values`` as a datetime Series, reading plain numbers as calendar years."""
    if pd.api.types.is_datetime64_any_dtype(values):
        return values

    # pd.to_datetime would read bare numbers as nanoseconds since 1970, so
    # numeric columns that look like years (e.g. 1995 or a decade such as
    # 1990) are converted explicitly. Unparseable entries become NaT.
    is_number = (pd.api.types.is_numeric_dtype(values)
                 and not pd.api.types.is_bool_dtype(values))
    if is_number and values.dropna().between(1, 9999).all():
        years = values.round().astype('Int64').astype(str)
        return pd.to_datetime(years, format='%Y', errors='coerce')

    try:
        return pd.to_datetime(values)
    except (ValueError, TypeError, OverflowError):
        # If can't convert to datetime, try treating as year values
        try:
            return pd.to_datetime(values.astype(str) + '-01-01')
        except (ValueError, TypeError, OverflowError) as err:
            raise ValueError(f"Could not convert {column_name} to datetime format") from err


def create_topic_evolution(model, df, topic_column, time_column, interval='year',
                         topics=None, top_n=10, title="Topic Evolution Over Time"):
    """
    Visualize how topics evolve over time with topic numbers in labels

    For every interval, the plotted value of a topic is its share (in
    percent) of the documents that belong to the selected topics in that
    interval. The input DataFrame is not modified.

    Parameters:
    model: BERTopic model
    df (DataFrame): DataFrame with topics assigned
    topic_column (str): Column name containing the assigned topics
    time_column (str): Column name containing the time information; may hold
        datetimes, date strings or numeric years
    interval (str): Time interval for grouping ('year', 'month', 'day', 'decade')
    topics (list, optional): List of topics to include
    top_n (int): Number of top topics to show
    title (str): Title for the visualization

    Returns:
    plotly.graph_objects.Figure: Interactive time series figure

    Raises:
    ValueError: If the time column cannot be converted to datetimes or
        ``interval`` is not one of the supported values
    """
    # Work on a copy from the very start so the caller's DataFrame stays intact
    data = df.copy()
    data[time_column] = _coerce_to_datetime(data[time_column], time_column)

    # Add interval column for grouping
    if interval == 'year':
        data['interval'] = data[time_column].dt.year
    elif interval == 'month':
        data['interval'] = data[time_column].dt.to_period('M').astype(str)
    elif interval == 'day':
        data['interval'] = data[time_column].dt.date
    elif interval == 'decade':
        data['interval'] = (data[time_column].dt.year // 10) * 10
    else:
        raise ValueError(f"Unsupported interval: {interval}")

    # If topics not specified, get top N non-outlier topics by count
    if topics is None:
        topic_info = model.get_topic_info()
        # Filter out outlier topic (-1)
        filtered_topics = topic_info[topic_info['Topic'] != -1]
        # Get top N topics by count
        topics = filtered_topics.nlargest(top_n, 'Count')['Topic'].tolist()

    # Create topic labels with top words
    topic_labels = {}
    for topic in topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"Topic {topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"

    # Group by interval and topic, count documents
    topic_counts = data.groupby(['interval', topic_column]).size().unstack(fill_value=0)

    # Keep only the selected topics, if they exist in the data
    available_topics = [t for t in topics if t in topic_counts.columns]
    if available_topics:
        topic_counts = topic_counts[available_topics]

    # Share of each topic within an interval, relative to the kept topics
    topic_props = topic_counts.div(topic_counts.sum(axis=1), axis=0) * 100

    # Create Plotly figure
    fig = go.Figure()

    # Add a line for each topic that has a label (i.e. was selected)
    for topic in topic_props.columns:
        if topic in topic_labels:
            fig.add_trace(go.Scatter(
                x=topic_props.index,
                y=topic_props[topic],
                mode='lines+markers',
                name=topic_labels[topic],
                line=dict(width=2),
                marker=dict(size=8)
            ))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title=f"Time ({interval.capitalize()})",
        yaxis_title="Topic Proportion (%)",
        legend_title="Topics",
        hovermode="x unified",
        height=600,
        width=1000,
        font=dict(size=12)
    )

    return fig

def visualize_topics_over_time(model, processed_df, decade_column, subject_main_dir, 
                             subject_type, column_name, top_n_topics=10, interval='decade'):
    """
    Visualize topic trends over time with topic numbers in labels

    Three files are written below ``{subject_main_dir}/{subject_type}``: an
    interactive Plotly page (HTML), a PNG export of that Plotly figure and a
    simpler matplotlib line chart (PNG). The matplotlib chart groups by the raw
    values of ``decade_column``; ``interval`` is only forwarded to the Plotly
    chart.
    
    Parameters:
    model (BERTopic): Fitted BERTopic model
    processed_df (DataFrame): DataFrame with assigned topics in 'bertopic_topic'
    decade_column (str): Name of the column with time information
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Number of top topics to visualize
    interval (str): Time interval for grouping ('year', 'month', 'day', 'decade')
    
    Returns:
    dict: On success the relative paths of the written files ("visualizations"),
          the result "type" and the plotted "topics". On any failure a single
          "error" entry holding the message and the traceback.
    """
    try:
        # Ensure decade column exists
        if decade_column not in processed_df.columns:
            return {"error": f"Time column '{decade_column}' not found in DataFrame"}
        
        # Create the topic evolution visualization
        fig = create_topic_evolution(
            model=model,
            df=processed_df,
            topic_column='bertopic_topic',  # Column with assigned topics
            time_column=decade_column,
            interval=interval,
            top_n=top_n_topics,
            title=f"Topic Evolution for {column_name} Over Time"
        )
        
        # Save the visualization
        Path(f"{subject_main_dir}/{subject_type}").mkdir(parents=True, exist_ok=True)
        
        html_filename = f"{column_name}_topics_over_time.html"
        fig.write_html(f"{subject_main_dir}/{subject_type}/{html_filename}")
        
        # Also save as PNG for static viewing/embedding
        png_filename = f"{column_name}_topics_over_time.png"
        fig.write_image(f"{subject_main_dir}/{subject_type}/{png_filename}")
        
        # Prepare the data for the matplotlib version BEFORE opening a figure,
        # so a failure here cannot leave a figure behind.
        topic_counts = processed_df.groupby([decade_column, 'bertopic_topic']).size().unstack(fill_value=0)
        
        # Get top topics (the outlier topic -1 is excluded)
        topic_info = model.get_topic_info()
        filtered_topics = topic_info[topic_info['Topic'] != -1]
        top_topics = filtered_topics.nlargest(top_n_topics, 'Count')['Topic'].tolist()
        
        # Filter to available top topics
        available_topics = [t for t in top_topics if t in topic_counts.columns]
        if available_topics:
            topic_counts = topic_counts[available_topics]
        
        # Share of each topic among the selected topics, per time value
        topic_props = topic_counts.div(topic_counts.sum(axis=1), axis=0) * 100
        
        # Create topic labels
        topic_labels = {}
        for topic in available_topics:
            words = model.get_topic(topic)
            if words:
                top_words = ", ".join([word for word, _ in words[:3]])
                topic_labels[topic] = f"Topic {topic}: {top_words}"
            else:
                topic_labels[topic] = f"Topic {topic}"
        
        # Simpler matplotlib version for better embedding in reports
        mpl_filename = f"{column_name}_topics_over_time_mpl.png"
        mpl_fig, ax = plt.subplots(figsize=(12, 8))
        try:
            for topic in topic_props.columns:
                if topic in topic_labels:
                    ax.plot(topic_props.index, topic_props[topic], marker='o',
                            linewidth=2, label=topic_labels[topic])
            
            ax.set_title(f"Topic Trends for {column_name}", fontsize=14)
            ax.set_xlabel(decade_column, fontsize=12)
            ax.set_ylabel("Topic Proportion (%)", fontsize=12)
            ax.grid(True, alpha=0.3)
            if topic_labels:
                # A legend without labelled lines only triggers a warning
                ax.legend(loc='best')
            mpl_fig.tight_layout()
            
            mpl_fig.savefig(f"{subject_main_dir}/{subject_type}/{mpl_filename}", dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even when plotting or saving fails
            plt.close(mpl_fig)
        
        return {
            "visualizations": [
                f"{subject_type}/{html_filename}",
                f"{subject_type}/{png_filename}",
                f"{subject_type}/{mpl_filename}"
            ],
            "type": "topics_over_time",
            "topics": available_topics
        }
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topics over time visualization: {str(e)}\n{traceback.format_exc()}"}

def _coerce_time_values(values, column_label):
    """
    Return a time column as datetime values, leaving the input untouched
    
    Parameters:
    values (Series): Raw time column
    column_label (str): Column name, used only in the error message
    
    Returns:
    Series: Datetime version of ``values``
    
    Raises:
    ValueError: If the values cannot be interpreted as dates
    """
    conversion_errors = (ValueError, TypeError, OverflowError)
    
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    
    if pd.api.types.is_numeric_dtype(values):
        # pd.to_datetime reads bare numbers as nanoseconds since the epoch, so a
        # column of years (1990, 2000, ...) would silently collapse into 1970.
        # Try them as four-digit calendar years first.
        try:
            year_text = values.map(lambda v: None if pd.isna(v) else f"{int(v):04d}")
            return pd.to_datetime(year_text, format='%Y')
        except conversion_errors:
            pass  # Not calendar years; fall back to the generic parsing below
    
    try:
        return pd.to_datetime(values)
    except conversion_errors:
        # Try treating as year values
        try:
            return pd.to_datetime(values.astype(str) + '-01-01')
        except conversion_errors as err:
            raise ValueError(f"Could not convert {column_label} to datetime format") from err


def create_topic_distribution_over_time(model, df, topic_column, time_column, interval='year',
                                    top_n_topics=5, title="Topic Distribution Over Time"):
    """
    Create a stacked area chart showing topic distribution over time with topic numbers in labels
    
    The caller's DataFrame is never modified: the two needed columns are copied
    and the time column is converted on the copy. Numeric time columns are read
    as calendar years when possible.
    
    Parameters:
    model: BERTopic model
    df (DataFrame): DataFrame with topics assigned
    topic_column (str): Column name containing the assigned topics
    time_column (str): Column name containing the time information
    interval (str): Time interval for grouping ('year', 'month', 'day', 'decade')
    top_n_topics (int): Number of top topics to show
    title (str): Title for the visualization
    
    Returns:
    plotly.graph_objects.Figure: Interactive stacked area chart
    
    Raises:
    ValueError: If the time column cannot be converted to datetime or the
                interval is not supported
    """
    # Work on a private copy of the two columns that are actually used
    data = df[[topic_column, time_column]].copy()
    data[time_column] = _coerce_time_values(data[time_column], time_column)
    
    # Add interval column for grouping
    if interval == 'year':
        data['interval'] = data[time_column].dt.year
    elif interval == 'month':
        data['interval'] = data[time_column].dt.to_period('M').astype(str)
    elif interval == 'day':
        data['interval'] = data[time_column].dt.date
    elif interval == 'decade':
        data['interval'] = (data[time_column].dt.year // 10) * 10
    else:
        raise ValueError(f"Unsupported interval: {interval}")
    
    # Get top N topics by count (excluding outliers); value_counts is already
    # ordered from most to least frequent
    topic_counts = data[topic_column].value_counts()
    top_topics = [t for t in topic_counts.index if t != -1][:top_n_topics]
    
    # Everything outside the top topics (outliers included) becomes "Other"
    top_topic_set = set(top_topics)
    data['topic_group'] = data[topic_column].apply(
        lambda x: x if x in top_topic_set else 'Other')
    
    # Group by interval and topic, count documents
    grouped = data.groupby(['interval', 'topic_group']).size().unstack(fill_value=0)
    
    # Share of each topic group among all documents of the interval
    props = grouped.div(grouped.sum(axis=1), axis=0) * 100
    
    # Get topic labels with top words
    topic_labels = {}
    for topic in top_topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"Topic {topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"
    
    topic_labels['Other'] = "Other Topics"
    
    # Create stacked area chart
    fig = go.Figure()
    
    # Add area for each topic
    for topic in props.columns:
        if topic in topic_labels:
            fig.add_trace(go.Scatter(
                x=props.index,
                y=props[topic],
                mode='lines',
                stackgroup='one',
                name=topic_labels[topic]
            ))
    
    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title=f"Time ({interval.capitalize()})",
        yaxis_title="Topic Proportion (%)",
        legend_title="Topics",
        hovermode="x unified",
        height=600,
        width=1000,
        font=dict(size=12)
    )
    
    return fig

def visualize_topic_distribution_over_time(model, processed_df, time_column, subject_main_dir,
                                        subject_type, column_name, top_n_topics=5, interval='decade'):
    """
    Build the stacked-area topic distribution chart and store it as HTML and PNG
    
    Both files are written below ``{subject_main_dir}/{subject_type}``; the
    directory is created when it does not exist yet.
    
    Parameters:
    model (BERTopic): Fitted BERTopic model
    processed_df (DataFrame): DataFrame with assigned topics in 'bertopic_topic'
    time_column (str): Name of the column with time information
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Number of top topics to visualize
    interval (str): Time interval for grouping ('year', 'month', 'day', 'decade')
    
    Returns:
    dict: Relative paths of the written files plus the result type, or a single
          "error" entry (message and traceback) when anything fails
    """
    try:
        # Nothing to plot without the time information
        if time_column not in processed_df.columns:
            return {"error": f"Time column '{time_column}' not found in DataFrame"}
        
        chart_options = dict(
            model=model,
            df=processed_df,
            topic_column='bertopic_topic',  # Column with assigned topics
            time_column=time_column,
            interval=interval,
            top_n_topics=top_n_topics,
            title=f"Topic Distribution for {column_name} Over Time",
        )
        area_chart = create_topic_distribution_over_time(**chart_options)
        
        Path(f"{subject_main_dir}/{subject_type}").mkdir(parents=True, exist_ok=True)
        
        # Interactive page first, static image for embedding second
        exports = (
            (f"{column_name}_topic_distribution_over_time.html", area_chart.write_html),
            (f"{column_name}_topic_distribution_over_time.png", area_chart.write_image),
        )
        written = []
        for export_name, export in exports:
            export(f"{subject_main_dir}/{subject_type}/{export_name}")
            written.append(f"{subject_type}/{export_name}")
        
        return {"visualizations": written, "type": "topic_distribution_over_time"}
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic distribution over time: {str(e)}\n{traceback.format_exc()}"}

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple, Union, Optional, Any
import umap

def _peak_probability(entry):
    """
    Return the largest probability contained in one document's entry
    
    ``entry`` may be an array-like row of per-topic probabilities, a plain
    iterable or a single number. Empty rows count as probability 0.
    """
    if hasattr(entry, 'max'):
        # numpy rows and numpy scalars; an empty row has no maximum
        return entry.max() if getattr(entry, 'size', 1) > 0 else 0
    if hasattr(entry, '__iter__'):
        return max(entry) if len(entry) > 0 else 0
    return entry


def create_topic_distribution(model, topics, probabilities, title="Topic Probability Distribution"):
    """
    Visualize the distribution of topic probabilities with topic numbers in labels
    
    The left panel shows the 20 topics with the most documents, the right panel
    the 20 topics with the highest mean (per-document maximum) probability.
    When no probabilities are given the right panel only carries a notice.
    
    Parameters:
    model: BERTopic model
    topics (list): List of assigned topics for each document
    probabilities (list/array/None): Topic probabilities for each document;
        either one value per document or one row of probabilities per document.
        None is accepted and disables the probability panel.
    title (str): Title for the visualization
    
    Returns:
    matplotlib.figure.Figure: The figure containing the distribution plot
    """
    # Count documents per topic
    topic_counts = pd.DataFrame({'topic': topics})['topic'].value_counts().sort_index()
    
    # Collect, per topic, the peak probability of each of its documents.
    # Iterating in lockstep covers 2D arrays (rows), lists of arrays and 1D
    # arrays alike.
    topic_probs = {}
    if probabilities is not None:
        for topic, doc_probability in zip(topics, probabilities):
            topic_probs.setdefault(topic, []).append(_peak_probability(doc_probability))
    
    # Calculate mean probability per topic
    topic_mean_probs = {topic: np.mean(probs) for topic, probs in topic_probs.items() if len(probs) > 0}
    
    # Create plot with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    
    # Get topic labels with top words
    topic_labels = {}
    for topic in topic_counts.index:
        if topic == -1:
            topic_labels[topic] = "Topic -1 (Outliers)"
            continue
            
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"T{topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"
    
    # Plot topic counts
    topic_df = pd.DataFrame({
        'Topic': [topic_labels.get(t, f"Topic {t}") for t in topic_counts.index],
        'Count': topic_counts.values,
        'RawTopic': topic_counts.index
    })
    topic_df = topic_df.sort_values('Count', ascending=False)
    
    # For top 20 topics only
    top_topics_df = topic_df.head(20)
    
    # Plot with shorter topic labels for readability
    ax1.bar(range(len(top_topics_df)), top_topics_df['Count'], 
           tick_label=[f"T{t}" for t in top_topics_df['RawTopic']])
    ax1.set_title('Top 20 Topics by Document Count')
    ax1.set_xlabel('Topic')
    ax1.set_ylabel('Document Count')
    ax1.tick_params(axis='x', rotation=90)
    
    # Plot mean probabilities
    ax2.set_title('Top 20 Topics by Mean Probability')
    if topic_mean_probs:
        prob_df = pd.DataFrame({
            'Topic': [topic_labels.get(t, f"Topic {t}") for t in topic_mean_probs.keys()],
            'Mean Probability': list(topic_mean_probs.values()),
            'RawTopic': list(topic_mean_probs.keys())
        })
        prob_df = prob_df.sort_values('Mean Probability', ascending=False)
        
        # For top 20 topics only
        top_prob_df = prob_df.head(20)
        
        # Plot with shorter topic labels for readability
        ax2.bar(range(len(top_prob_df)), top_prob_df['Mean Probability'],
               tick_label=[f"T{t}" for t in top_prob_df['RawTopic']])
        ax2.set_xlabel('Topic')
        ax2.set_ylabel('Mean Probability')
        ax2.tick_params(axis='x', rotation=90)
    else:
        # Keep the count panel usable instead of failing the whole figure
        ax2.text(0.5, 0.5, 'No topic probabilities available',
                 ha='center', va='center', transform=ax2.transAxes)
    
    # Title goes on after the layout pass and is lifted above the panels
    fig.tight_layout()
    fig.suptitle(title, fontsize=16, y=1.05)
    
    return fig

def visualize_topic_distribution(model, topics, probabilities, subject_main_dir, subject_type, column_name):
    """
    Visualize the distribution of topics and their probabilities with topic numbers in labels
    
    A static matplotlib chart is always written first. An interactive Plotly
    bar chart (HTML plus PNG) is attempted afterwards on a best-effort basis:
    if it fails, a warning is printed and the files written so far are still
    reported.
    
    Parameters:
    model: BERTopic model
    topics (list): List of assigned topics for each document
    probabilities (list/array): Topic probabilities for each document
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    
    Returns:
    dict: "visualizations" (relative paths of every file that was written) and
          "type". When the interactive part failed, the legacy "visualization"
          key with the static chart is included as well. On failure of the
          static chart a single "error" entry with message and traceback.
    """
    try:
        # Create the distribution plot
        fig = create_topic_distribution(
            model=model,
            topics=topics,
            probabilities=probabilities,
            title=f"{column_name} Topic Distribution"
        )
        
        # Save the plot; the figure is released even when saving fails
        filename = f"{column_name}_topic_distribution.png"
        try:
            Path(f"{subject_main_dir}/{subject_type}").mkdir(parents=True, exist_ok=True)
            fig.savefig(f"{subject_main_dir}/{subject_type}/{filename}", bbox_inches='tight', dpi=300)
        finally:
            plt.close(fig)
        
        # Every file that really exists on disk is recorded here as it is written
        written = [f"{subject_type}/{filename}"]
        
        # Also create a more interactive visualization with Plotly
        try:
            # Count documents per topic
            topic_df = pd.DataFrame({'topic': topics})
            topic_counts = topic_df['topic'].value_counts().sort_index()
            
            # Get topic labels
            topic_labels = {}
            for topic in topic_counts.index:
                if topic == -1:
                    topic_labels[topic] = "Topic -1 (Outliers)"
                    continue
                    
                words = model.get_topic(topic)
                if words:
                    top_words = ", ".join([word for word, _ in words[:3]])
                    topic_labels[topic] = f"Topic {topic}: {top_words}"
                else:
                    topic_labels[topic] = f"Topic {topic}"
            
            # Create interactive bar chart
            plotly_df = pd.DataFrame({
                'Topic': [topic_labels.get(t, f"Topic {t}") for t in topic_counts.index],
                'Count': topic_counts.values,
                'RawTopic': topic_counts.index
            })
            plotly_df = plotly_df.sort_values('Count', ascending=False)
            
            # Limit to top 30 topics for readability
            plotly_df = plotly_df.head(30)
            
            fig_plotly = px.bar(
                plotly_df, 
                x='RawTopic', 
                y='Count', 
                hover_data=['Topic'],
                labels={'Count': 'Document Count', 'RawTopic': 'Topic ID'},
                title=f"{column_name} Topic Distribution (Top 30 Topics)",
                color='Count',
                color_continuous_scale='viridis'
            )
            
            # Update layout
            fig_plotly.update_layout(
                xaxis_title="Topic ID",
                yaxis_title="Document Count",
                height=600,
                width=1000,
                font=dict(size=12)
            )
            
            # Save as HTML
            html_filename = f"{column_name}_topic_distribution.html"
            fig_plotly.write_html(f"{subject_main_dir}/{subject_type}/{html_filename}")
            written.append(f"{subject_type}/{html_filename}")
            
            # Also save as PNG (static export may fail independently of the
            # HTML export, which is why the HTML was recorded above already)
            plotly_png_filename = f"{column_name}_topic_distribution_plotly.png"
            fig_plotly.write_image(f"{subject_main_dir}/{subject_type}/{plotly_png_filename}")
            written.append(f"{subject_type}/{plotly_png_filename}")
            
        except Exception as e:
            print(f"Warning: Could not create interactive visualization: {str(e)}")
            return {
                # Legacy single-path key, kept for existing callers
                "visualization": f"{subject_type}/{filename}",
                "visualizations": written,
                "type": "topic_distribution"
            }
        
        return {
            "visualizations": written,
            "type": "topic_distribution"
        }
            
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic distribution: {str(e)}\n{traceback.format_exc()}"}

def create_document_topic_map(model, embeddings, topics, probabilities=None, top_n=500, 
                             sample_method='random', title="Document-Topic Map"):
    """
    Build an interactive 2D scatter plot of documents, colored by topic label
    
    When there are more than ``top_n`` documents, a subset is taken first:
    the ``top_n`` documents with the highest probability if
    ``sample_method='probability'`` and probabilities are given, a random
    subset in every other case. The (sub)set of embeddings is then projected
    to two dimensions with UMAP.
    
    Parameters:
    model: BERTopic model
    embeddings (array): Document embeddings
    topics (list): Topic assignments
    probabilities (list/array, optional): Topic probabilities
    top_n (int): Number of documents to visualize
    sample_method (str): Method to sample documents ('random', 'probability')
    title (str): Title for the visualization
    
    Returns:
    plotly.graph_objects.Figure: Interactive scatter plot
    """
    def peak_of(prob):
        # Largest value of one probability entry (array, iterable or scalar)
        if hasattr(prob, 'max'):
            return prob.max()
        if hasattr(prob, '__iter__') and len(prob) > 0:
            return max(prob)
        return prob
    
    def label_for(topic):
        # Outliers get a fixed label, other topics their id plus top three words
        if topic == -1:
            return "Topic -1 (Outliers)"
        words = model.get_topic(topic)
        if not words:
            return f"Topic {topic}"
        top_words = ", ".join([word for word, _ in words[:3]])
        return f"Topic {topic}: {top_words}"
    
    embeddings = np.array(embeddings)
    topics = np.array(topics)
    
    # Downsample only when there are more documents than requested
    if len(embeddings) > top_n:
        by_probability = sample_method == 'probability' and probabilities is not None
        if by_probability:
            if getattr(probabilities, 'ndim', None) == 2:
                # One row of probabilities per document
                probs = np.array([prob.max() for prob in probabilities])
            else:
                probs = np.array([peak_of(prob) for prob in probabilities])
            
            # argsort is ascending, so the tail holds the most confident documents
            indices = np.argsort(probs)[-top_n:]
        else:
            # 'random' and every unrecognised method
            indices = np.random.choice(len(embeddings), top_n, replace=False)
        
        embeddings = embeddings[indices]
        topics = topics[indices]
    
    # Project to 2D for plotting
    projection = umap.UMAP(n_components=2, random_state=42, metric='cosine').fit_transform(embeddings)
    
    topic_labels = {topic: label_for(topic) for topic in np.unique(topics)}
    
    points = pd.DataFrame({
        'x': projection[:, 0],
        'y': projection[:, 1],
        'topic': topics
    })
    points['topic_label'] = points['topic'].map(topic_labels)
    
    fig = px.scatter(
        points, x='x', y='y', color='topic_label',
        hover_data=['topic_label'],
        color_discrete_sequence=px.colors.qualitative.Bold,
        title=title
    )
    
    fig.update_layout(
        height=800,
        width=1000,
        legend_title="Topics",
        font=dict(size=12),
        legend=dict(itemsizing='constant')
    )
    
    # Marker styling for the scatter traces
    marker_style = dict(size=8, opacity=0.7, line=dict(width=1, color='white'))
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    
    return fig

def visualize_document_topic_map(model, embeddings, topics, probabilities, subject_main_dir, 
                                subject_type, column_name, top_n=500):
    """
    Build the document-topic scatter map and store it as HTML and PNG
    
    Both files are written below ``{subject_main_dir}/{subject_type}``; the
    directory is created when it does not exist yet.
    
    Parameters:
    model: BERTopic model
    embeddings (array): Document embeddings
    topics (list): Topic assignments
    probabilities (list/array): Topic probabilities
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n (int): Number of documents to visualize
    
    Returns:
    dict: Relative paths of the written files plus the result type, or a single
          "error" entry (message and traceback) when anything fails
    """
    try:
        map_options = dict(
            model=model,
            embeddings=embeddings,
            topics=topics,
            probabilities=probabilities,
            top_n=top_n,
            title=f"{column_name} Document-Topic Map",
        )
        document_map = create_document_topic_map(**map_options)
        
        Path(f"{subject_main_dir}/{subject_type}").mkdir(parents=True, exist_ok=True)
        
        # Interactive page first, static image second
        exports = (
            (f"{column_name}_document_topic_map.html", document_map.write_html),
            (f"{column_name}_document_topic_map.png", document_map.write_image),
        )
        written = []
        for export_name, export in exports:
            export(f"{subject_main_dir}/{subject_type}/{export_name}")
            written.append(f"{subject_type}/{export_name}")
        
        return {"visualizations": written, "type": "document_topic_map"}
    except Exception as e:
        import traceback
        return {"error": f"Failed to create document topic map: {str(e)}\n{traceback.format_exc()}"}

def create_topic_class_distribution(model, df, topic_column, class_column, 
                                  topics=None, top_n=10, normalize=True,
                                  title="Topic Distribution by Class"):
    """
    Visualize the distribution of topics across different classes with topic numbers in labels
    
    With ``normalize=True`` each cell is the percentage (0-100) of a class's
    documents that fall into the topic. The percentage is taken over ALL
    documents of the class, including outliers and topics that are not shown,
    so a row does not have to add up to 100.
    
    Parameters:
    model: BERTopic model
    df (DataFrame): DataFrame with topics assigned (not modified)
    topic_column (str): Column name containing the assigned topics
    class_column (str): Column name containing the class information
    topics (list, optional): List of topics to include
    top_n (int): Number of top topics to show
    normalize (bool): Whether to normalize counts to percentages
    title (str): Title for the visualization
    
    Returns:
    plotly.graph_objects.Figure: Interactive heatmap figure
    """
    # If topics not specified, get top N non-outlier topics by count
    # (value_counts is already ordered from most to least frequent)
    if topics is None:
        ranked_topics = df[topic_column].value_counts().index
        topics = [t for t in ranked_topics if t != -1][:top_n]
    
    # Create topic labels with top words
    topic_labels = {}
    for topic in topics:
        words = model.get_topic(topic)
        if words:
            top_words = ", ".join([word for word, _ in words[:3]])
            topic_labels[topic] = f"Topic {topic}: {top_words}"
        else:
            topic_labels[topic] = f"Topic {topic}"
    
    # Group by class and topic, count documents
    cross_tab = pd.crosstab(
        df[class_column], 
        df[topic_column],
        normalize='index' if normalize else False
    )
    if normalize:
        # crosstab yields row fractions in [0, 1]; the colorbar promises percent
        cross_tab = cross_tab * 100
    
    # Keep only selected topics
    available_topics = [t for t in topics if t in cross_tab.columns]
    if available_topics:
        cross_tab = cross_tab[available_topics]
    
    # Rename columns with topic labels
    cross_tab = cross_tab.rename(columns=topic_labels)
    
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=cross_tab.values,
        x=cross_tab.columns,
        y=cross_tab.index,
        colorscale='Viridis',
        colorbar=dict(
            title="Percentage" if normalize else "Count",
        )
    ))
    
    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Topics",
        yaxis_title=class_column,
        height=600,
        width=1000,
        font=dict(size=12),
        xaxis={'tickangle': 45}
    )
    
    return fig

def visualize_topic_class_distribution(model, processed_df, class_column, subject_main_dir,
                                     subject_type, column_name, top_n_topics=10):
    """
    Build the topic-by-class heatmap and store it as HTML and PNG
    
    Both files are written below ``{subject_main_dir}/{subject_type}``; the
    directory is created when it does not exist yet.
    
    Parameters:
    model: BERTopic model
    processed_df (DataFrame): DataFrame with assigned topics in 'bertopic_topic'
    class_column (str): Column name containing the class information
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis
    column_name (str): Name of the column analyzed
    top_n_topics (int): Number of top topics to visualize
    
    Returns:
    dict: Relative paths of the written files plus the result type, or a single
          "error" entry (message and traceback) when anything fails
    """
    try:
        # Nothing to compare without the class information
        if class_column not in processed_df.columns:
            return {"error": f"Class column '{class_column}' not found in DataFrame"}
        
        heatmap_options = dict(
            model=model,
            df=processed_df,
            topic_column='bertopic_topic',  # Column with assigned topics
            class_column=class_column,
            top_n=top_n_topics,
            normalize=True,  # Use percentages
            title=f"Topic Distribution for {column_name} by {class_column}",
        )
        heatmap = create_topic_class_distribution(**heatmap_options)
        
        Path(f"{subject_main_dir}/{subject_type}").mkdir(parents=True, exist_ok=True)
        
        # Interactive page first, static image second
        exports = (
            (f"{column_name}_topic_by_{class_column}.html", heatmap.write_html),
            (f"{column_name}_topic_by_{class_column}.png", heatmap.write_image),
        )
        written = []
        for export_name, export in exports:
            export(f"{subject_main_dir}/{subject_type}/{export_name}")
            written.append(f"{subject_type}/{export_name}")
        
        return {"visualizations": written, "type": "topic_class_distribution"}
    except Exception as e:
        import traceback
        return {"error": f"Failed to create topic class distribution: {str(e)}\n{traceback.format_exc()}"}

### Main BERTopic

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import traceback
import os
from typing import List, Dict, Tuple, Union, Optional, Any

def _build_bertopic_model(language="english", nr_topics="auto", min_topic_size=10):
    """
    Build an unfitted BERTopic model whose vectorizer removes NLTK stopwords
    
    Parameters:
    language (str): Language passed to BERTopic and used to pick the NLTK stopword list
    nr_topics (int or str): Number of topics to find ('auto' or specific number)
    min_topic_size (int): Minimum size of topics
    
    Returns:
    BERTopic or None: The configured model, or None if the stopword corpus cannot be imported
    """
    # A missing core package raises ImportError here and is reported by the caller
    from bertopic import BERTopic
    import nltk
    from sklearn.feature_extraction.text import CountVectorizer

    try:
        # Download stopwords only if they are not available locally
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

        from nltk.corpus import stopwords
        vectorizer = CountVectorizer(stop_words=stopwords.words(language))

        return BERTopic(
            language=language,
            nr_topics=nr_topics,
            min_topic_size=min_topic_size,
            vectorizer_model=vectorizer,
            calculate_probabilities=True  # Always calculate probabilities for better visualizations
        )
    except ImportError:
        print("Please install required packages with: pip install bertopic nltk")
        return None


def _fit_bertopic_with_embeddings(model, documents):
    """
    Fit the model and return the embeddings needed by the document visualizations
    
    Parameters:
    model (BERTopic): Unfitted BERTopic model
    documents (list): Texts to fit the model on
    
    Returns:
    tuple: (topics, probabilities, embeddings)
    """
    embeddings = None

    # Only pre-compute sentence-transformers embeddings for larger datasets
    if len(documents) > 100:
        try:
            from sentence_transformers import SentenceTransformer

            # Use a smaller multilingual model for efficiency
            encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
            embeddings = encoder.encode(documents, show_progress_bar=True)
        except ImportError:
            embeddings = None

        if embeddings is not None:
            topics, probabilities = model.fit_transform(documents, embeddings)
            print("Used sentence-transformers for document embedding")
            return topics, probabilities, embeddings

    topics, probabilities = model.fit_transform(documents)
    # After fitting, model.embedding_model is a BERTopic backend; backends expose
    # embed_documents() (there is no transform() method on them).
    embeddings = model.embedding_model.embed_documents(documents)
    if len(documents) > 100:
        print("Used default BERTopic embedding")
    return topics, probabilities, embeddings


def _max_topic_probabilities(probabilities, n_documents):
    """
    Reduce BERTopic's probability output to one value per document
    
    Parameters:
    probabilities: 2D array, 1D array, list of per-document sequences, or None
    n_documents (int): Number of fitted documents (used when probabilities is None)
    
    Returns:
    sequence: One probability per document (NaN when none were produced)
    """
    if probabilities is None:
        return [np.nan] * n_documents

    ndim = getattr(probabilities, 'ndim', None)
    if ndim == 2:
        return probabilities.max(axis=1)
    if ndim == 1:
        return probabilities

    # List of arrays or other per-document structures
    return [
        prob.max() if hasattr(prob, 'max') else
        (max(prob) if hasattr(prob, '__len__') and len(prob) > 0 else 0)
        for prob in probabilities
    ]


def _collect_visualizations(results, outcome, label):
    """
    Merge one visualization helper's result dict into results["visualizations"]
    
    Parameters:
    results (dict): Aggregated analysis results (modified in place)
    outcome (dict): Helper result with "visualization", "visualizations" or "error"
    label (str): Text used in the "Error generating ..." message
    """
    if "error" in outcome:
        print(f"Error generating {label}: {outcome['error']}")
        return
    if "visualization" in outcome:
        results["visualizations"].append(outcome["visualization"])
    if "visualizations" in outcome:
        results["visualizations"].extend(outcome["visualizations"])


def _save_plotly_figure(fig, subject_main_dir, subject_type, stem):
    """
    Save a Plotly figure as <stem>.html and <stem>.png in the subject directory
    
    Parameters:
    fig: Figure exposing write_html() and write_image()
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Sub-directory of the current analysis
    stem (str): File name without extension
    
    Returns:
    list: Paths of both files relative to subject_main_dir
    """
    from pathlib import Path

    target_dir = Path(subject_main_dir) / subject_type
    html_name = f"{stem}.html"
    png_name = f"{stem}.png"
    fig.write_html(str(target_dir / html_name))
    fig.write_image(str(target_dir / png_name))
    return [f"{subject_type}/{html_name}", f"{subject_type}/{png_name}"]


def _build_summary_html(column_name, topic_count, visualizations):
    """
    Build the HTML overview page that embeds or links every saved visualization
    
    Parameters:
    column_name (str): Name of the analyzed column (shown in the headings)
    topic_count (int): Number of topics reported in the introduction
    visualizations (list): Relative paths of saved visualizations; non-strings are ignored
    
    Returns:
    str: Complete HTML document
    """
    from html import escape

    heading = escape(str(column_name))
    parts = [f"""
            <html>
            <head>
                <meta charset="utf-8">
                <title>{heading} Topic Modeling Analysis</title>
                <style>
                    body {{ font-family: Arial, sans-serif; margin: 20px; }}
                    h1, h2 {{ color: #2c3e50; }}
                    .viz-container {{ display: flex; flex-wrap: wrap; gap: 20px; }}
                    .viz-item {{ 
                        border: 1px solid #ddd; 
                        border-radius: 5px; 
                        padding: 15px; 
                        margin-bottom: 15px;
                        width: 45%;
                    }}
                    .viz-item h3 {{ margin-top: 0; }}
                    .viz-item img {{ max-width: 100%; height: auto; }}
                    a {{ color: #3498db; text-decoration: none; }}
                    a:hover {{ text-decoration: underline; }}
                </style>
            </head>
            <body>
                <h1>{heading} Topic Modeling Analysis</h1>
                <p>Analysis performed with BERTopic using {topic_count} topics.</p>
                
                <h2>Available Visualizations:</h2>
                <div class="viz-container">
            """]

    for viz in visualizations:
        if not isinstance(viz, str):
            continue

        viz_name = viz.split('/')[-1]
        viz_title = escape(
            viz_name.replace('_', ' ').replace('.html', '').replace('.png', '').title()
        )
        # Paths are relative to subject_main_dir while the summary lives one level below it
        viz_href = escape(viz, quote=True)

        parts.append(f"""
                    <div class="viz-item">
                        <h3>{viz_title}</h3>
                    """)

        if viz_name.endswith(('.png', '.jpg', '.jpeg', '.gif')):
            parts.append(f"""
                        <img src="../{viz_href}" alt="{viz_title}">
                        <p><a href="../{viz_href}" target="_blank">Open full-size image</a></p>
                        """)
        elif viz_name.endswith('.html'):
            parts.append(f"""
                        <iframe src="../{viz_href}" width="100%" height="300px"></iframe>
                        <p><a href="../{viz_href}" target="_blank">Open interactive visualization</a></p>
                        """)
        else:
            parts.append(f"""
                        <p><a href="../{viz_href}" target="_blank">Open visualization</a></p>
                        """)

        parts.append("</div>\n")

    parts.append("""
                </div>
            </body>
            </html>
            """)
    return "".join(parts)


def run_comprehensive_bertopic_analysis(
    df, 
    column_name, 
    time_column=None,
    class_column=None,
    subject_main_dir="analyze_dataset", 
    language="english", 
    nr_topics="auto", 
    min_topic_size=10, 
    top_n_topics=10,
    visualize_documents=True,
    sample_documents=500
):
    """
    Run comprehensive BERTopic analysis on text data with all visualizations including topic numbers in labels
    
    The text column is validated first, so invalid input is reported without
    importing BERTopic or creating any directory. Each visualization step is
    independent: a failing step prints an error and the analysis continues.
    
    Parameters:
    df (DataFrame): DataFrame containing the text column
    column_name (str): Name of the column to analyze
    time_column (str, optional): Name of the column with time information for temporal analysis
    class_column (str, optional): Name of the column with class information for class distribution
    subject_main_dir (str/Path): Main directory for all subject analyses
    language (str): Language for stopwords removal and preprocessing
    nr_topics (int or str): Number of topics to find ('auto' or specific number)
    min_topic_size (int): Minimum size of topics
    top_n_topics (int): Number of top topics to visualize
    visualize_documents (bool): Whether to visualize document-topic maps
    sample_documents (int): Number of documents to sample for document visualizations
    
    Returns:
    dict: Analysis results with keys "column_analyzed", "topic_count",
          "visualizations", "document_topics", "document_topic_probs",
          "processed_df" and "model" (plus "summary" and "topic_info_path"
          when those files were written), or {"error": message} on failure
    """
    # Print status
    print(f"Running enhanced BERTopic analysis on '{column_name}'...")
    subject_type = column_name.lower()
    
    try:
        import os
        from pathlib import Path
        
        # Keep only string entries; for abstracts also drop the placeholder text.
        # Both filters run BEFORE the emptiness check so an all-placeholder column
        # is reported instead of being passed to the model as zero documents.
        keep = df[column_name].apply(lambda value: isinstance(value, str)).astype(bool)
        if column_name == "Abstract":
            keep &= df["Abstract"] != "No abstract available"
        processed_df = df.loc[keep].copy()
        
        if len(processed_df) == 0:
            return {"error": f"No valid text data found in column {column_name}"}
        
        documents = processed_df[column_name].tolist()
        
        # Create the model (raises ImportError if bertopic/nltk/scikit-learn are missing)
        model = _build_bertopic_model(language, nr_topics, min_topic_size)
        if model is None:
            return {"error": "Failed to initialize BERTopic model"}
        
        # Set environment variable to avoid the tokenizers parallelism warning
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        
        # Create subject directory if it doesn't exist
        if subject_main_dir:
            (Path(subject_main_dir) / subject_type).mkdir(parents=True, exist_ok=True)
        
        # Fit the model on our text data
        print(f"Fitting BERTopic model on {len(documents)} documents...")
        topics, probabilities, embeddings = _fit_bertopic_with_embeddings(model, documents)
        
        # Add topics and the per-document maximum probability to the dataframe
        processed_df["bertopic_topic"] = topics
        processed_df["bertopic_probability"] = _max_topic_probabilities(
            probabilities, len(documents)
        )
        
        # Get basic topic information
        topic_info = model.get_topic_info()
        print(f"Identified {len(topic_info)-1} topics (excluding outlier topic)")
        
        results = {
            "column_analyzed": column_name,
            "topic_count": len(model.get_topics()),
            "visualizations": []
        }
        
        # Arguments shared by the visualization helpers defined elsewhere in the notebook
        common = dict(
            model=model,
            subject_main_dir=subject_main_dir,
            subject_type=subject_type,
            column_name=column_name
        )
        
        # 1. Topic Word Clouds
        print("Generating topic wordclouds...")
        _collect_visualizations(
            results,
            visualize_topic_wordclouds(top_n_topics=top_n_topics, **common),
            "wordclouds"
        )
        
        # 2. Topic Bar Charts
        print("Generating topic bar charts...")
        _collect_visualizations(
            results,
            visualize_topic_barchart(n_topics=top_n_topics, **common),
            "bar charts"
        )
        
        # 3. Term Score Decline
        print("Generating term score decline visualization...")
        _collect_visualizations(
            results,
            visualize_term_rank(n_topics=5, **common),  # Use fewer topics for readability
            "term rank visualization"
        )
        
        # 4. Topic Similarity Heatmap
        print("Generating topic similarity heatmap...")
        try:
            heatmap_fig = create_topic_heatmap(
                model=model,
                title=f"{column_name} Topic Similarity Heatmap"
            )
            results["visualizations"].extend(_save_plotly_figure(
                heatmap_fig, subject_main_dir, subject_type,
                f"{column_name}_topic_similarity"
            ))
        except Exception as e:
            print(f"Error generating similarity heatmap: {str(e)}")
        
        # 5. Topic Hierarchy
        print("Generating topic hierarchy visualization...")
        try:
            hierarchy_fig = create_topic_hierarchy(
                model=model,
                title=f"{column_name} Topic Hierarchy"
            )
            
            if isinstance(hierarchy_fig, plt.Figure):
                # Matplotlib figure: PNG only
                hierarchy_filename = f"{column_name}_topic_hierarchy.png"
                hierarchy_path = Path(subject_main_dir) / subject_type / hierarchy_filename
                hierarchy_fig.savefig(hierarchy_path, bbox_inches='tight', dpi=300)
                plt.close(hierarchy_fig)
                results["visualizations"].append(f"{subject_type}/{hierarchy_filename}")
            else:
                # Plotly figure: HTML and PNG
                results["visualizations"].extend(_save_plotly_figure(
                    hierarchy_fig, subject_main_dir, subject_type,
                    f"{column_name}_topic_hierarchy"
                ))
        except Exception as e:
            print(f"Error generating topic hierarchy: {str(e)}")
        
        # 6. Topic Network
        print("Generating topic network visualization...")
        try:
            network_fig = create_topic_network(
                model=model,
                title=f"{column_name} Topic Similarity Network"
            )
            results["visualizations"].extend(_save_plotly_figure(
                network_fig, subject_main_dir, subject_type,
                f"{column_name}_topic_network"
            ))
        except Exception as e:
            print(f"Error generating topic network: {str(e)}")
        
        # 7. Topic Distribution
        print("Generating topic distribution visualization...")
        _collect_visualizations(
            results,
            visualize_topic_distribution(
                topics=topics,
                probabilities=probabilities,
                **common
            ),
            "topic distribution"
        )
        
        if visualize_documents:
            # 8. Document-Topic Map
            print("Generating document-topic map...")
            try:
                _collect_visualizations(
                    results,
                    visualize_document_topic_map(
                        embeddings=embeddings,
                        topics=topics,
                        probabilities=probabilities,
                        top_n=sample_documents,  # Sample a subset of documents
                        **common
                    ),
                    "document-topic map"
                )
            except Exception as e:
                print(f"Error generating document-topic map: {str(e)}")
            
            # 9. Document DataMap
            print("Generating document datamap visualization...")
            try:
                # A plain slice returns the whole sequence when it is shorter than the limit
                datamap_fig = visualize_document_datamap(
                    model=model,
                    docs=documents[:sample_documents],
                    topics=topics[:sample_documents],
                    embeddings=embeddings[:sample_documents],
                    title=f"{column_name} Document DataMap"
                )
                
                if datamap_fig:
                    results["visualizations"].extend(_save_plotly_figure(
                        datamap_fig, subject_main_dir, subject_type,
                        f"{column_name}_document_datamap"
                    ))
            except Exception as e:
                print(f"Error generating document datamap: {str(e)}")
        
        if time_column and time_column in processed_df.columns:
            # 10. Topics over Time
            print("Generating topics over time visualization...")
            _collect_visualizations(
                results,
                visualize_topics_over_time(
                    processed_df=processed_df,
                    decade_column=time_column,
                    top_n_topics=top_n_topics,
                    **common
                ),
                "topics over time"
            )
            
            # 11. Topic Distribution over Time
            print("Generating topic distribution over time...")
            _collect_visualizations(
                results,
                visualize_topic_distribution_over_time(
                    processed_df=processed_df,
                    time_column=time_column,
                    top_n_topics=top_n_topics,
                    **common
                ),
                "topic distribution over time"
            )
        else:
            print("Skipping temporal analysis: No time column provided or column not found in dataframe")
        
        # 12. Topic Class Distribution (if class column provided)
        if class_column and class_column in processed_df.columns:
            print("Generating topic class distribution visualization...")
            _collect_visualizations(
                results,
                visualize_topic_class_distribution(
                    processed_df=processed_df,
                    class_column=class_column,
                    top_n_topics=top_n_topics,
                    **common
                ),
                "topic class distribution"
            )
        else:
            print("Skipping class distribution analysis: No class column provided or column not found in dataframe")
        
        # 13. Topic Probability Distribution (for the first document)
        try:
            print("Generating topic probability distribution visualization...")
            if hasattr(model, 'visualize_distribution'):
                fig = model.visualize_distribution(probabilities[0], min_probability=0.01)
                results["visualizations"].extend(_save_plotly_figure(
                    fig, subject_main_dir, subject_type,
                    f"{column_name}_topic_probability_distribution"
                ))
        except Exception as e:
            print(f"Error generating probability distribution: {str(e)}")
        
        # 14. Summary page with links to all visualizations
        try:
            print("Generating visualization summary...")
            html_content = _build_summary_html(
                column_name, len(model.get_topics()), results["visualizations"]
            )
            
            summary_filename = f"{column_name}_topic_modeling_summary.html"
            summary_path = Path(subject_main_dir) / subject_type / summary_filename
            # Explicit UTF-8 so non-ASCII column names do not depend on the platform default
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(html_content)
            
            results["summary"] = f"{subject_type}/{summary_filename}"
        except Exception as e:
            print(f"Error generating visualization summary: {str(e)}")
        
        # Save topic info as CSV
        if subject_main_dir:
            topic_info_path = f"{subject_main_dir}/{subject_type}/{column_name}_topic_info.csv"
            topic_info.to_csv(topic_info_path, index=False)
            results["topic_info_path"] = topic_info_path
            print(f"Topic information saved to {topic_info_path}")
        
        # Add topic assignments to results
        results["document_topics"] = topics
        results["document_topic_probs"] = probabilities
        results["processed_df"] = processed_df
        results["model"] = model
        
        print("Comprehensive BERTopic analysis completed successfully!")
        return results
        
    except ImportError as e:
        return {"error": f"Required package missing: {str(e)}. Please install with: pip install bertopic nltk scikit-learn plotly matplotlib pandas"}
    except Exception as e:
        import traceback
        return {"error": f"BERTopic analysis failed: {str(e)}\n{traceback.format_exc()}"}

## Sentiment analysis

In [None]:
def _sentiment_label_to_score(label):
    """
    Convert a classifier label to a numeric sentiment score.
    
    Parameters:
    -----------
    label : str
        Label returned by the pipeline, e.g. "4 stars" or "POSITIVE"
        
    Returns:
    --------
    int
        1-5 for star labels, 5/3/1 for positive/neutral/negative
        (case-insensitive), 0 for any other label
    """
    # Star-based models use labels like "1 star", "2 stars", ...
    if label.startswith(("1 ", "2 ", "3 ", "4 ", "5 ")):
        return int(label[0])
    return {"POSITIVE": 5, "NEUTRAL": 3, "NEGATIVE": 1}.get(label.upper(), 0)


def run_sentiment_analysis(df, column_name, subject_type=None, model_name="nlptown/bert-base-multilingual-uncased-sentiment", 
                           batch_size=32, max_samples=None, save_results=True):
    """
    Perform sentiment analysis on a text column using a multilingual transformer model.
    
    The three sentiment columns are always present in the returned dataframe:
    rows that were not analyzed (missing text or not sampled) get the label
    "N/A" and NaN for confidence and score. Rows are addressed by position,
    so a dataframe with a non-unique index is handled correctly.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the text data
    column_name : str
        The name of the column to analyze
    subject_type : str, optional
        The type of subject for saving results. If None, uses lowercase column_name
    model_name : str, optional
        The HuggingFace model to use for sentiment analysis
    batch_size : int, optional
        Batch size for processing texts
    max_samples : int, optional
        Maximum number of samples to process (for testing)
    save_results : bool, optional
        Whether to save results to disk (under the directory named by the
        SUBJECT_MAIN_DIR environment variable, default 'analyze_dataset')
        
    Returns:
    --------
    pandas.DataFrame
        A copy of the original dataframe with added columns
        "sentiment_label", "sentiment_confidence" and "sentiment_score"
    """
    import os
    import time
    import numpy as np
    import pandas as pd

    print(f"Running sentiment analysis on column: {column_name}")
    
    # Create a copy of the dataframe to avoid modifying the original
    result_df = df.copy()
    
    # Set subject type
    if subject_type is None:
        subject_type = column_name.lower()
    
    # Prepare directory for saving results
    if save_results:
        subject_main_dir = os.environ.get('SUBJECT_MAIN_DIR', 'analyze_dataset')
        save_dir = os.path.join(subject_main_dir, subject_type, "sentiment_analysis")
        os.makedirs(save_dir, exist_ok=True)
        print(f"Results will be saved to: {save_dir}")
    
    # Positions (not labels) of rows with text, so duplicate index labels are safe
    valid_mask = result_df[column_name].notna()
    valid_positions = np.flatnonzero(valid_mask.to_numpy())
    n_valid = len(valid_positions)
    
    # Limit the number of samples if specified
    if max_samples is not None and n_valid > max_samples:
        valid_positions = np.random.choice(valid_positions, max_samples, replace=False)
        print(f"Analyzing {max_samples} random samples out of {n_valid} valid texts")
    else:
        print(f"Analyzing all {n_valid} valid texts")
    
    texts_list = result_df[column_name].iloc[valid_positions].astype(str).tolist()
    
    all_results = []
    if texts_list:
        # Heavy dependencies are imported only when there is something to analyze
        from transformers import pipeline
        import torch

        print(f"Loading model: {model_name}")
        device = 0 if torch.cuda.is_available() else -1
        print(f"Device set to use cuda:{device}" if device >= 0 else "Device set to use CPU")
        sentiment_analyzer = pipeline("sentiment-analysis", model=model_name, device=device)
        
        total_batches = (len(texts_list) + batch_size - 1) // batch_size
        print(f"Processing {total_batches} batches...")
        
        start_time = time.time()
        for batch_num, start in enumerate(range(0, len(texts_list), batch_size), start=1):
            # Progress estimate based on the batches completed so far
            if batch_num > 1:
                elapsed = time.time() - start_time
                avg_time_per_batch = elapsed / (batch_num - 1)
                est_remaining_time = avg_time_per_batch * (total_batches - batch_num + 1)
                print(f"Analyzing sentiment: batch {batch_num}/{total_batches} - " 
                      f"{batch_num/total_batches*100:.1f}% complete - "
                      f"Est. remaining: {est_remaining_time:.1f}s", end='\r')
            
            # Cheap character-level cut to bound tokenization work
            batch = [text[:512] for text in texts_list[start:start + batch_size]]
            
            try:
                # 512 characters can still exceed the model's token limit
                # (e.g. CJK text), so let the tokenizer truncate as well
                all_results.extend(sentiment_analyzer(batch, truncation=True))
            except Exception as e:
                print(f"Error processing batch {batch_num}: {e}")
                # Add placeholder results to maintain alignment with the inputs
                all_results.extend([{"label": "UNKNOWN", "score": 0.0}] * len(batch))
    else:
        print("No valid texts to analyze; skipping model loading")
    
    print("\nSentiment analysis complete!")
    
    # Make sure all sentiment columns exist even when nothing was analyzed
    sentiment_columns = ["sentiment_label", "sentiment_confidence", "sentiment_score"]
    n_rows = len(result_df)
    if "sentiment_label" not in result_df.columns:
        result_df["sentiment_label"] = np.full(n_rows, "N/A", dtype=object)
    for col in ("sentiment_confidence", "sentiment_score"):
        if col not in result_df.columns:
            result_df[col] = np.full(n_rows, np.nan)
    
    # Write the results back by position in one assignment per column
    n_scored = min(len(valid_positions), len(all_results))
    if n_scored:
        scored = all_results[:n_scored]
        labels = [item["label"] for item in scored]
        column_values = {
            "sentiment_label": np.array(labels, dtype=object),
            "sentiment_confidence": np.array([item["score"] for item in scored], dtype=float),
            "sentiment_score": np.array(
                [_sentiment_label_to_score(label) for label in labels], dtype=float
            ),
        }
        rows = valid_positions[:n_scored]
        for col in sentiment_columns:
            result_df.iloc[rows, result_df.columns.get_loc(col)] = column_values[col]
    
    # Rows that were not analyzed keep NaN scores and get an explicit "N/A" label
    result_df["sentiment_label"] = result_df["sentiment_label"].fillna("N/A")
    
    # Generate visualizations if save_results is True
    if save_results:
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Distribution of sentiment scores
        plt.figure(figsize=(10, 6))
        sns.histplot(result_df["sentiment_score"].dropna(), bins=5, kde=True)
        plt.title(f"Distribution of Sentiment Scores for {column_name}")
        plt.xlabel("Sentiment Score (1-5)")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, "sentiment_distribution.png"))
        plt.close()
        
        # Pie chart of sentiment categories
        plt.figure(figsize=(10, 6))
        sentiment_counts = result_df["sentiment_label"].value_counts()
        plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%')
        plt.title(f"Sentiment Categories for {column_name}")
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, "sentiment_categories_pie.png"))
        plt.close()
        
        # Save the results to CSV
        result_df[sentiment_columns + [column_name]].to_csv(
            os.path.join(save_dir, f"{subject_type}_sentiment_analysis.csv"), index=False
        )
        
        print(f"Sentiment analysis results saved to {save_dir}")
    
    return result_df

# Example usage:
# sentiment_df = run_sentiment_analysis(df, "Abstract", max_samples=1000)

### Zero-shot sentiment analysis

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

def _zs_clean_text(text, max_chars=1024):
    """
    Normalise one academic text before classification

    Removes bracketed numeric citation markers such as [1] or [2-4], collapses
    runs of whitespace into single spaces and keeps at most max_chars characters.
    """
    import re

    text = re.sub(r'\[\d+(?:-\d+)?\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Character-level cap applied before tokenisation to bound the input size
    return text[:max_chars]


def _zs_placeholder_result(labels, top_label):
    """Build the zero-score result stored for a text that was skipped or failed"""
    return {
        "labels": list(labels),
        "scores": [0] * len(labels),
        "top_label": top_label,
        "top_score": 0.0,
    }


def _zs_classify_text(classifier, text, labels):
    """
    Classify a single cleaned text

    Returns a dict with the keys "labels", "scores", "top_label" and "top_score".
    Texts shorter than 10 characters get the placeholder label "N/A"; any
    exception raised while classifying is reported and yields the placeholder
    label "ERROR" so that one bad text never aborts the whole run.
    """
    if len(text.strip()) < 10:
        return _zs_placeholder_result(labels, "N/A")

    try:
        result = classifier(text, labels, multi_label=True)
        return {
            "labels": result["labels"],
            "scores": result["scores"],
            "top_label": result["labels"][0],
            "top_score": result["scores"][0],
        }
    except Exception as e:
        print(f"\nError processing text: {str(e)}")
        print(f"Problematic text: {text[:100]}...")
        return _zs_placeholder_result(labels, "ERROR")


def _zs_to_decade(values):
    """
    Convert a column of years, dates or decade strings to numeric decades

    Numeric input is floored to the decade. Anything else is parsed as a date
    first; values that cannot be parsed (e.g. "1990s") fall back to the first
    four-digit number found in their string form. Unresolvable values are NaN.
    """
    import pandas as pd

    if pd.api.types.is_numeric_dtype(values):
        return values // 10 * 10

    try:
        years = pd.to_datetime(values, errors='coerce').dt.year
    except Exception:
        # Parsing can still fail outright (e.g. mixed objects); rely on the
        # digit-extraction fallback for every row in that case
        years = pd.Series(float("nan"), index=values.index)

    fallback_years = values.astype(str).str.extract(r'(\d{4})', expand=False).astype(float)
    return years.fillna(fallback_years) // 10 * 10


def _zs_plot_label_overview(result_df, column_name, top_n_labels, save_dir):
    """Save the top-label bar chart and the confidence distributions of the most frequent labels"""
    import os
    import matplotlib.pyplot as plt
    import seaborn as sns

    # 1. Distribution of data between labels by paper count
    label_counts = result_df["zs_top_label"].value_counts()
    plt.figure(figsize=(14, 10))
    sns.barplot(x=label_counts.values, y=label_counts.index, palette="viridis")
    plt.title(f"Distribution of Papers by Top Label for {column_name}", fontsize=14)
    plt.xlabel("Number of Papers", fontsize=12)
    plt.ylabel("Label", fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "label_distribution_by_paper_count.png"), dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Distribution of confidence scores for the top N most frequent labels
    top_n_frequent_labels = label_counts.nlargest(top_n_labels).index.tolist()

    plt.figure(figsize=(12, 8))
    plotted_any = False
    for label in top_n_frequent_labels:
        scores = result_df.loc[result_df["zs_top_label"] == label, "zs_top_score"]
        # A density estimate needs at least two distinct values
        if scores.nunique() > 1:
            sns.kdeplot(scores, fill=True, label=f"{label} (n={len(scores)})")
            plotted_any = True

    plt.title(f"Confidence Score Distribution for Top Labels - {column_name}", fontsize=14)
    plt.xlabel("Confidence Score", fontsize=12)
    plt.ylabel("Density", fontsize=12)
    if plotted_any:
        plt.legend(loc='best')
    plt.grid(linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "top_labels_confidence_distribution.png"), dpi=300, bbox_inches='tight')
    plt.close()


def _zs_plot_decade_trends(result_df, column_name, decade_column, save_dir):
    """Save the per-decade line chart, most-prominent-label heatmap and stacked area chart"""
    import os
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Work on an explicit copy so adding the helper column never touches result_df
    trend_df = result_df.dropna(subset=[decade_column, "zs_top_label"]).copy()
    trend_df["decade_numeric"] = _zs_to_decade(trend_df[decade_column])
    trend_df = trend_df.dropna(subset=["decade_numeric"])

    if trend_df.empty:
        return

    # Share of each top label within every decade, in percent
    decade_label_counts = pd.crosstab(
        trend_df["decade_numeric"],
        trend_df["zs_top_label"],
        normalize='index'
    ) * 100

    most_prominent_per_decade = decade_label_counts.idxmax(axis=1)

    # Visualization 3a: line chart restricted to labels that led at least one decade
    plt.figure(figsize=(14, 8))
    for label in most_prominent_per_decade.unique():
        if label in decade_label_counts.columns:
            plt.plot(
                decade_label_counts.index,
                decade_label_counts[label],
                marker='o',
                linewidth=2,
                label=label
            )

    plt.title(f"Trends in Label Prevalence by Decade - {column_name}", fontsize=14)
    plt.xlabel("Decade", fontsize=12)
    plt.ylabel("Percentage of Papers (%)", fontsize=12)
    plt.grid(linestyle='--', alpha=0.7)
    plt.legend(loc='best')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "label_trends_by_decade.png"), dpi=300, bbox_inches='tight')
    plt.close()

    # Visualization 3b: binary matrix with a 1 at each decade's most prominent label
    plt.figure(figsize=(14, 10))
    top_label_matrix = pd.DataFrame(0,
                                    index=decade_label_counts.index,
                                    columns=decade_label_counts.columns)
    for decade, top_label in most_prominent_per_decade.items():
        if top_label in top_label_matrix.columns:
            top_label_matrix.loc[decade, top_label] = 1

    sns.heatmap(top_label_matrix, cmap="YlOrRd", cbar=False, linewidths=.5)
    plt.title(f"Most Prominent Label by Decade - {column_name}", fontsize=14)
    plt.xlabel("Label", fontsize=12)
    plt.ylabel("Decade", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "most_prominent_label_by_decade.png"), dpi=300, bbox_inches='tight')
    plt.close()

    # Visualization 3c: stacked area chart. DataFrame.plot.area opens its own
    # figure, so no plt.figure() call here (it would leave an orphan figure open)
    decade_label_counts.plot.area(figsize=(14, 8), alpha=0.7, stacked=True)
    plt.title(f"Relative Distribution of Labels by Decade - {column_name}", fontsize=14)
    plt.xlabel("Decade", fontsize=12)
    plt.ylabel("Percentage of Papers (%)", fontsize=12)
    plt.grid(linestyle='--', alpha=0.7)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "label_distribution_stacked_by_decade.png"), dpi=300, bbox_inches='tight')
    plt.close()


def _zs_plot_score_correlation(result_df, score_cols, column_name, save_dir):
    """Save a lower-triangle heatmap of the correlations between per-label scores"""
    import os
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    plt.figure(figsize=(12, 10))
    corr_matrix = result_df[score_cols].corr()
    # Hide the upper triangle (and diagonal): the matrix is symmetric
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(
        corr_matrix,
        mask=mask,
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        linewidths=.5,
        vmin=-1,
        vmax=1,
        center=0
    )
    plt.title(f"Correlation Between Label Scores - {column_name}", fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "label_score_correlation.png"), dpi=300, bbox_inches='tight')
    plt.close()


def run_zero_shot_analysis(df, column_name, decade_column="decade", subject_type=None, 
                           model_name="facebook/bart-large-mnli", 
                           labels=None, top_n_labels=3,
                           batch_size=16, max_samples=None, save_results=True):
    """
    Analyze academic papers using zero-shot classification with enhanced visualizations
    
    Every non-missing text longer than 10 characters is cleaned, classified
    against all labels (multi-label mode) and the scores are written to new
    "zs_*" columns of a copy of the dataframe. Texts that are skipped or fail
    get the top label "N/A" / "ERROR" and zero scores; rows that were not
    analyzed keep NaN in every "zs_*" column.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the text data
    column_name : str
        The name of the column to analyze
    decade_column : str
        Name of the column containing decade information for trend analysis.
        Trend plots are skipped when the column is absent
    subject_type : str, optional
        The type of subject for saving results. If None, uses lowercase column_name
    model_name : str, optional
        The HuggingFace model to use for zero-shot classification
    labels : list, optional
        List of class labels to use for classification
    top_n_labels : int, optional
        Number of top labels to include in visualizations (default: 3)
    batch_size : int, optional
        Number of texts per progress update. Texts are still sent to the
        classifier one at a time so a failing text only affects itself
    max_samples : int, optional
        Maximum number of samples to process (for testing)
    save_results : bool, optional
        Whether to save plots and a CSV to
        <SUBJECT_MAIN_DIR or "analyze_dataset">/<subject_type>/zero_shot_analysis
        
    Returns:
    --------
    pandas.DataFrame
        A copy of the original dataframe with added classification columns
    """
    import os
    import time

    import numpy as np
    import torch

    # Default labels for antisemitism research if none provided
    if labels is None:
        labels = [
            "historical analysis of antisemitism",
            "contemporary antisemitism",
            "antisemitism in politics",
            "antisemitism in media",
            "antisemitism in religion",
            "addressing or combating antisemitism",
            "causes of antisemitism",
            "impact of antisemitism",
            "antisemitic incidents",
            "antisemitic theories and ideologies"
        ]

    print(f"Running zero-shot classification on column: {column_name}")
    print(f"Using labels: {labels}")

    # Create a copy of the dataframe to avoid modifying the original
    result_df = df.copy()

    # Set subject type
    if subject_type is None:
        subject_type = column_name.lower()

    # Prepare directory for saving results
    save_dir = None
    if save_results:
        # The output root comes from the SUBJECT_MAIN_DIR environment variable,
        # falling back to a relative "analyze_dataset" folder
        subject_main_dir = os.environ.get('SUBJECT_MAIN_DIR', 'analyze_dataset')
        save_dir = os.path.join(subject_main_dir, subject_type, "zero_shot_analysis")
        os.makedirs(save_dir, exist_ok=True)
        print(f"Results will be saved to: {save_dir}")

    # Filter out missing values and very short texts (not meaningful for analysis)
    valid_mask = result_df[column_name].notna()
    texts = result_df.loc[valid_mask, column_name].astype(str)
    texts = texts[texts.str.len() > 10]
    n_valid_texts = len(texts)

    # Create the result columns up front: one score column per label plus the
    # top label and its score
    for label in labels:
        result_df[f"zs_{label.replace(' ', '_')}"] = np.nan
    # The top-label column holds strings, so it must be object dtype; a float
    # NaN column would reject (or warn on) string assignment in recent pandas
    result_df["zs_top_label"] = np.nan
    result_df["zs_top_label"] = result_df["zs_top_label"].astype(object)
    result_df["zs_top_score"] = np.nan

    if texts.empty:
        # Nothing to classify: skip the expensive model load and the plots
        print("No valid texts to classify; returning dataframe with empty classification columns")
        return result_df

    # Limit the number of samples if specified
    if max_samples is not None and n_valid_texts > max_samples:
        sample_indices = np.random.choice(texts.index, max_samples, replace=False)
        texts = texts.loc[sample_indices]
        print(f"Analyzing {max_samples} random samples out of {n_valid_texts} valid texts")
    else:
        print(f"Analyzing all {n_valid_texts} valid texts")

    # Initialize the zero-shot classification pipeline
    print(f"Loading model: {model_name}")
    device = 0 if torch.cuda.is_available() else -1
    print(f"Device set to use cuda:{device}" if device >= 0 else "Device set to use CPU")

    # Load model and tokenizer separately
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Create the pipeline
    classifier = pipeline("zero-shot-classification",
                          model=model,
                          tokenizer=tokenizer,
                          device=device)

    # Process texts in batches (batches only drive the progress display)
    all_results = []
    texts_list = texts.tolist()
    total_batches = (len(texts_list) + batch_size - 1) // batch_size
    print(f"Processing {total_batches} batches...")

    start_time = time.time()
    for batch_num, start in enumerate(range(0, len(texts_list), batch_size), start=1):
        if batch_num > 1:
            # Estimate the remaining time from the batches finished so far
            elapsed = time.time() - start_time
            avg_time_per_batch = elapsed / (batch_num - 1)
            est_remaining_time = avg_time_per_batch * (total_batches - batch_num + 1)
            print(f"Classifying: batch {batch_num}/{total_batches} - "
                  f"{batch_num/total_batches*100:.1f}% complete - "
                  f"Est. remaining: {est_remaining_time:.1f}s", end='\r')

        for raw_text in texts_list[start:start + batch_size]:
            # One result per text, in order, keeps results aligned with texts.index
            all_results.append(_zs_classify_text(classifier, _zs_clean_text(raw_text), labels))

    print("\nClassification complete!")

    # Add classification results to the dataframe
    for idx, result in zip(texts.index, all_results):
        result_df.loc[idx, "zs_top_label"] = result["top_label"]
        result_df.loc[idx, "zs_top_score"] = result["top_score"]

        # Add individual scores for each label
        for label, score in zip(result["labels"], result["scores"]):
            col_name = f"zs_{label.replace(' ', '_')}"
            if col_name in result_df.columns:
                result_df.loc[idx, col_name] = score

    # Generate visualizations if save_results is True
    if save_results:
        _zs_plot_label_overview(result_df, column_name, top_n_labels, save_dir)

        # Trend analysis by decade (if decade column exists)
        if decade_column in result_df.columns:
            _zs_plot_decade_trends(result_df, column_name, decade_column, save_dir)

        # Label correlation heatmap, only if we have multiple labels
        score_cols = [col for col in result_df.columns if col.startswith("zs_") and col not in ["zs_top_label", "zs_top_score"]]
        if len(score_cols) > 1:
            _zs_plot_score_correlation(result_df, score_cols, column_name, save_dir)

        # Save the results to CSV
        result_df[["zs_top_label", "zs_top_score"] + score_cols + [column_name]].to_csv(
            os.path.join(save_dir, f"{subject_type}_zero_shot_analysis.csv"), index=False
        )

        print(f"Zero-shot analysis results saved to {save_dir}")

    return result_df

# Example usage:
# The zero-shot pipeline needs an NLI checkpoint (one whose config exposes an
# "entailment" label). A star-rating sentiment model such as
# "nlptown/bert-base-multilingual-uncased-sentiment" has no such label, so the
# scores it would produce here are meaningless.
title_results_df = run_zero_shot_analysis(
    df, 
    "Title", 
    model_name="facebook/bart-large-mnli",
)
# `results_df` keeps the result of the last (Abstract) run, as before
results_df = run_zero_shot_analysis(
    df, 
    "Abstract", 
    model_name="facebook/bart-large-mnli",
)

## Keyword analysis

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path
import numpy as np
from matplotlib.gridspec import GridSpec
from community import best_partition  # python-louvain package for community detection
import matplotlib.colors as mcolors

def visualize_keyword_network(df, 
                             keywords, 
                             title_column="Title", 
                             list_column=None, 
                             top_n=10, 
                             subject_main_dir="output", 
                             subject_type="networks",
                             filename_prefix="keyword_network"):
    """
    Create and visualize a network graph where keywords connect to most common entities
    
    Parameters:
    df (DataFrame): DataFrame containing the data
    keywords (list): List of keywords to filter titles
    title_column (str): Name of the column containing titles to filter
    list_column (str): Name of the column containing lists of string values (required)
    top_n (int): Number of top entities to include in the graph
    subject_main_dir (str/Path): Main directory for saving visualizations
    subject_type (str): Subfolder name for this analysis type
    filename_prefix (str): Prefix for saved files
    
    Returns:
    nx.Graph: The created network graph, or None when no title matches a keyword
    
    Raises:
    ValueError: If list_column is not specified
    """
    # Validate arguments before doing any work
    if list_column is None:
        raise ValueError("list_column must be specified")
    
    # Step 1: Filter dataframe for titles containing at least one keyword
    filtered_df = filter_by_keywords(df, keywords, title_column)
    
    if filtered_df.empty:
        print(f"No rows found with titles containing the specified keywords: {keywords}")
        return None
        
    # Step 2: Get top entities from the list column
    top_entities = get_top_entities(filtered_df, list_column, top_n)
    
    # Step 3: Create network graph
    # create_network_graph takes the title from the first column of the frame it
    # receives, so hand it a frame whose first column is title_column; otherwise
    # keywords would be matched against whatever column happens to come first.
    # dict.fromkeys drops the duplicate if both names refer to the same column.
    graph_columns = list(dict.fromkeys([title_column, list_column]))
    G = create_network_graph(filtered_df[graph_columns], keywords, list_column, top_entities)
    
    # Step 4: Visualize and save the graph
    visualize_graph(G, keywords, top_entities, 
                   f"{filename_prefix}_base", 
                   subject_main_dir, 
                   subject_type)
    
    # Step 5: Analyze and visualize communities
    visualize_communities(G, keywords, top_entities, 
                         f"{filename_prefix}_communities", 
                         subject_main_dir, 
                         subject_type)
    
    # Step 6: Find and visualize cliques
    visualize_cliques(G, keywords, top_entities, 
                     f"{filename_prefix}_cliques", 
                     subject_main_dir, 
                     subject_type)
    
    return G

def filter_by_keywords(df, keywords, title_column):
    """
    Filter dataframe for rows where title contains at least one keyword
    
    Matching is a case-insensitive literal substring test (keywords are not
    interpreted as regular expressions). Rows whose title is missing or not a
    string never match. An empty keyword list yields an empty dataframe.
    
    Parameters:
    df (DataFrame): DataFrame containing the data
    keywords (list): Keywords to look for
    title_column (str): Name of the column containing titles
    
    Returns:
    DataFrame: Copy of the matching rows
    """
    titles = df[title_column]
    
    # Start from "nothing matches" and OR in one mask per keyword
    combined_filter = pd.Series(False, index=df.index)
    for keyword in keywords:
        # na=False keeps the mask strictly boolean when titles are missing
        keyword_mask = titles.str.contains(keyword, case=False, regex=False, na=False)
        combined_filter = combined_filter | keyword_mask
        
    return df[combined_filter].copy()

def get_top_entities(df, list_column, top_n):
    """
    Get top n most common entities from a column of lists
    
    Cells may hold real lists or their string representation (as produced when
    a list column is round-tripped through CSV). Cells that are neither, or
    whose text is not a valid Python literal, are skipped.
    
    Parameters:
    df (DataFrame): DataFrame containing the data
    list_column (str): Name of the column containing lists of values
    top_n (int): Number of entities to return
    
    Returns:
    list: The top_n most frequent entities, most frequent first
    """
    from ast import literal_eval
    
    counter = Counter()
    
    for entities_list in df[list_column]:
        if isinstance(entities_list, str):
            # literal_eval only parses literals; unlike eval it cannot execute
            # code that might be embedded in the cell text
            try:
                entities_list = literal_eval(entities_list)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
                
        # Skip if not a list
        if not isinstance(entities_list, list):
            continue
            
        counter.update(entities_list)
    
    return [item for item, _ in counter.most_common(top_n)]

def create_network_graph(df, keywords, list_column, top_entities, title_column=None):
    """
    Create a network graph connecting keywords with entities
    
    Every keyword and every top entity becomes a node (attribute "type" is
    'keyword' or 'entity'). For each row, every keyword found in the title
    (case-insensitive substring) is linked to every top entity listed in that
    row; the edge attribute "weight" counts these co-occurrences. Each node
    also gets its degree centrality in the attribute "centrality".
    
    Parameters:
    df (DataFrame): DataFrame containing titles and entity lists
    keywords (list): Keywords to look for in the titles
    list_column (str): Name of the column containing lists (or their string
        representation) of entities
    top_entities (list): Entities to include as nodes
    title_column (str): Name of the title column; when None the first column
        of df is used
    
    Returns:
    nx.Graph: The weighted keyword-entity graph
    """
    from ast import literal_eval
    
    G = nx.Graph()
    
    # Add keyword nodes
    for keyword in keywords:
        G.add_node(keyword, type='keyword')
    
    # Add entity nodes
    for entity in top_entities:
        G.add_node(entity, type='entity')
    
    if title_column is None and len(df.columns) > 0:
        title_column = df.columns[0]  # First column assumed to be title
    
    # Lower-case each keyword once instead of once per row
    lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]
    
    # Add edges based on co-occurrences
    for _, row in df.iterrows():
        title = row[title_column]
        # Missing titles come through as NaN (a float); they cannot match
        if not isinstance(title, str):
            continue
        
        # Get entities for this row
        entities = row[list_column]
        if isinstance(entities, str):
            # literal_eval only parses literals; unlike eval it cannot execute
            # code that might be embedded in the cell text
            try:
                entities = literal_eval(entities)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
                
        if not isinstance(entities, list):
            continue
            
        # Filter to only include top entities
        row_entities = [e for e in entities if e in top_entities]
        if not row_entities:
            continue
        
        # Connect keywords in title to entities
        title_lower = title.lower()
        for keyword, keyword_lower in lowered_keywords:
            if keyword_lower not in title_lower:
                continue
            for entity in row_entities:
                # Add edge or increment weight if exists
                if G.has_edge(keyword, entity):
                    G[keyword][entity]['weight'] += 1
                else:
                    G.add_edge(keyword, entity, weight=1)
    
    # Calculate node centrality
    centrality = nx.degree_centrality(G)
    nx.set_node_attributes(G, centrality, 'centrality')
    
    return G

def visualize_graph(G, keywords, top_entities, filename, subject_main_dir, subject_type):
    """
    Visualize and save the network graph
    
    Keywords are drawn as red circles and entities as blue squares, sized by
    the "centrality" node attribute. Edge width and opacity grow with the
    "weight" edge attribute relative to the heaviest edge.
    
    Parameters:
    G (nx.Graph): Graph with "centrality" node and "weight" edge attributes
    keywords (list): Keyword node names
    top_entities (list): Entity node names
    filename (str): File name (without extension) for the saved image
    subject_main_dir (str/Path): Main directory for saving visualizations
    subject_type (str): Subfolder name for this analysis type
    """
    plt.figure(figsize=(14, 10))
    
    # Set node positions using spring layout (fixed seed for a stable picture)
    pos = nx.spring_layout(G, k=0.3, seed=42)
    
    centrality = nx.get_node_attributes(G, 'centrality')
    
    # Prepare node lists by type
    keyword_set = set(keywords)
    entity_set = set(top_entities)
    keyword_nodes = [node for node in G.nodes() if node in keyword_set]
    entity_nodes = [node for node in G.nodes() if node in entity_set]
    
    # Node sizes based on centrality (scaled differently for each type)
    keyword_sizes = [centrality.get(node, 0.1) * 3000 for node in keyword_nodes]
    entity_sizes = [centrality.get(node, 0.1) * 2000 for node in entity_nodes]
    
    # Heaviest edge, used to normalise width and opacity (1 for an edgeless graph)
    max_weight = max((data['weight'] for _, _, data in G.edges(data=True)), default=1)
    
    # Draw nodes
    nx.draw_networkx_nodes(G, pos, nodelist=keyword_nodes, 
                          node_size=keyword_sizes, 
                          node_color='red', 
                          node_shape='o', 
                          alpha=0.8)
    
    nx.draw_networkx_nodes(G, pos, nodelist=entity_nodes, 
                          node_size=entity_sizes, 
                          node_color='blue', 
                          node_shape='s', 
                          alpha=0.6)
    
    # Draw edges one by one so each can have its own width and opacity
    for (u, v, data) in G.edges(data=True):
        relative_weight = data['weight'] / max_weight
        nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], 
                              width=2 * relative_weight, 
                              alpha=0.3 + 0.7 * relative_weight, 
                              edge_color='gray')
    
    # Draw labels with varying sizes
    keyword_labels = {node: node for node in keyword_nodes}
    entity_labels = {node: node for node in entity_nodes}
    
    nx.draw_networkx_labels(G, pos, labels=keyword_labels, font_size=12, font_weight='bold')
    nx.draw_networkx_labels(G, pos, labels=entity_labels, font_size=10)
    
    # Add legend via empty proxy artists: plotting real points (e.g. at 0, 0)
    # would put stray markers in the middle of the network drawing
    plt.plot([], [], 'ro', markersize=10, label='Keywords')
    plt.plot([], [], 'bs', markersize=10, label='Entities')
    plt.legend(loc='upper right')
    
    plt.title(f"Network of Keywords and Top {len(top_entities)} Entities")
    plt.axis('off')
    
    # Save the plot
    save_plot(plt, f"{filename}.png", subject_main_dir, subject_type)

def visualize_communities(G, keywords, top_entities, filename, subject_main_dir, subject_type):
    """
    Detect and visualize communities in the network
    
    Communities are found with the Louvain algorithm. Two plots are saved:
    the network coloured by community ("<filename>.png") and a stacked bar
    chart of keyword/entity counts per community
    ("<filename>_composition.png"). Nothing is drawn when at most one
    community is detected.
    
    Parameters:
    G (nx.Graph): Graph with "centrality" node and "weight" edge attributes
    keywords (list): Keyword node names
    top_entities (list): Entity node names
    filename (str): File name (without extension) for the saved images
    subject_main_dir (str/Path): Main directory for saving visualizations
    subject_type (str): Subfolder name for this analysis type
    """
    # Detect communities using Louvain algorithm
    partition = best_partition(G)
    
    # Group nodes by community
    communities = {}
    for node, community_id in partition.items():
        communities.setdefault(community_id, []).append(node)
    
    num_communities = len(communities)
    
    if num_communities <= 1:
        print("Only one community detected, skipping community visualization")
        return
    
    # One colour per community, looked up by community id so that nodes and
    # intra-community edges are guaranteed to share the same colour
    cmap = plt.get_cmap('tab20', num_communities)
    community_colors = {community_id: cmap(i) for i, community_id in enumerate(communities)}
    
    plt.figure(figsize=(16, 12))
    
    # Set node positions using spring layout
    pos = nx.spring_layout(G, k=0.3, seed=42)
    
    # Get centrality for node sizes
    centrality = nx.get_node_attributes(G, 'centrality')
    
    # Draw nodes colored by community
    for community_id, nodes in communities.items():
        nx.draw_networkx_nodes(G, pos, nodelist=nodes, 
                              node_size=[centrality.get(node, 0.1) * 2500 for node in nodes],
                              node_color=[community_colors[community_id]], 
                              label=f'Community {community_id}')
    
    # Heaviest edge, computed once (1 for an edgeless graph)
    max_weight = max((data['weight'] for _, _, data in G.edges(data=True)), default=1)
    
    # Draw edges with varying thickness
    for (u, v, data) in G.edges(data=True):
        relative_weight = data['weight'] / max_weight
        
        # If nodes are in the same community, use community color
        if partition[u] == partition[v]:
            edge_color = community_colors[partition[u]]
        else:
            edge_color = 'gray'
            
        nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], 
                              width=1.5 * relative_weight, 
                              alpha=0.2 + 0.6 * relative_weight, 
                              edge_color=edge_color)
    
    # Draw labels with appropriate sizes based on node type
    keyword_labels = {node: node for node in keywords if node in G.nodes()}
    entity_labels = {node: node for node in top_entities if node in G.nodes()}
    
    nx.draw_networkx_labels(G, pos, labels=keyword_labels, font_size=12, font_weight='bold')
    nx.draw_networkx_labels(G, pos, labels=entity_labels, font_size=10)
    
    plt.title(f"Community Structure in Keyword-Entity Network (Detected: {num_communities} communities)")
    plt.axis('off')
    plt.legend(scatterpoints=1, loc='upper right')
    
    # Save the plot
    save_plot(plt, f"{filename}.png", subject_main_dir, subject_type)
    
    # Create community composition analysis plot
    plt.figure(figsize=(12, 8))
    
    # Count keyword and entity nodes in each community
    keyword_counts = [sum(1 for node in nodes if node in keywords)
                      for nodes in communities.values()]
    entity_counts = [sum(1 for node in nodes if node in top_entities)
                     for nodes in communities.values()]
    
    # Create a stacked bar chart
    community_ids = list(communities.keys())
    
    plt.bar(community_ids, keyword_counts, label='Keywords')
    plt.bar(community_ids, entity_counts, bottom=keyword_counts, label='Entities')
    
    plt.xlabel('Community ID')
    plt.ylabel('Number of Nodes')
    plt.title('Composition of Communities')
    plt.legend()
    plt.xticks(community_ids)
    plt.tight_layout()
    
    # Save the composition plot
    save_plot(plt, f"{filename}_composition.png", subject_main_dir, subject_type)

def visualize_cliques(G, keywords, top_entities, filename, subject_main_dir, subject_type):
    """
    Find the largest maximal cliques in the network and visualize them.

    Two figures are built and handed to save_plot: a grid with one subplot per
    clique ("{filename}.png") and a stacked bar chart with the number of
    keyword and entity nodes per clique ("{filename}_composition.png").
    Nothing is plotted when the graph has no clique with at least 3 nodes.

    Parameters:
    G (networkx.Graph): Graph to search; every edge must carry a 'weight' attribute
    keywords (container): Nodes drawn as red circles and counted as keywords
    top_entities (container): Nodes drawn as blue squares and counted as entities
    filename (str): Base file name (without extension) for the saved plots
    subject_main_dir (str/Path): Passed through to save_plot
    subject_type (str): Passed through to save_plot

    Returns:
    None
    """
    max_cliques_to_plot = 6

    # Keep only maximal cliques with at least 3 nodes
    significant_cliques = [c for c in nx.find_cliques(G) if len(c) >= 3]

    if not significant_cliques:
        print("No significant cliques (size >= 3) found in the network")
        return

    # Largest cliques first; only the top few are visualized
    significant_cliques.sort(key=len, reverse=True)
    cliques_to_plot = significant_cliques[:max_cliques_to_plot]
    num_cliques = len(cliques_to_plot)

    plt.figure(figsize=(20, 10))

    # One row for up to 3 cliques, otherwise a fixed 2x3 grid
    if num_cliques <= 3:
        rows, cols = 1, num_cliques
    else:
        rows, cols = 2, 3

    # Layout is computed once on the full graph so a node keeps the same
    # position in every subplot
    pos = nx.spring_layout(G, k=0.3, seed=42)

    # NOTE: an unused colormap used to be created here with plt.cm.get_cmap,
    # which no longer exists in Matplotlib >= 3.9; it was dropped because no
    # drawing call ever read it.

    for i, clique in enumerate(cliques_to_plot):
        plt.subplot(rows, cols, i + 1)

        subgraph = G.subgraph(clique)

        # Degree centrality inside the clique drives the marker size
        centrality = nx.degree_centrality(subgraph)

        # Nodes that are in neither collection get a label but no marker
        keyword_nodes = [node for node in clique if node in keywords]
        entity_nodes = [node for node in clique if node in top_entities]

        nx.draw_networkx_nodes(subgraph, pos, nodelist=keyword_nodes,
                               node_size=[centrality.get(node, 0.1) * 2000 for node in keyword_nodes],
                               node_color='red',
                               node_shape='o',
                               alpha=0.8)

        nx.draw_networkx_nodes(subgraph, pos, nodelist=entity_nodes,
                               node_size=[centrality.get(node, 0.1) * 1500 for node in entity_nodes],
                               node_color='blue',
                               node_shape='s',
                               alpha=0.6)

        # Edge width/opacity are scaled by the heaviest edge of this clique;
        # fall back to 1 so all-zero weights do not divide by zero
        edge_weights = [data['weight'] for _, _, data in subgraph.edges(data=True)]
        max_weight = max(edge_weights, default=0) or 1

        for u, v, data in subgraph.edges(data=True):
            width = data['weight'] * 2 / max_weight
            alpha = 0.4 + (0.6 * data['weight'] / max_weight)
            nx.draw_networkx_edges(subgraph, pos, edgelist=[(u, v)], width=width, alpha=alpha,
                                   edge_color='gray')

        nx.draw_networkx_labels(subgraph, pos, font_size=9, font_weight='bold')

        plt.title(f"Clique {i+1}: {len(clique)} nodes")
        plt.axis('off')

    plt.suptitle(f"Top {num_cliques} Cliques in the Network", fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Leave room for the suptitle

    # Save the plot
    save_plot(plt, f"{filename}.png", subject_main_dir, subject_type)

    # Second figure: how many keyword / entity nodes each plotted clique has
    plt.figure(figsize=(12, 8))

    clique_labels = [f"Clique {i+1}" for i in range(num_cliques)]
    keyword_counts = [sum(1 for node in clique if node in keywords) for clique in cliques_to_plot]
    entity_counts = [sum(1 for node in clique if node in top_entities) for clique in cliques_to_plot]

    # Stacked bars: entities sit on top of keywords
    plt.bar(clique_labels, keyword_counts, label='Keywords')
    plt.bar(clique_labels, entity_counts, bottom=keyword_counts, label='Entities')

    plt.xlabel('Clique')
    plt.ylabel('Number of Nodes')
    plt.title('Composition of Top Cliques')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the composition plot
    save_plot(plt, f"{filename}_composition.png", subject_main_dir, subject_type)

# Example usage: build the keyword network for the Title column, linking the
# keywords below to values taken from the "Topics" list column.

# Define keywords
keywords = ['antisemitism', 'antizionist', 'anti-zionism', 'jew', 'jews','jewish','holocaust', 'nazi', 'nazism']

# Visualize network
G = visualize_keyword_network(
    df=df,
    keywords=keywords,
    title_column="Title",
    list_column="Topics",  # Column containing lists of strings
    top_n=15,               # presumably the 15 most common "Topics" values are kept - verify against visualize_keyword_network
    subject_main_dir=subject_main_dir,
    subject_type="keyword_netword",  # NOTE(review): "netword" looks like a typo for "network"; left as-is because the string is passed on unchanged
    filename_prefix=None
)


# Titles Analysis

## Title general analysis

In [None]:
# General text analysis of the "Title" column. The call returns a results
# object plus a dataframe; the dataframe is reused below as `processed_df`.
results, processed_df=analyze_text_column(df, "Title", subject_main_dir,
                                          proc_df_out_path=filtered_titles_df_path)

## Title Entity recognition 

In [None]:
# Entity analysis on the "Title" column of `processed_df` from the cell above.
# proc_df_out_path is the same filtered-titles path given to analyze_text_column.
title_entity_analysis = analyze_entities_in_column(
    df=processed_df,
    column_name='Title',
    subject_main_dir=subject_main_dir, 
    proc_df_out_path=filtered_titles_df_path
)

## Title Bert-Topic analysis

In [None]:
# Rebuilds `processed_df` from the raw `df`, replacing the dataframe returned
# by analyze_text_column above. `lstat` is the second return value of
# preprocess_dataframe_text_col (its contents are not visible from this cell).
processed_df, lstat=preprocess_dataframe_text_col(df, text_column="Title")

In [None]:
# Topic modeling on the "Title" column of `processed_df`; any return value is discarded.
perform_topic_modeling(processed_df, "Title")

In [None]:
# BERTopic run on the "Title" column of `processed_df`:
# 15 topics requested, at least 10 documents per topic, top 8 topics visualized,
# with the "decade" column used for the trend analysis.
bertopic_results = run_bertopic_analysis(
    df=processed_df,
    column_name="Title",
    decade_column="decade",
    subject_main_dir=subject_main_dir,
    language="english",
    nr_topics=15,
    min_topic_size=10,
    top_n_topics=8,
)

# Report the outcome; a result containing an "error" key means the run failed
if "error" not in bertopic_results:
    print(f"Analysis complete with {bertopic_results['topic_count']} topics")
    print("Created visualizations:")
    for viz in bertopic_results['visualizations']:
        print(f"- {viz}")

    # Keep the dataframe with topic assignments around for later cells
    topic_df = bertopic_results['processed_df']
    print(f"Documents with topic assignments: {len(topic_df)}")
else:
    print(f"Error: {bertopic_results['error']}")

In [None]:
"""
Example of analyzing document titles with BERTopic using decade-based temporal analysis
"""

import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import os


def analyze_column_by_decade(df, column_name='Title', output_dir='results/title_analysis'):
    """
    Run the comprehensive BERTopic analysis on one text column, with decade-based trends

    Parameters:
    df (pandas.DataFrame): Dataframe holding the documents to analyze
    column_name (str): Name of the text column to model
    output_dir (str): Directory handed to the analysis as its output location

    Returns:
    dict: The result of run_comprehensive_bertopic_analysis (contains an
          'error' key when the analysis failed)
    """
    print(f"Starting title analysis with {len(df)} documents...")

    # Fixed settings for this analysis: 'decade' drives the temporal view and
    # 'Type' the per-class view; the topic count is left to BERTopic ('auto')
    analysis_options = dict(
        df=df,
        column_name=column_name,
        time_column='decade',
        class_column='Type',
        subject_main_dir=output_dir,
        language='english',
        nr_topics='auto',
        min_topic_size=15,
        top_n_topics=15,
        visualize_documents=True,
        sample_documents=1000,
    )
    outcome = run_comprehensive_bertopic_analysis(**analysis_options)

    if "error" not in outcome:
        print(f"Analysis complete! Found {outcome['topic_count']} topics.")
        print(f"Summary visualization available at: {outcome['summary']}")
    else:
        print(f"Error during analysis: {outcome['error']}")

    # The result is returned in both cases so callers can inspect it
    return outcome

def analyze_multiple_columns(df, columns=('Title', 'Abstract'), output_dir='results/topic_analysis'):
    """
    Analyze multiple text columns with BERTopic

    The dataframe passed in is analyzed directly. (An earlier version ignored
    `df` and tried to reload the data from an undefined `df_path`, which
    raised NameError on every call.)

    Parameters:
    df (pandas.DataFrame): Dataframe containing the text columns to analyze
    columns (iterable of str): Column names to analyze, one analysis per column
    output_dir (str): Base directory; each column is written to
                      "{output_dir}/{column name in lower case}"

    Returns:
    dict: Maps each column name to the result of
          run_comprehensive_bertopic_analysis for that column
    """
    # Store results for each column
    all_results = {}

    for column in columns:
        print(f"\n--- Starting analysis of {column} column ---")

        # Each column gets its own output directory
        column_output_dir = f"{output_dir}/{column.lower()}"

        column_result = run_comprehensive_bertopic_analysis(
            df=df,
            column_name=column,           # Analyze this specific column
            time_column='decade',         # Use decade column for temporal analysis
            class_column='Type',          # Document type column for per-class analysis
            subject_main_dir=column_output_dir,
            language='english',
            nr_topics='auto',
            min_topic_size=15,
            top_n_topics=15
        )
        all_results[column] = column_result

        # A failed column is reported but does not stop the remaining ones
        if "error" in column_result:
            print(f"Error analyzing {column}: {column_result['error']}")
        else:
            print(f"Analysis of {column} complete! Found {column_result['topic_count']} topics.")

    return all_results

if __name__ == "__main__":
    # NOTE(review): hard-coded relative path; the setup cell also defines
    # filtered_abstracts_df_path under dataset_dir - confirm which one is intended.
    abs_processed_df = pd.read_csv('datasets/filtered_abstracts_dataset.csv')
    # Analyze the Abstract column only
    abs_results = analyze_column_by_decade(abs_processed_df, column_name='Abstract')
    
    # Or analyze multiple columns (the function takes a dataframe, not a path)
    # all_results = analyze_multiple_columns(abs_processed_df, columns=['Title', 'Abstract'])

## Title sentiment analysis

In [None]:
# Sentiment analysis of the "Title" column on the unprocessed `df`, with an
# explicitly chosen model. The bare `sentiment_df` on the last line makes the
# notebook display the result.
sentiment_df = run_sentiment_analysis(df, "Title", max_samples=1000, model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment")
sentiment_df

# Abstract analysis

## General analysis

In [None]:
# General text analysis of the "Abstract" column. Unlike the Title call, no
# proc_df_out_path is passed here. Overwrites `results` from the Title section.
results, abs_processed_df=analyze_text_column(df, "Abstract", subject_main_dir)

## Abstract Entity recognition

In [None]:
# Entity analysis on the "Abstract" column of `abs_processed_df`.
# NOTE(review): the result is stored in `title_entity_analysis`, which replaces
# the Title result assigned earlier in the notebook - the name looks like a
# copy-paste leftover; confirm before renaming.
title_entity_analysis = analyze_entities_in_column(
    df=abs_processed_df,
    column_name='Abstract',
    subject_main_dir=subject_main_dir, 
    proc_df_out_path=filtered_abstracts_df_path
)

In [None]:
# BERTopic run on the "Abstract" column of `abs_processed_df`:
# 15 topics requested, at least 10 documents per topic, top 8 topics visualized,
# with the "decade" column used for the trend analysis.
bertopic_results = run_bertopic_analysis(
    df=abs_processed_df,
    column_name="Abstract",
    decade_column="decade",
    subject_main_dir=subject_main_dir,
    language="english",
    nr_topics=15,
    min_topic_size=10,
    top_n_topics=8,
)

# Report the outcome; a result containing an "error" key means the run failed
if "error" not in bertopic_results:
    print(f"Analysis complete with {bertopic_results['topic_count']} topics")
    print("Created visualizations:")
    for viz in bertopic_results['visualizations']:
        print(f"- {viz}")

    # Keep the dataframe with topic assignments around for later cells
    topic_df = bertopic_results['processed_df']
    print(f"Documents with topic assignments: {len(topic_df)}")
else:
    print(f"Error: {bertopic_results['error']}")

## Abstract BERTopic

In [None]:
# BERTopic run on the "Abstract" column.
# The dataframe must be the one produced for abstracts (`abs_processed_df`);
# `processed_df` was last rebuilt for the "Title" column, so using it here
# would model abstracts on rows selected/preprocessed for titles.
bertopic_results = run_bertopic_analysis(
    df=abs_processed_df,            # Dataframe processed for the Abstract column
    column_name="Abstract",         # Text column to analyze
    decade_column="decade",         # Column with decade information (for trend analysis)
    subject_main_dir=subject_main_dir, # Main directory for saving results
    language="english",             # Text language
    nr_topics=15,                   # Number of topics (or "auto")
    min_topic_size=10,              # Min documents per topic
    top_n_topics=8                  # Number of top topics to visualize
)

# Check for errors
if "error" in bertopic_results:
    print(f"Error: {bertopic_results['error']}")
else:
    print(f"Analysis complete with {bertopic_results['topic_count']} topics")
    print("Created visualizations:")
    for viz in bertopic_results['visualizations']:
        print(f"- {viz}")

    # Access the processed dataframe with topic assignments
    topic_df = bertopic_results['processed_df']
    print(f"Documents with topic assignments: {len(topic_df)}")

## Abstract sentiment analysis

In [None]:
# Sentiment analysis of the "Abstract" column on the unprocessed `df`. No
# model_name is passed here (the Title cell passes one explicitly), so
# presumably run_sentiment_analysis uses its own default - verify.
# Overwrites `sentiment_df` from the Title section; the bare name displays it.
sentiment_df = run_sentiment_analysis(df, "Abstract", max_samples=1000)
sentiment_df

# Categorical column analysis

## Categorical utils

In [None]:
def preprocess_categorical(df, column_name):
    """
    Drop the rows that have no value in one column and report how many were removed.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data (left unmodified)
    column_name (str): The name of the categorical column to process

    Returns:
    pandas.DataFrame: A new dataframe without the rows where column_name is NA
    """
    n_before = len(df)

    # Work on a copy so the caller's dataframe is never touched
    cleaned = df.copy().dropna(subset=[column_name])

    n_after = len(cleaned)
    n_removed = n_before - n_after

    # Guard against an empty input dataframe
    if n_before > 0:
        pct_removed = (n_removed / n_before) * 100
    else:
        pct_removed = 0

    report_lines = (
        f"Preprocessing statistics for column '{column_name}':",
        f"- Original record count: {n_before}",
        f"- Records after dropping NA values: {n_after}",
        f"- Dropped records: {n_removed} ({pct_removed:.2f}%)",
    )
    for line in report_lines:
        print(line)

    return cleaned

## Linear trends over decades

In [None]:
def visualize_categorical_by_decade_percentage(df, column_name, decade_column, subject_main_dir, top_n=10, title_prefix="Distribution of"):
    """
    Visualize the percentage distribution of categorical values across decades.

    For every decade, the share of each of the top_n categories is plotted as
    a line. Values are real percentages (0-100) of all records of that decade
    that have a value in column_name; categories outside the top_n are grouped
    as 'Other', counted in the denominator, but not drawn.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze; its lower-cased
                       form is passed to save_plot as the subject type
    decade_column (str): Name of the column containing decade information
    subject_main_dir (str/Path): Main directory for all subject analyses
    top_n (int): Number of top categories to display (others will be grouped)
    title_prefix (str): Prefix for the plot title

    Returns:
    matplotlib.figure.Figure: The current figure after saving
    """
    subject_type = column_name.lower()

    # Preprocess data
    processed_df = preprocess_categorical(df, column_name)

    # Get value counts for the top N categories
    top_categories = processed_df[column_name].value_counts().nlargest(top_n).index.tolist()

    # Create a copy of the dataframe with 'Other' for categories not in top N
    plot_df = processed_df.copy()
    plot_df.loc[~plot_df[column_name].isin(top_categories), column_name] = 'Other'

    # Group by decade and category, then convert each decade row to percentages.
    # The factor 100 makes the values match the "Percentage" axis label
    # (previously 0-1 fractions were plotted).
    decade_category_counts = plot_df.groupby([decade_column, column_name]).size().unstack(fill_value=0)
    decade_percentages = decade_category_counts.div(decade_category_counts.sum(axis=1), axis=0) * 100

    # Create the plot
    plt.figure(figsize=(14, 8))

    # Plot each top category as a line; the grouped 'Other' column is skipped
    for category in decade_percentages.columns:
        if category in top_categories:
            plt.plot(decade_percentages.index, decade_percentages[category], marker='o', label=category)

    # Configure the plot
    plt.title(f"{title_prefix} {column_name} Over Time", fontsize=14)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel(f'Percentage of {column_name}', fontsize=12)
    plt.grid(True, alpha=0.3)
    # The legend is placed outside the axes; it is created before
    # tight_layout so the layout can reserve room for it
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

    # Save the plot
    filename = f"{column_name}_percentage_by_decade.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    return plt.gcf()

def visualize_categorical_by_decade_counts(df, column_name, decade_column, subject_main_dir, top_n=10, title_prefix="Frequency of"):
    """
    Plot, per decade, how many records fall into each of the most frequent categories.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze; its lower-cased
                       form is passed to save_plot as the subject type
    decade_column (str): Name of the column containing decade information
    subject_main_dir (str/Path): Main directory for all subject analyses
    top_n (int): Number of top categories to display (the rest is grouped as 'Other')
    title_prefix (str): Prefix for the plot title

    Returns:
    matplotlib.figure.Figure: The current figure after saving
    """
    subject_type = column_name.lower()

    # Rows without a value in column_name are dropped first
    cleaned = preprocess_categorical(df, column_name)
    top_categories = cleaned[column_name].value_counts().nlargest(top_n).index.tolist()

    # Everything outside the top N is relabelled 'Other' on a private copy
    plot_df = cleaned.copy()
    outside_top = ~plot_df[column_name].isin(top_categories)
    plot_df.loc[outside_top, column_name] = 'Other'

    # Rows: decades, columns: categories, cells: record counts
    counts_by_decade = plot_df.groupby([decade_column, column_name]).size().unstack(fill_value=0)

    plt.figure(figsize=(14, 8))

    # One line per top category; the grouped 'Other' column is not drawn
    drawn_categories = [name for name in counts_by_decade.columns if name in top_categories]
    for name in drawn_categories:
        plt.plot(counts_by_decade.index, counts_by_decade[name], marker='o', label=name)

    plt.title(f"{title_prefix} {column_name} Over Time", fontsize=14)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel(f'Number of Records with {column_name}', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    save_plot(plt, f"{column_name}_counts_by_decade.png", subject_main_dir, subject_type)

    return plt.gcf()

### Peaks analysis

In [None]:
def identify_peak_years(df, column_name, year_column, decade_column=None, focus_period=None):
    """
    Identify the years where each category in a column reaches its maximum value.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze
    year_column (str): Name of the column containing year information
    decade_column (str, optional): Accepted for compatibility; not used by this function
    focus_period (tuple, optional): Tuple of (start_year, end_year), both inclusive,
                                    to restrict the analysis to

    Returns:
    dict: Dictionary with categories as keys and, per category, 'peak_years'
          (all years tied for the maximum), 'peak_count' and 'total_records'
    pandas.DataFrame: Summary of the peaks sorted by 'Peak_Count' (descending).
                      It always has the columns 'Category', 'Peak_Year(s)',
                      'Peak_Count', 'Total_Records' and 'Percentage_at_Peak',
                      even when no category was found.
    """
    # Preprocess data
    processed_df = preprocess_categorical(df, column_name)

    # Filter by focus period if provided
    if focus_period:
        start_year, end_year = focus_period
        processed_df = processed_df[
            (processed_df[year_column] >= start_year) &
            (processed_df[year_column] <= end_year)
        ]
        print(f"Focusing on period: {start_year} to {end_year}")

    # Dictionary to store results
    peak_info = {}

    # A single groupby visits every category once instead of re-filtering the
    # whole dataframe per category; sort=False keeps first-appearance order
    for category, category_data in processed_df.groupby(column_name, sort=False):
        yearly_counts = category_data.groupby(year_column).size()

        # Empty when every row of the category lacks a year value
        if yearly_counts.empty:
            continue

        max_count = yearly_counts.max()
        peak_info[category] = {
            'peak_years': yearly_counts[yearly_counts == max_count].index.tolist(),
            'peak_count': max_count,
            'total_records': len(category_data)
        }

    # Explicit columns keep the summary well-formed (and sortable) when
    # peak_info is empty, e.g. for an empty focus period
    summary_columns = ['Category', 'Peak_Year(s)', 'Peak_Count', 'Total_Records', 'Percentage_at_Peak']
    summary_data = [
        {
            'Category': category,
            'Peak_Year(s)': ', '.join(map(str, info['peak_years'])),
            'Peak_Count': info['peak_count'],
            'Total_Records': info['total_records'],
            'Percentage_at_Peak': (info['peak_count'] / info['total_records']) * 100 if info['total_records'] > 0 else 0
        }
        for category, info in peak_info.items()
    ]

    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    summary_df = summary_df.sort_values('Peak_Count', ascending=False)

    return peak_info, summary_df

def visualize_peak_distribution(peak_info, year_column, subject_main_dir, subject_type, focus_period=None):
    """
    Draw a bar chart of how many categories peak in each year.

    Parameters:
    peak_info (dict): Peak information as returned by identify_peak_years;
                      only the 'peak_years' entry of each value is read
    year_column (str): Name of the year column; used in the file name when
                       no focus period is given
    subject_main_dir (str/Path): Main directory for all subject analyses
    subject_type (str): Type of subject analysis, passed to save_plot
    focus_period (tuple, optional): Tuple of (start_year, end_year); added to
                                    the title and used in the file name

    Returns:
    matplotlib.figure.Figure: The current figure after saving
    """
    # Flatten the peak years of all categories; a category with tied peaks
    # contributes one entry per tied year
    peak_year_list = [year for info in peak_info.values() for year in info['peak_years']]
    year_counts = pd.Series(peak_year_list).value_counts().sort_index()

    plt.figure(figsize=(12, 6))
    bars = plt.bar(year_counts.index, year_counts.values, color='steelblue', alpha=0.7)

    # Write the count just above every bar
    for bar in bars:
        bar_height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., bar_height + 0.1,
                 f'{int(bar_height)}', ha='center', va='bottom')

    # Title and file name both depend on whether a focus period was given
    title = "Distribution of Peak Years Across Categories"
    filename = f"peak_years_distribution_{year_column}.png"
    if focus_period:
        title += f" ({focus_period[0]}-{focus_period[1]})"
        filename = f"peak_years_distribution_{focus_period[0]}_{focus_period[1]}.png"

    plt.title(title, fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Number of Categories with Peak', fontsize=12)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()

    save_plot(plt, filename, subject_main_dir, subject_type)

    return plt.gcf()

In [None]:
def analyze_categorical_peaks(df, column_name, year_column, subject_main_dir, focus_period=None):
    """
    Analyze when categories reach their peak values and visualize the results.

    Prints a short report (top 10 categories by peak count), plots the
    distribution of peak years and writes the full summary to
    "<subject_main_dir>/<column_name lower-cased>/<column_name>_peak_analysis.csv".

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze; its lower-cased
                       form is used as the subject type / output sub-directory
    year_column (str): Name of the column containing year information
    subject_main_dir (str/Path): Main directory for all subject analyses
    focus_period (tuple, optional): Tuple of (start_year, end_year) to focus analysis

    Returns:
    pandas.DataFrame: Summary of peak information
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported Path first
    from pathlib import Path

    subject_type = column_name.lower()

    # Identify peaks for each category
    peak_info, summary_df = identify_peak_years(
        df,
        column_name,
        year_column,
        focus_period=focus_period
    )

    # Print summary information
    print(f"\nPeak Analysis for {column_name}:")
    print(f"Total categories analyzed: {len(peak_info)}")

    if focus_period:
        print(f"Focus period: {focus_period[0]} to {focus_period[1]}")

    # Display the top categories by peak count
    print("\nTop categories by peak count:")
    display_df = summary_df.head(10)  # Show top 10
    print(display_df.to_string(index=False))

    # Visualize the distribution of peak years
    visualize_peak_distribution(
        peak_info,
        year_column,
        subject_main_dir,
        subject_type,
        focus_period
    )

    # Save the summary to CSV; make sure the target directory exists instead
    # of relying on an earlier step having created it
    csv_filename = f"{column_name}_peak_analysis.csv"
    csv_path = Path(subject_main_dir) / subject_type / csv_filename
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    summary_df.to_csv(csv_path, index=False)
    print(f"\nSummary saved to: {csv_path}")

    return summary_df

#### Peak analysis - top

In [None]:
def identify_peak_years_top_categories(df, column_name, year_column, top_n=10, focus_period=None):
    """
    Identify the years where the top N categories in a column reach their maximum values.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze
    year_column (str): Name of the column containing year information
    top_n (int): Number of top categories to analyze (ranked after the focus
                 period filter has been applied)
    focus_period (tuple, optional): Tuple of (start_year, end_year), both inclusive,
                                    to focus analysis

    Returns:
    dict: Dictionary with categories as keys and, per category, 'peak_years',
          'peak_count', 'yearly_counts' (records per year) and 'total_records'
    pandas.DataFrame: Summary of the peaks sorted by 'Peak_Count' (descending);
                      its first column is named after column_name. The columns
                      are present even when no category was found.
    list: List of top categories analyzed
    """
    # Preprocess data
    processed_df = preprocess_categorical(df, column_name)

    # Filter by focus period if provided
    if focus_period:
        start_year, end_year = focus_period
        processed_df = processed_df[
            (processed_df[year_column] >= start_year) &
            (processed_df[year_column] <= end_year)
        ]
        print(f"Focusing on period: {start_year} to {end_year}")

    # Get the top N categories
    top_categories = processed_df[column_name].value_counts().nlargest(top_n).index.tolist()

    # Dictionary to store results
    peak_info = {}

    # For each top category, find the year(s) with maximum count
    for category in top_categories:
        category_data = processed_df[processed_df[column_name] == category]
        yearly_counts = category_data.groupby(year_column).size()

        # Empty when every row of the category lacks a year value
        if yearly_counts.empty:
            continue

        max_count = yearly_counts.max()
        peak_info[category] = {
            'peak_years': yearly_counts[yearly_counts == max_count].index.tolist(),
            'peak_count': max_count,
            'yearly_counts': yearly_counts,
            'total_records': len(category_data)
        }

    # Explicit columns keep the summary well-formed (and sortable) when
    # peak_info is empty, e.g. for an empty focus period
    summary_columns = [f'{column_name}', 'Peak_Year(s)', 'Peak_Count', 'Total_Records', 'Percentage_at_Peak']
    summary_data = [
        {
            f'{column_name}': category,  # Use column_name as the column header
            'Peak_Year(s)': ', '.join(map(str, info['peak_years'])),
            'Peak_Count': info['peak_count'],
            'Total_Records': info['total_records'],
            'Percentage_at_Peak': (info['peak_count'] / info['total_records']) * 100 if info['total_records'] > 0 else 0
        }
        for category, info in peak_info.items()
    ]

    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    summary_df = summary_df.sort_values('Peak_Count', ascending=False)

    return peak_info, summary_df, top_categories

def visualize_peak_distribution_top_categories(peak_info, column_name, year_column, subject_main_dir, focus_period=None):
    """
    Draw a bar chart of how many of the top categories peak in each year.

    Parameters:
    peak_info (dict): Peak information as returned by identify_peak_years_top_categories;
                      only the 'peak_years' entry of each value is read
    column_name (str): Name of the categorical column analyzed; used for the
                       labels, the file name and (lower-cased) as subject type
    year_column (str): Name of the year column (not used by the function body)
    subject_main_dir (str/Path): Main directory for all subject analyses
    focus_period (tuple, optional): Tuple of (start_year, end_year); added to
                                    the title and to the file name

    Returns:
    matplotlib.figure.Figure: The current figure after saving
    """
    subject_type = column_name.lower()

    # Upper-case only the first character; the rest of the name is kept as-is
    display_name = column_name[0].upper() + column_name[1:]

    # Flatten the peak years of all categories; tied peaks count once per year
    peak_year_list = [year for info in peak_info.values() for year in info['peak_years']]
    year_counts = pd.Series(peak_year_list).value_counts().sort_index()

    plt.figure(figsize=(12, 6))
    bars = plt.bar(year_counts.index, year_counts.values, color='steelblue', alpha=0.7)

    # Write the count just above every bar
    for bar in bars:
        bar_height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., bar_height + 0.1,
                 f'{int(bar_height)}', ha='center', va='bottom')

    # Title and file name both depend on whether a focus period was given
    title = f"Distribution of Peak Years Across Top {display_name}"
    filename = f"peak_years_distribution_top_{column_name}.png"
    if focus_period:
        title += f" ({focus_period[0]}-{focus_period[1]})"
        filename = f"peak_years_distribution_top_{column_name}_{focus_period[0]}_{focus_period[1]}.png"

    plt.title(title, fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel(f'Number of {display_name} with Peak', fontsize=12)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()

    save_plot(plt, filename, subject_main_dir, subject_type)

    return plt.gcf()

def visualize_peak_trends_top_categories(peak_info, column_name, year_column, subject_main_dir, plot_type='counts', focus_period=None):
    """
    Visualize trends over time for top categories with their peak years highlighted.

    One line is drawn per category in peak_info and each of that category's
    peak years is circled in red. In 'percentage' mode a value is the
    category's share of the yearly total summed over the categories present in
    peak_info only (not over the whole dataset).

    Parameters:
    peak_info (dict): Dictionary with peak information from identify_peak_years_top_categories.
        Every value must provide 'yearly_counts' (object exposing .index and
        .values, indexed by year) and 'peak_years' (iterable of years)
    column_name (str): Name of the categorical column analyzed
    year_column (str): Name of the column containing year information
        (not used by this function; kept so the signature matches its siblings)
    subject_main_dir (str/Path): Main directory for all subject analyses
    plot_type (str): Type of plot - 'counts' or 'percentage'
    focus_period (tuple, optional): Tuple of (start_year, end_year) of focus period

    Returns:
    matplotlib.figure.Figure: The created figure

    Raises:
    ValueError: If plot_type is neither 'counts' nor 'percentage'
    """
    # Validate up front: previously an unknown plot_type fell through to the
    # percentage branch and failed with a NameError after a figure was created.
    valid_plot_types = ('counts', 'percentage')
    if plot_type not in valid_plot_types:
        raise ValueError(
            f"plot_type must be one of {valid_plot_types}, got {plot_type!r}"
        )
    show_counts = plot_type == 'counts'

    # Set subject_type based on column_name
    subject_type = column_name.lower()

    # Capital case the first letter for display
    display_name = column_name[0].upper() + column_name[1:]

    # Keep a handle on the figure so the function returns the figure it drew on
    fig = plt.figure(figsize=(14, 8))

    # If plotting percentages, we need total counts per year across categories
    total_by_year = {}
    if not show_counts:
        for info in peak_info.values():
            yearly_counts = info['yearly_counts']
            for year, count in zip(yearly_counts.index.tolist(), yearly_counts.values):
                total_by_year[year] = total_by_year.get(year, 0) + count

    # Plot each category
    for category, info in peak_info.items():
        yearly_counts = info['yearly_counts']
        years = yearly_counts.index.tolist()

        if show_counts:
            values = list(yearly_counts.values)
        else:
            values = [
                count / total_by_year[year] * 100 if total_by_year[year] > 0 else 0
                for year, count in zip(years, yearly_counts.values)
            ]
        plt.plot(years, values, marker='o', label=category)

        # Highlight peak point(s); a peak year without a plotted value is
        # skipped in both modes instead of raising in 'counts' mode only.
        value_by_year = dict(zip(years, values))
        for peak_year in info['peak_years']:
            if peak_year in value_by_year:
                plt.plot(peak_year, value_by_year[peak_year], 'o', markersize=10,
                         markerfacecolor='none', markeredgecolor='red', markeredgewidth=2)

    # Configure the plot
    if show_counts:
        y_label = f"Number of Records with {display_name}"
        title_prefix = f"Frequency of {display_name}"
    else:
        y_label = f"Percentage of {display_name}"
        title_prefix = f"Distribution of {display_name}"

    title = f"{title_prefix} Over Time with Peak Years"
    if focus_period:
        title += f" ({focus_period[0]}-{focus_period[1]})"

    plt.title(title, fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

    # Save the plot
    filename = f"{column_name}_peaks_{plot_type}.png"
    if focus_period:
        filename = f"{column_name}_peaks_{plot_type}_{focus_period[0]}_{focus_period[1]}.png"

    save_plot(plt, filename, subject_main_dir, subject_type)

    # NOTE(review): save_plot is defined elsewhere; if it closes the current
    # figure, plt.gcf() would hand back a new blank figure, so return the handle.
    return fig

def analyze_top_categorical_peaks(df, column_name, year_column, subject_main_dir, top_n=10, focus_period=None):
    """
    Analyze when top categories reach their peak values and visualize the results.

    Delegates peak detection to identify_peak_years_top_categories, prints the
    resulting summary, draws the peak-year distribution plus the count and
    percentage trend plots, and writes the summary to
    <subject_main_dir>/<column_name.lower()>/<column_name>_top_<top_n>_peak_analysis.csv.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze
    year_column (str): Name of the column containing year information
    subject_main_dir (str/Path): Main directory for all subject analyses
    top_n (int): Number of top categories to analyze
    focus_period (tuple, optional): Tuple of (start_year, end_year) to focus analysis

    Returns:
    pandas.DataFrame: Summary of peak information
    """
    # Set subject_type based on column_name
    subject_type = column_name.lower()

    # Capital case the first letter for display
    display_name = column_name[0].upper() + column_name[1:]

    # Identify peaks for the top categories (the list of top categories that
    # is returned as third element is not needed here)
    peak_info, summary_df, _ = identify_peak_years_top_categories(
        df,
        column_name,
        year_column,
        top_n=top_n,
        focus_period=focus_period
    )

    # Print summary information
    print(f"\nPeak Analysis for top {top_n} {display_name}:")
    print(f"Total categories analyzed: {len(peak_info)}")

    if focus_period:
        print(f"Focus period: {focus_period[0]} to {focus_period[1]}")

    # Display the summary
    print("\nCategories by peak count:")
    print(summary_df.to_string(index=False))

    # Visualize the distribution of peak years
    visualize_peak_distribution_top_categories(
        peak_info,
        column_name,
        year_column,
        subject_main_dir,
        focus_period
    )

    # Visualize trends with peaks highlighted, once per plot type
    for trend_plot_type in ('counts', 'percentage'):
        visualize_peak_trends_top_categories(
            peak_info,
            column_name,
            year_column,
            subject_main_dir,
            plot_type=trend_plot_type,
            focus_period=focus_period
        )

    # Save the summary to CSV
    csv_filename = f"{column_name}_top_{top_n}_peak_analysis.csv"
    csv_path = Path(subject_main_dir) / subject_type / csv_filename
    # Do not rely on the plotting helpers having created the folder already;
    # to_csv fails if the parent directory is missing.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    summary_df.to_csv(csv_path, index=False)
    print(f"\nSummary saved to: {csv_path}")

    return summary_df

In [None]:
def analyze_peak_content(df, column_name='Journal', column_to_analyze='Title', n=5, 
                        year_range=(2010, 2020), year_column='Publication Year', 
                        subject_main_dir=None, text_analysis=True, entity_analysis=True,
                        min_entity_count=2, max_entities=20, language='english'):
    """
    Analyze the content of records during peak years for top categories.

    For each of the n most frequent categories inside year_range, the year(s)
    with the highest record count are determined and the text column of the
    records from those years is handed to analyze_text_column and/or
    analyze_entities_in_column. Each category is analyzed under its own
    sub-folder ("<subject_type>/<category>") so that the outputs of different
    categories do not overwrite each other.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze (default: 'Journal')
    column_to_analyze (str): Name of the text column to analyze (default: 'Title')
    n (int): Number of top categories to analyze (default: 5)
    year_range (tuple): Inclusive range of years to focus on (default: 2010-2020)
    year_column (str): Name of the column containing year information
    subject_main_dir (str/Path): Main directory for all subject analyses
    text_analysis (bool): Whether to perform general text analysis
    entity_analysis (bool): Whether to perform entity analysis
    min_entity_count (int): Minimum count for an entity to be included in analysis
    max_entities (int): Maximum number of entities to display
    language (str): Language for text analysis

    Returns:
    tuple: (results, summary_df) where results is a dict with the keys
        'peak_info', 'text_analysis' and 'entity_analysis' (each keyed by
        category) and summary_df is a pandas.DataFrame with one row per
        category, sorted by peak count in descending order
    """
    # Setting up the subject type
    subject_type = f"{column_name.lower()}_peak_{column_to_analyze.lower()}"

    # Create the appropriate directory
    if subject_main_dir:
        analysis_dir = Path(subject_main_dir) / subject_type
        analysis_dir.mkdir(parents=True, exist_ok=True)

    # Filter by year range
    start_year, end_year = year_range
    year_filtered_df = df[(df[year_column] >= start_year) & (df[year_column] <= end_year)]

    print(f"Analyzing peak content for top {n} {column_name} between {start_year}-{end_year}")
    print(f"Original dataset size: {len(df)}")
    print(f"Dataset size after year filtering: {len(year_filtered_df)}")

    # Get the top N categories in this period
    top_categories = year_filtered_df[column_name].value_counts().nlargest(n).index.tolist()

    # Results dictionary
    results = {
        'peak_info': {},
        'text_analysis': {},
        'entity_analysis': {}
    }

    # Process each top category
    for category in top_categories:
        # Filter to just this category
        category_df = year_filtered_df[year_filtered_df[column_name] == category]

        if len(category_df) == 0:
            print(f"No data found for {category} in the specified year range.")
            continue

        print(f"\nAnalyzing peak content for {category} ({len(category_df)} records)")

        # Find the peak year(s) for this category (ties yield several years)
        yearly_counts = category_df.groupby(year_column).size()
        max_count = yearly_counts.max()
        peak_years = yearly_counts[yearly_counts == max_count].index.tolist()

        # Store peak information
        results['peak_info'][category] = {
            'total_records': len(category_df),
            'peak_years': peak_years,
            'peak_count': max_count,
            'yearly_distribution': yearly_counts.to_dict()
        }

        print(f"Peak year(s) for {category}: {peak_years} with {max_count} records")

        # Filter to just the peak years for this category
        peak_df = category_df[category_df[year_column].isin(peak_years)]

        # Check if we have content to analyze
        if len(peak_df) == 0:
            print(f"No records found for {category} in peak years.")
            continue

        if column_to_analyze not in peak_df.columns:
            print(f"Column '{column_to_analyze}' not found in the dataset.")
            continue

        # Check if the column has non-null values
        if peak_df[column_to_analyze].isna().all():
            print(f"No non-null values found in '{column_to_analyze}' for {category} in peak years.")
            continue

        # Category-specific subfolder for this analysis. str() keeps this
        # working for non-string categories; '/' would otherwise add a level.
        category_subject_type = f"{subject_type}/{str(category).replace('/', '_')}"
        if subject_main_dir and (text_analysis or entity_analysis):
            # NOTE(review): the analysis helpers are defined elsewhere; create
            # the nested folder here in case they do not create parent folders.
            (Path(subject_main_dir) / category_subject_type).mkdir(parents=True, exist_ok=True)

        # Perform text analysis if requested
        if text_analysis:
            print(f"Performing text analysis on {column_to_analyze} for {category} peak years...")

            try:
                text_results = analyze_text_column(
                    df=peak_df,
                    column_name=column_to_analyze,
                    subject_main_dir=subject_main_dir,
                    filter_language=True,
                    target_language='en',
                    remove_stopwords=True,
                    language=language, 
                    subject_type=category_subject_type,
                )
                results['text_analysis'][category] = text_results
            except Exception as e:
                # Best effort: one failing category must not stop the others
                print(f"Error in text analysis for {category}: {str(e)}")
                results['text_analysis'][category] = {"error": str(e)}

        # Perform entity analysis if requested
        if entity_analysis:
            print(f"Performing entity analysis on {column_to_analyze} for {category} peak years...")

            try:
                entity_results = analyze_entities_in_column(
                    df=peak_df,
                    column_name=column_to_analyze,
                    subject_main_dir=subject_main_dir,
                    min_count=min_entity_count,
                    max_entities=max_entities,
                    subject_type=category_subject_type,
                )
                results['entity_analysis'][category] = entity_results
            except Exception as e:
                # Best effort: one failing category must not stop the others
                print(f"Error in entity analysis for {category}: {str(e)}")
                results['entity_analysis'][category] = {"error": str(e)}

    # Generate summary dataframe of peak information
    summary_data = [
        {
            f'{column_name}': category,
            'Peak_Year(s)': ', '.join(map(str, info['peak_years'])),
            'Peak_Count': info['peak_count'],
            'Total_Records': info['total_records']
        }
        for category, info in results['peak_info'].items()
    ]

    summary_df = pd.DataFrame(summary_data)
    if not summary_df.empty:
        summary_df = summary_df.sort_values('Peak_Count', ascending=False)

        # Save the summary
        if subject_main_dir:
            csv_filename = f"{column_name.lower()}_top_{n}_peak_content_summary.csv"
            csv_path = Path(subject_main_dir) / subject_type / csv_filename
            summary_df.to_csv(csv_path, index=False)
            print(f"\nSummary saved to: {csv_path}")

    # Create a visualization of common themes across peak content
    visualize_peak_content_summary(results, column_name, column_to_analyze, subject_main_dir, subject_type)

    return results, summary_df

def visualize_peak_content_summary(results, column_name, column_to_analyze, subject_main_dir, subject_type):
    """
    Visualize a summary of common themes across peak content.

    Builds a grouped bar chart of the (at most 20) most frequent words that
    occur in the word frequencies of more than one category. Only categories
    whose text analysis succeeded and contains 'word_frequencies' are drawn;
    failed categories no longer show up as empty series in the legend.
    Nothing is drawn when there are no usable results or no shared words.

    Parameters:
    results (dict): Results from the analyze_peak_content function; only
        results['text_analysis'] (category -> analysis dict) is read
    column_name (str): Name of the categorical column analyzed
    column_to_analyze (str): Name of the text column analyzed
    subject_main_dir (str/Path): Main directory for all subject analyses;
        the chart is only saved when this is truthy
    subject_type (str): Type of subject analysis (forwarded to save_plot)

    Returns:
    None
    """
    text_results = results['text_analysis']

    # Check if we have text analysis results
    if not text_results or all(isinstance(v, dict) and 'error' in v for v in text_results.values()):
        print("No valid text analysis results to visualize.")
        return

    try:
        # Keep only the categories that delivered word frequencies; entries
        # that failed (or are not dicts at all) are ignored instead of being
        # plotted as all-zero bars or aborting the whole chart.
        frequencies_by_category = {
            category: analysis['word_frequencies']
            for category, analysis in text_results.items()
            if isinstance(analysis, dict)
            and 'error' not in analysis
            and 'word_frequencies' in analysis
        }

        # word -> {category: count}
        all_words = {}
        for category, frequencies in frequencies_by_category.items():
            for word, count in frequencies.items():
                all_words.setdefault(word, {})[category] = count

        if not all_words:
            print("No word frequencies found in the text analysis results.")
            return

        # Find words that appear in multiple categories
        common_words = {word: counts for word, counts in all_words.items()
                        if len(counts) > 1}

        if not common_words:
            print("No common words found across categories.")
            return

        # Sort by total frequency across categories
        sorted_words = sorted(common_words.items(),
                              key=lambda x: sum(x[1].values()),
                              reverse=True)[:20]  # Top 20 common words

        # Prepare data for visualization
        words = [word for word, _ in sorted_words]
        categories = list(frequencies_by_category)
        categories_data = {
            category: [counts.get(category, 0) for _, counts in sorted_words]
            for category in categories
        }

        # Create the visualization
        plt.figure(figsize=(14, 10))

        # Each word gets a group of bars that is 0.8 wide in total
        width = 0.8 / len(categories)
        positions = np.arange(len(words))

        # Plot bars for each category
        for i, (category, values) in enumerate(categories_data.items()):
            plt.bar(positions + i * width, values, width=width, label=category)

        # Set labels and title
        plt.xlabel('Common Words', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.title(f'Common Words in {column_to_analyze} During Peak Years for Top {column_name}', fontsize=14)

        # Center each tick label under its group of bars
        plt.xticks(positions + width * (len(categories) - 1) / 2, words, rotation=45, ha='right')

        plt.legend()
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()

        # Save the plot
        if subject_main_dir:
            filename = f"common_words_peak_years_{column_name.lower()}_{column_to_analyze.lower()}.png"
            save_plot(plt, filename, subject_main_dir, subject_type)

    except Exception as e:
        # The summary chart is a best-effort extra; report and carry on
        print(f"Error creating peak content summary visualization: {str(e)}")

In [None]:
def analyze_combined_peak_content(df, column_name='Journal', column_to_analyze='Title', n=5, 
                        year_range=(2010, 2020), year_column='Publication Year', 
                        subject_main_dir=None, text_analysis=True, entity_analysis=True,
                        min_entity_count=2, max_entities=20, language='english'):
    """
    Analyze the combined content of records from peak years for top categories.

    For each of the n most frequent categories inside year_range the year(s)
    with the highest record count are determined. The records of all those
    peak years are pooled into one DataFrame, which is then handed once to
    analyze_text_column and/or analyze_entities_in_column. The peak summary is
    built (and saved) before the content checks, so it is also returned when
    the text column turns out to be missing or empty.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column_name (str): Name of the categorical column to analyze (default: 'Journal')
    column_to_analyze (str): Name of the text column to analyze (default: 'Title')
    n (int): Number of top categories to analyze (default: 5)
    year_range (tuple): Inclusive range of years to focus on (default: 2010-2020)
    year_column (str): Name of the column containing year information
    subject_main_dir (str/Path): Main directory for all subject analyses
    text_analysis (bool): Whether to perform general text analysis
    entity_analysis (bool): Whether to perform entity analysis
    min_entity_count (int): Minimum count for an entity to be included in analysis
    max_entities (int): Maximum number of entities to display
    language (str): Language for text analysis

    Returns:
    tuple: (results, summary_df) where results is a dict with the keys
        'peak_info' (keyed by category) and 'combined_analysis' (with the
        optional keys 'text_analysis' and 'entity_analysis'), and summary_df
        is a pandas.DataFrame with one row per category, sorted by peak count
        in descending order (empty when no category was found)
    """
    # Setting up the subject type
    subject_type = f"{column_name.lower()}_peak_{column_to_analyze.lower()}"

    # Create the appropriate directory
    if subject_main_dir:
        analysis_dir = Path(subject_main_dir) / subject_type
        analysis_dir.mkdir(parents=True, exist_ok=True)

    # Filter by year range
    start_year, end_year = year_range
    year_filtered_df = df[(df[year_column] >= start_year) & (df[year_column] <= end_year)]

    print(f"Analyzing combined peak content for top {n} {column_name} between {start_year}-{end_year}")
    print(f"Original dataset size: {len(df)}")
    print(f"Dataset size after year filtering: {len(year_filtered_df)}")

    # Get the top N categories in this period
    top_categories = year_filtered_df[column_name].value_counts().nlargest(n).index.tolist()

    # Results dictionary
    results = {
        'peak_info': {},
        'combined_analysis': {}
    }

    # Collect all records from peak years across all categories
    all_peak_records = []

    # Process each top category to find peak years
    for category in top_categories:
        # Filter to just this category
        category_df = year_filtered_df[year_filtered_df[column_name] == category]

        if len(category_df) == 0:
            print(f"No data found for {category} in the specified year range.")
            continue

        print(f"\nIdentifying peak years for {category} ({len(category_df)} records)")

        # Find the peak year(s) for this category (ties yield several years)
        yearly_counts = category_df.groupby(year_column).size()
        max_count = yearly_counts.max()
        peak_years = yearly_counts[yearly_counts == max_count].index.tolist()

        # Store peak information
        results['peak_info'][category] = {
            'total_records': len(category_df),
            'peak_years': peak_years,
            'peak_count': max_count,
            'yearly_distribution': yearly_counts.to_dict()
        }

        print(f"Peak year(s) for {category}: {peak_years} with {max_count} records")

        # Filter to just the peak years for this category and add to our collection
        all_peak_records.append(category_df[category_df[year_column].isin(peak_years)])

    # Generate the summary of peak information right away: it only depends on
    # peak_info, so the early exits below can still return it.
    summary_data = [
        {
            f'{column_name}': category,
            'Peak_Year(s)': ', '.join(map(str, info['peak_years'])),
            'Peak_Count': info['peak_count'],
            'Total_Records': info['total_records']
        }
        for category, info in results['peak_info'].items()
    ]

    summary_df = pd.DataFrame(summary_data)
    if not summary_df.empty:
        summary_df = summary_df.sort_values('Peak_Count', ascending=False)

        # Save the summary
        if subject_main_dir:
            csv_filename = f"{column_name.lower()}_top_{n}_peak_content_summary.csv"
            csv_path = Path(subject_main_dir) / subject_type / csv_filename
            summary_df.to_csv(csv_path, index=False)
            print(f"\nSummary saved to: {csv_path}")

    # Combine all peak records into a single DataFrame
    if not all_peak_records:
        print("No peak data found for any category.")
        return results, summary_df

    combined_peak_df = pd.concat(all_peak_records, ignore_index=True)

    print(f"\nAnalyzing combined peak data with {len(combined_peak_df)} records...")

    # Check if we have content to analyze
    if column_to_analyze not in combined_peak_df.columns:
        print(f"Column '{column_to_analyze}' not found in the dataset.")
        return results, summary_df

    # Check if the column has non-null values
    if combined_peak_df[column_to_analyze].isna().all():
        print(f"No non-null values found in '{column_to_analyze}' in peak years.")
        return results, summary_df

    # Perform text analysis if requested
    if text_analysis:
        print(f"Performing text analysis on combined {column_to_analyze} from peak years...")

        try:
            text_results = analyze_text_column(
                df=combined_peak_df,
                column_name=column_to_analyze,
                subject_main_dir=subject_main_dir,
                filter_language=True,
                target_language='en',
                remove_stopwords=True,
                language=language, 
                subject_type=subject_type,
            )
            results['combined_analysis']['text_analysis'] = text_results
        except Exception as e:
            # Best effort: still attempt the entity analysis and return the summary
            print(f"Error in combined text analysis: {str(e)}")
            results['combined_analysis']['text_analysis'] = {"error": str(e)}

    # Perform entity analysis if requested
    if entity_analysis:
        print(f"Performing entity analysis on combined {column_to_analyze} from peak years...")

        try:
            entity_results = analyze_entities_in_column(
                df=combined_peak_df,
                column_name=column_to_analyze,
                subject_main_dir=subject_main_dir,
                min_count=min_entity_count,
                max_entities=max_entities,
                subject_type=subject_type,
            )
            results['combined_analysis']['entity_analysis'] = entity_results
        except Exception as e:
            # Best effort: the peak summary is still returned
            print(f"Error in combined entity analysis: {str(e)}")
            results['combined_analysis']['entity_analysis'] = {"error": str(e)}

    return results, summary_df

# Journal analysis

In [None]:
# Share of the top 10 journals per decade (percentage view)
visualize_categorical_by_decade_percentage(
    df=df,
    column_name='Journal',  # Categorical column to break down
    decade_column='decade',  # Column containing decade information
    subject_main_dir=subject_main_dir,
    top_n=10,  # Number of top categories to show
)

# Same breakdown as absolute record counts per decade
visualize_categorical_by_decade_counts(
    df=df,
    column_name='Journal',
    decade_column='decade',
    subject_main_dir=subject_main_dir,
    top_n=10,
)

In [None]:
# Analyze peaks for the 'Journal' column in the 2000-2025 period
# (no top_n is passed to this variant of the analysis)
journal_peaks = analyze_categorical_peaks(
    df=df,
    column_name='Journal',  # Categorical column to analyze
    year_column='Publication Year',     # Column with year information
    subject_main_dir=subject_main_dir,
    focus_period=(2000, 2025)  # Focus on 2000-2025 period
)

In [None]:
# Analyze peaks for the top 5 journals in the 2000-2025 period.
# NOTE(review): this rebinds `journal_peaks`, replacing the result of the
# analyze_categorical_peaks call made in the cell before.
journal_peaks = analyze_top_categorical_peaks(
    df=df,
    column_name='Journal',
    year_column='Publication Year',
    subject_main_dir=subject_main_dir,
    top_n=5,
    focus_period=(2000, 2025)
)


In [None]:
# Pooled analysis of the abstracts published in the peak years of the top 5
# journals (2010-2020). The call returns a (results, summary_df) tuple.
journal_peak_content = analyze_combined_peak_content(
    df=df,
    column_name='Journal',
    column_to_analyze='Abstract',
    n=5,
    year_range=(2010, 2020),
    year_column='Publication Year',
    subject_main_dir=subject_main_dir
)

# Publisher analysis

In [None]:
# Share of the top 10 publishers per decade (percentage view)
visualize_categorical_by_decade_percentage(
    df=df,
    column_name='Publisher',  # Categorical column to break down
    decade_column='decade',  # Column containing decade information
    subject_main_dir=subject_main_dir,
    top_n=10,  # Number of top categories to show
)

# Same breakdown as absolute record counts per decade
visualize_categorical_by_decade_counts(
    df=df,
    column_name='Publisher',
    decade_column='decade',
    subject_main_dir=subject_main_dir,
    top_n=10,
)

In [None]:
# Analyze peaks for the top 5 publishers in the 2000-2025 period.
# NOTE(review): the result is stored in `journal_peaks` although it describes
# publishers; this overwrites the journal result bound to that name earlier.
journal_peaks = analyze_top_categorical_peaks(
    df=df,
    column_name='Publisher',
    year_column='Publication Year',
    subject_main_dir=subject_main_dir,
    top_n=5,
    focus_period=(2000, 2025)
)


In [None]:
# Pooled peak-year content analysis for the top 5 publishers (2010-2020), run
# once on titles and once on abstracts. Each call returns a
# (results, summary_df) tuple. The title run gets its own name so that its
# result is no longer overwritten by the abstract run.
publisher_title_peak_content = analyze_combined_peak_content(
    df=df,
    column_name='Publisher',
    column_to_analyze='Title',
    n=5,
    year_range=(2010, 2020),
    year_column='Publication Year',
    subject_main_dir=subject_main_dir
)
# `publisher_peak_content` ends up holding the abstract result, as before.
publisher_peak_content = analyze_combined_peak_content(
    df=df,
    column_name='Publisher',
    column_to_analyze='Abstract',
    n=5,
    year_range=(2010, 2020),
    year_column='Publication Year',
    subject_main_dir=subject_main_dir
)

# Type analysis

In [None]:
# Share of the top 10 publication types per decade (percentage view)
visualize_categorical_by_decade_percentage(
    df=df,
    column_name='Type',  # Categorical column to break down
    decade_column='decade',  # Column containing decade information
    subject_main_dir=subject_main_dir,
    top_n=10,  # Number of top categories to show
)

# Same breakdown as absolute record counts per decade
visualize_categorical_by_decade_counts(
    df=df,
    column_name='Type',
    decade_column='decade',
    subject_main_dir=subject_main_dir,
    top_n=10,
)

In [None]:
# Analyze peaks for the top 10 publication types in the 2000-2025 period.
# NOTE(review): the result is stored in `journal_peaks` although it describes
# the 'Type' column; this overwrites whatever was bound to that name before.
journal_peaks = analyze_top_categorical_peaks(
    df=df,
    column_name='Type',
    year_column='Publication Year',
    subject_main_dir=subject_main_dir,
    top_n=10,
    focus_period=(2000, 2025)
)


# Citation analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def analyze_citation_by_time(df, subject_main_dir, time_column='decade', citation_column='Citation Count', 
                            plot_title='Mean Citation Count by Decade', 
                            filename='mean_citation_by_decade.png',
                            exclude_zero_citations=False):
    """
    Analyze and visualize mean citation counts over time periods (e.g., decades)

    Citation values are converted to numbers first; rows whose value cannot be
    converted are dropped. The per-period mean is drawn as a line with markers
    plus a dashed red linear trend line, and the plot is saved through
    save_plot under a subject type derived from citation_column.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data
    subject_main_dir (str/Path): Main directory for saving visualizations
    time_column (str): Column name for the time period (e.g., 'decade')
    citation_column (str): Column name for citation counts
    plot_title (str): Title for the plot
    filename (str): Filename for saving the plot
    exclude_zero_citations (bool): If True, only rows with a citation count
        greater than zero are used; '_non_zero' is added to the filename and
        ' (Excluding Zero Citations)' to the title

    Returns:
    pandas.DataFrame: One row per time period with the columns time_column and
        citation_column (the latter holding the mean citation count)
    """
    # Subject type is derived from the citation column name
    subject_type = citation_column.lower().replace(' ', '_')

    # Work on the two relevant columns only and make the citation column truly
    # numeric. Filtering on a coerced copy alone would leave numeric strings in
    # place, which breaks the "> 0" comparison and the mean below.
    analysis_df = df[[time_column, citation_column]].copy()
    analysis_df[citation_column] = pd.to_numeric(analysis_df[citation_column], errors='coerce')
    analysis_df = analysis_df.dropna(subset=[citation_column])

    # Filter out zero citations if requested
    if exclude_zero_citations:
        analysis_df = analysis_df[analysis_df[citation_column] > 0]
        # Update filename and title to indicate filtering
        filename = filename.replace('.png', '_non_zero.png')
        plot_title += ' (Excluding Zero Citations)'

    # Group by time period and calculate mean citation count
    time_citation_df = analysis_df.groupby(time_column)[citation_column].mean().reset_index()

    # Create the plot
    plt.figure(figsize=(12, 6))

    # Line plot with markers
    sns.lineplot(x=time_column, y=citation_column, data=time_citation_df, marker='o', linewidth=2, markersize=8)

    # Add linear trend line
    # NOTE(review): presumably requires numeric values in time_column - confirm
    sns.regplot(x=time_column, y=citation_column, data=time_citation_df, 
                scatter=False, line_kws={"color":"red", "linestyle":"--"})

    # Enhance the plot
    plt.title(plot_title, fontsize=16)
    plt.xlabel(f'{time_column.capitalize()}', fontsize=14)
    plt.ylabel('Mean Citation Count', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)
    # One tick per period; rotate the labels once there are more than ten
    plt.xticks(time_citation_df[time_column], rotation=45 if len(time_citation_df) > 10 else 0)

    plt.tight_layout()

    # Save the plot using the utility function
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Return the aggregated data for further analysis if needed
    return time_citation_df

# Usage (df and subject_main_dir are defined earlier in the notebook).
# With all citations (kept for reference, not executed):
# citation_by_decade = analyze_citation_by_time(df, subject_main_dir)
# 
# Only considering non-zero citations (this is the call that actually runs):
citation_by_decade_non_zero = analyze_citation_by_time(df, subject_main_dir, exclude_zero_citations=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import math

def analyze_citation_distribution(df, subject_main_dir, citation_column='Citation Count',
                                  exclude_zero_citations=False, log_scale=True,
                                  percentile_cutoff=99.5):
    """
    Analyze and visualize the distribution of citation counts

    Up to five figures are handed to ``save_plot``: a histogram trimmed at
    ``percentile_cutoff``, an optional log-scale histogram, a boxplot, an
    empirical CDF and a bar chart of citation-count ranges. File names get a
    ``_non_zero`` suffix when zero-citation papers are excluded.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data (not modified)
    subject_main_dir (str/Path): Main directory for saving visualizations
    citation_column (str): Column name for citation counts
    exclude_zero_citations (bool): Whether to exclude zero citations
    log_scale (bool): Whether to also produce the log-scale histogram
    percentile_cutoff (float): Percentile (0-100) above which values are trimmed
        from the first histogram only

    Returns:
    dict: 'stats' (summary statistics, including a nested 'percentiles' dict),
          'filtered_df' (rows used for the analysis, with the citation column
          coerced to numeric) and 'range_distribution' (DataFrame with the
          columns Range / Count / Percentage). When no row survives filtering,
          nothing is plotted, 'stats' only holds 'count' and an 'error'
          message is added.
    """
    # Subject type is derived from the citation column name
    subject_type = citation_column.lower().replace(' ', '_')

    # Coerce to numeric and drop what cannot be parsed. Converting the column
    # (instead of only testing it) keeps numeric strings such as "12" from
    # breaking the comparisons below.
    analysis_df = df.copy()
    analysis_df[citation_column] = pd.to_numeric(analysis_df[citation_column], errors='coerce')
    analysis_df = analysis_df[analysis_df[citation_column].notna()]

    # Optionally exclude zero citations
    if exclude_zero_citations:
        analysis_df = analysis_df[analysis_df[citation_column] > 0]
        plot_suffix = '_non_zero'
    else:
        plot_suffix = ''

    citations = analysis_df[citation_column]
    total_papers = len(analysis_df)

    # Nothing left to describe: bail out before the histogram (0 bins) and the
    # range table (division by zero) can fail.
    if total_papers == 0:
        print(f"No numeric values found in '{citation_column}' after filtering.")
        return {
            'stats': {'count': 0},
            'filtered_df': analysis_df,
            'range_distribution': pd.DataFrame(columns=['Range', 'Count', 'Percentage']),
            'error': 'No numeric citation values found'
        }

    def count_at_least(threshold):
        # Number of papers whose citation count reaches the threshold
        return int((citations >= threshold).sum())

    # Calculate basic statistics
    stats = {
        'count': total_papers,
        'mean': citations.mean(),
        'median': citations.median(),
        'std': citations.std(),
        'min': citations.min(),
        'max': citations.max(),
        'papers_with_1000plus_citations': count_at_least(1000),
        'papers_with_500plus_citations': count_at_least(500),
        'papers_with_100plus_citations': count_at_least(100),
        'papers_with_10plus_citations': count_at_least(10),
        'papers_with_zero_citations': int((citations == 0).sum()),
        'percentiles': {
            '25%': citations.quantile(0.25),
            '50%': citations.quantile(0.5),
            '75%': citations.quantile(0.75),
            '90%': citations.quantile(0.9),
            '95%': citations.quantile(0.95),
            '99%': citations.quantile(0.99),
            '99.9%': citations.quantile(0.999)
        }
    }

    # 1. Histogram of the distribution, trimmed so extreme values do not
    #    flatten the bulk of the data
    plt.figure(figsize=(12, 7))

    cutoff_value = citations.quantile(percentile_cutoff / 100)
    trimmed = citations[citations <= cutoff_value]

    # Square-root rule capped at 50 bins, never fewer than one bin
    bin_count = max(1, min(50, int(math.sqrt(len(trimmed)))))

    sns.histplot(trimmed, bins=bin_count, kde=True)
    plt.title(f'Distribution of Citation Counts (≤ {percentile_cutoff}th percentile)', fontsize=15)
    plt.xlabel('Citation Count', fontsize=12)
    plt.ylabel('Number of Papers', fontsize=12)
    plt.grid(True, alpha=0.3)

    # Summary statistics box (computed on the untrimmed data)
    stats_text = (f"Mean: {stats['mean']:.2f}\n"
                  f"Median: {stats['median']:.1f}\n"
                  f"Max: {stats['max']}\n"
                  f"Std Dev: {stats['std']:.2f}\n"
                  f"Papers ≥1000 citations: {stats['papers_with_1000plus_citations']}\n"
                  f"Papers ≥500 citations: {stats['papers_with_500plus_citations']}")
    plt.figtext(0.75, 0.70, stats_text, fontsize=10,
                bbox=dict(facecolor='white', alpha=0.8, boxstyle='round'))

    plt.tight_layout()
    save_plot(plt, f'citation_distribution_histogram{plot_suffix}.png', subject_main_dir, subject_type)

    # 2. Log-scale histogram if requested
    if log_scale:
        plt.figure(figsize=(12, 7))
        # log1p = log(1 + x), so zero citations map to 0 instead of -inf
        log_values = np.log1p(citations)
        sns.histplot(log_values, bins=50, kde=True)
        plt.title('Distribution of Citation Counts (Log Scale)', fontsize=15)
        plt.xlabel('Log(Citation Count + 1)', fontsize=12)
        plt.ylabel('Number of Papers', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        save_plot(plt, f'citation_distribution_log_scale{plot_suffix}.png', subject_main_dir, subject_type)

    # 3. Boxplot to show the distribution and outliers
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=citations)
    plt.title('Boxplot of Citation Counts', fontsize=15)
    plt.ylabel('Citation Count', fontsize=12)
    plt.grid(True, axis='y', alpha=0.3)
    plt.tight_layout()
    save_plot(plt, f'citation_distribution_boxplot{plot_suffix}.png', subject_main_dir, subject_type)

    # 4. Empirical CDF (Cumulative Distribution Function)
    plt.figure(figsize=(12, 7))

    sorted_citations = np.sort(citations)
    cumulative_prob = np.arange(1, len(sorted_citations) + 1) / len(sorted_citations)

    plt.plot(sorted_citations, cumulative_prob)
    plt.title('Cumulative Distribution Function of Citation Counts', fontsize=15)
    plt.xlabel('Citation Count', fontsize=12)
    plt.ylabel('Cumulative Probability', fontsize=12)
    plt.grid(True, alpha=0.3)

    # Mark interesting percentiles: first sorted value whose cumulative
    # probability reaches the requested level
    for perc, label in [(0.5, '50%'), (0.75, '75%'), (0.9, '90%'), (0.95, '95%')]:
        idx = np.searchsorted(cumulative_prob, perc)
        if idx < len(sorted_citations):
            x_val = sorted_citations[idx]
            plt.axhline(y=perc, color='r', linestyle='--', alpha=0.3)
            plt.axvline(x=x_val, color='r', linestyle='--', alpha=0.3)
            plt.text(x_val + 0.5, perc + 0.01, f'{label}: {x_val:.1f}', fontsize=10)

    plt.tight_layout()
    save_plot(plt, f'citation_distribution_cdf{plot_suffix}.png', subject_main_dir, subject_type)

    # 5. Table of citation count ranges (bounds are inclusive)
    citation_ranges = [
        (0, 0, 'Zero citations'),
        (1, 9, '1-9 citations'),
        (10, 49, '10-49 citations'),
        (50, 99, '50-99 citations'),
        (100, 499, '100-499 citations'),
        (500, 999, '500-999 citations'),
        (1000, float('inf'), '1000+ citations')
    ]

    range_counts = []
    for lower, upper, label in citation_ranges:
        # An infinite upper bound makes the second comparison always true
        in_range = (citations >= lower) & (citations <= upper)
        count = int(in_range.sum())
        percentage = count / total_papers * 100
        range_counts.append({'Range': label, 'Count': count, 'Percentage': percentage})

    range_df = pd.DataFrame(range_counts)

    # Bar chart of citation ranges
    plt.figure(figsize=(12, 7))
    sns.barplot(x='Range', y='Percentage', data=range_df)
    plt.title('Distribution of Papers by Citation Count Ranges', fontsize=15)
    plt.xlabel('Citation Count Range', fontsize=12)
    plt.ylabel('Percentage of Papers (%)', fontsize=12)

    # Add count labels on top of bars
    for i, row in enumerate(range_df.itertuples()):
        plt.text(i, row.Percentage + 0.5, f'{row.Count:,}\n({row.Percentage:.1f}%)',
                 ha='center', va='bottom', fontsize=9)

    plt.xticks(rotation=45)
    plt.grid(True, axis='y', alpha=0.3)
    plt.tight_layout()
    save_plot(plt, f'citation_distribution_ranges{plot_suffix}.png', subject_main_dir, subject_type)

    return {'stats': stats, 'filtered_df': analysis_df, 'range_distribution': range_df}

# Example usage (assumes `df` and `subject_main_dir` are already defined).
# Distribution over every paper that has a numeric citation count:
citation_stats = analyze_citation_distribution(df, subject_main_dir)
#
# Same analysis restricted to papers with at least one citation
# (plots are saved with a `_non_zero` filename suffix):
citation_stats_non_zero = analyze_citation_distribution(df, subject_main_dir, exclude_zero_citations=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def analyze_high_citations_by_category(df, subject_main_dir, category_column='Journal',
                                       citation_column='Citation Count', min_citation_count=500,
                                       top_n=10, min_papers=3, exclude_na=True,
                                       plot_title=None, filename=None):
    """
    Analyze and visualize top categories (e.g., journals, publishers) by citation metrics

    Three ranking charts (percentage of highly cited papers, number of highly
    cited papers, mean citation count) and one combined multi-metric chart are
    handed to ``save_plot``.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data (not modified)
    subject_main_dir (str/Path): Main directory for saving visualizations
    category_column (str): Column name for the category to analyze (e.g., 'Journal', 'Publisher', 'Type')
    citation_column (str): Column name for citation counts
    min_citation_count (int): Minimum citation count threshold for "highly cited" papers
    top_n (int): Number of top categories to display
    min_papers (int): Minimum number of papers a category must have to be included
    exclude_na (bool): Whether to drop rows with NA/null in the category column
        before grouping. Note that ``DataFrame.groupby`` skips NA keys by
        default, so NA never appears as a category either way.
    plot_title (str): Custom title for the combined multi-metric chart
        (if None, a default title is used)
    filename (str): Custom filename for the combined multi-metric chart
        (if None, a default filename is used)

    Returns:
    dict: 'stats_df' (one row per retained category), 'top_by_percent',
          'top_by_count', 'top_by_mean', 'highly_cited_threshold' and
          'highly_cited_papers'. When no category reaches ``min_papers`` only
          'stats_df' (empty) and 'error' are returned and nothing is plotted.
    """
    category_slug = category_column.lower().replace(' ', '_')
    subject_type = f"citation_{category_slug}"

    # Coerce to numeric and drop what cannot be parsed, so numeric strings do
    # not break the threshold comparisons below
    analysis_df = df.copy()
    analysis_df[citation_column] = pd.to_numeric(analysis_df[citation_column], errors='coerce')
    analysis_df = analysis_df[analysis_df[citation_column].notna()]

    # Exclude NA values in category column if requested
    if exclude_na:
        analysis_df = analysis_df[analysis_df[category_column].notna()]

    # 1. Identify highly cited papers (returned to the caller)
    highly_cited = analysis_df[analysis_df[citation_column] >= min_citation_count].copy()

    # 2. Calculate statistics by category
    category_stats = []
    for category, group in analysis_df.groupby(category_column):
        total_papers = len(group)
        # Skip categories with too few papers
        if total_papers < min_papers:
            continue

        group_citations = group[citation_column]
        # Counted inside the group; no need to rescan all highly cited papers
        highly_cited_count = int((group_citations >= min_citation_count).sum())

        category_stats.append({
            'Category': category,
            'Total_Papers': total_papers,
            'Highly_Cited_Papers': highly_cited_count,
            'Percent_Highly_Cited': highly_cited_count / total_papers * 100,
            'Mean_Citations': group_citations.mean(),
            'Median_Citations': group_citations.median(),
            'Max_Citations': group_citations.max()
        })

    stats_df = pd.DataFrame(category_stats)

    # If we have no data, return early
    if len(stats_df) == 0:
        print(f"No categories with at least {min_papers} papers found.")
        return {
            'stats_df': stats_df,
            'error': 'No categories with minimum paper count found'
        }

    # Sort by different metrics and get top N
    top_by_percent = stats_df.sort_values('Percent_Highly_Cited', ascending=False).head(top_n)
    top_by_count = stats_df.sort_values('Highly_Cited_Papers', ascending=False).head(top_n)
    top_by_mean = stats_df.sort_values('Mean_Citations', ascending=False).head(top_n)

    def plot_ranking(ranking, metric, label_offset, make_label, title, xlabel, out_name):
        # Horizontal bar chart of one metric with a text label next to each bar
        plt.figure(figsize=(12, 8))
        sns.barplot(y='Category', x=metric, data=ranking)
        for position, (_, row) in enumerate(ranking.iterrows()):
            plt.text(row[metric] + label_offset, position, make_label(row),
                     va='center', fontsize=9)
        plt.title(title, fontsize=15)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(category_column, fontsize=12)
        plt.grid(True, axis='x', alpha=0.3)
        plt.tight_layout()
        save_plot(plt, out_name, subject_main_dir, subject_type)

    # 3. Create visualizations

    # 3.1 Top categories by percentage of highly cited papers
    plot_ranking(
        top_by_percent, 'Percent_Highly_Cited', 0.5,
        lambda row: f"{row['Highly_Cited_Papers']}/{row['Total_Papers']}",
        f"Top {top_n} {category_column}s by Percentage of Highly Cited Papers (≥{min_citation_count} citations)",
        'Percentage of Highly Cited Papers (%)',
        f"top_{category_slug}_by_percent_highly_cited.png",
    )

    # 3.2 Top categories by count of highly cited papers
    plot_ranking(
        top_by_count, 'Highly_Cited_Papers', 0.5,
        lambda row: f"{row['Percent_Highly_Cited']:.1f}% of {row['Total_Papers']}",
        f"Top {top_n} {category_column}s by Number of Highly Cited Papers (≥{min_citation_count} citations)",
        'Number of Highly Cited Papers',
        f"top_{category_slug}_by_count_highly_cited.png",
    )

    # 3.3 Top categories by mean citation count
    plot_ranking(
        top_by_mean, 'Mean_Citations', 1,
        lambda row: f"{row['Total_Papers']} papers",
        f"Top {top_n} {category_column}s by Mean Citation Count",
        'Mean Citation Count',
        f"top_{category_slug}_by_mean_citations.png",
    )

    # 3.4 Combined visualization: union of the categories that made any top list
    all_top_categories = pd.concat([
        top_by_percent['Category'],
        top_by_count['Category'],
        top_by_mean['Category']
    ]).unique()

    combined_stats = (stats_df[stats_df['Category'].isin(all_top_categories)]
                      .sort_values('Highly_Cited_Papers', ascending=False))

    if len(combined_stats) > 0:
        categories = combined_stats['Category'].tolist()
        x = np.arange(len(categories))
        width = 0.25

        # plt.subplots creates the figure; no separate plt.figure() call is
        # needed (an extra one would leave an empty figure open)
        fig, ax = plt.subplots(figsize=(15, 10))

        # Count and mean share the left axis
        ax.bar(x - width, combined_stats['Highly_Cited_Papers'], width, label='Highly Cited Papers')
        ax.bar(x, combined_stats['Mean_Citations'], width, label='Mean Citations')

        # Percentages go on their own right-hand axis so they are read against
        # the percentage scale rather than the count/mean scale
        ax2 = ax.twinx()
        percent_values = combined_stats['Percent_Highly_Cited']
        ax2.bar(x + width, percent_values, width, color='C2', label='% Highly Cited')
        ax2.set_ylabel('Percentage', fontsize=12)
        # Fall back to a non-degenerate range when every percentage is 0
        ax2.set_ylim(0, max(percent_values.max() * 1.2, 1.0))

        # Set labels and a single legend covering both axes
        ax.set_ylabel('Count / Mean', fontsize=12)
        combined_title = plot_title if plot_title is not None else \
            f'Multi-metric Comparison of Top {category_column}s'
        ax.set_title(combined_title, fontsize=15)
        ax.set_xticks(x)
        ax.set_xticklabels(categories, rotation=45, ha='right')
        handles, labels = ax.get_legend_handles_labels()
        handles2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(handles + handles2, labels + labels2)

        plt.tight_layout()
        combined_filename = filename if filename is not None else \
            f"top_{category_slug}_multi_metric.png"
        save_plot(plt, combined_filename, subject_main_dir, subject_type)

    # 4. Return results for further analysis
    return {
        'stats_df': stats_df,
        'top_by_percent': top_by_percent,
        'top_by_count': top_by_count,
        'top_by_mean': top_by_mean,
        'highly_cited_threshold': min_citation_count,
        'highly_cited_papers': highly_cited
    }

# Example usage (assumes `df` and `subject_main_dir` are already defined).
# Every call below treats papers with at least 1000 citations as "highly cited".
# Journal analysis (top 10)
journal_analysis = analyze_high_citations_by_category(df, subject_main_dir, category_column='Journal', min_citation_count=1000, top_n=10)

# Publisher analysis (top 10)
publisher_analysis = analyze_high_citations_by_category(df, subject_main_dir, category_column='Publisher', min_citation_count=1000, top_n=10)

# Type analysis (top 5)
type_analysis = analyze_high_citations_by_category(df, subject_main_dir, category_column='Type', min_citation_count=1000, top_n=5)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from pathlib import Path
from collections import Counter, defaultdict

def analyze_citations_by_list_elements(df, subject_main_dir, list_columns=None,
                                       citation_column='Citation Count',
                                       min_citation_count=500, top_n=15,
                                       min_occurrences=10, exclude_na=True):
    """
    Analyze and visualize citation patterns across list-based columns (e.g., Topics, Domains, Fields)

    Each paper contributes every distinct element found in ``list_columns``
    once. Two ranking charts and one scatter plot are handed to ``save_plot``.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data (not modified)
    subject_main_dir (str/Path): Main directory for saving visualizations
    list_columns (list/str): Column name(s) containing list-like data (lists, dicts, or
        their string representations; for dicts the keys are used). Defaults to ['Topics'].
    citation_column (str): Column name for citation counts
    min_citation_count (int): Minimum citation count threshold for "highly cited" papers
    top_n (int): Number of top elements to display
    min_occurrences (int): Minimum number of papers an element must appear in to be included
    exclude_na (bool): Whether to skip papers with NA/null in any of the analyzed columns

    Returns:
    dict: 'elements_df' (one row per retained element), 'top_by_percent',
          'top_by_count' and 'highly_cited_threshold'. When no element reaches
          ``min_occurrences`` only 'elements_df' (empty) and 'error' are
          returned and nothing is plotted.
    """
    # Normalize the column selection (avoids a shared mutable default and
    # iterating over the characters of a bare string)
    if list_columns is None:
        list_columns = ['Topics']
    elif isinstance(list_columns, str):
        list_columns = [list_columns]
    else:
        list_columns = list(list_columns)

    # Generate the subject type from all column names
    combined_names = '_'.join(col.lower().replace(' ', '_') for col in list_columns)
    subject_type = f"citation_{combined_names}"

    # Coerce to numeric and drop what cannot be parsed, so numeric strings do
    # not break the threshold comparison below
    analysis_df = df.copy()
    analysis_df[citation_column] = pd.to_numeric(analysis_df[citation_column], errors='coerce')
    analysis_df = analysis_df[analysis_df[citation_column].notna()]

    def is_missing(value):
        # pd.isna() returns an array for list cells, which cannot be used in a
        # boolean context; only scalars can be "missing"
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def safe_convert_to_list(value):
        # Turn one cell into a list of elements
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            return list(value.keys())
        if isinstance(value, str):
            try:
                converted = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError):
                # Not a valid literal: treat the text as a single item
                return [value]
            if isinstance(converted, list):
                return converted
            if isinstance(converted, dict):
                return list(converted.keys())
            return [converted]  # Scalar literal
        if is_missing(value):
            return []
        return [value]  # Any other scalar

    # Per element: number of papers containing it, and how many of those are highly cited
    element_counts = defaultdict(lambda: {'total': 0, 'highly_cited': 0})

    cell_rows = zip(*(analysis_df[col] for col in list_columns))
    for citations, cells in zip(analysis_df[citation_column], cell_rows):
        # Skip if any required column is missing
        if exclude_na and any(is_missing(cell) for cell in cells):
            continue

        # A set so that each element is counted once per paper
        paper_elements = set()
        for cell in cells:
            paper_elements.update(safe_convert_to_list(cell))

        is_highly_cited = citations >= min_citation_count
        for element in paper_elements:
            counts = element_counts[element]
            counts['total'] += 1
            if is_highly_cited:
                counts['highly_cited'] += 1

    # Convert to DataFrame for analysis. The *_Frequency columns are kept for
    # compatibility; because elements are counted once per paper they equal
    # the paper counts.
    elements_df = pd.DataFrame([
        {
            'Element': element,
            'Total_Papers': counts['total'],
            'Highly_Cited_Papers': counts['highly_cited'],
            'Percent_Highly_Cited': counts['highly_cited'] / counts['total'] * 100,
            'Total_Frequency': counts['total'],
            'Highly_Cited_Frequency': counts['highly_cited']
        }
        for element, counts in element_counts.items()
        if counts['total'] >= min_occurrences  # Filter by minimum occurrences
    ])

    # If no data, return early
    if len(elements_df) == 0:
        print(f"No elements with at least {min_occurrences} occurrences found.")
        return {
            'elements_df': pd.DataFrame(),
            'error': 'No elements with minimum occurrences found'
        }

    # Sort by different metrics and get top N
    top_by_percent = elements_df.sort_values('Percent_Highly_Cited', ascending=False).head(top_n)
    top_by_count = elements_df.sort_values('Highly_Cited_Papers', ascending=False).head(top_n)

    column_names = ', '.join(list_columns)

    def plot_ranking(ranking, metric, make_label, title, xlabel, out_name):
        # Horizontal bar chart of one metric with a text label next to each bar
        plt.figure(figsize=(12, 10))
        sns.barplot(y='Element', x=metric, data=ranking)
        for position, (_, row) in enumerate(ranking.iterrows()):
            plt.text(row[metric] + 0.5, position, make_label(row),
                     va='center', fontsize=9)
        plt.title(title, fontsize=15)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Element', fontsize=12)
        plt.grid(True, axis='x', alpha=0.3)
        plt.tight_layout()
        save_plot(plt, out_name, subject_main_dir, subject_type)

    # 1. Top elements by percentage of highly cited papers
    plot_ranking(
        top_by_percent, 'Percent_Highly_Cited',
        lambda row: f"{row['Highly_Cited_Papers']}/{row['Total_Papers']}",
        f"Top {top_n} Elements from {column_names} by Percentage of Highly Cited Papers",
        'Percentage of Highly Cited Papers (%)',
        "top_elements_by_percent_highly_cited.png",
    )

    # 2. Top elements by count of highly cited papers
    plot_ranking(
        top_by_count, 'Highly_Cited_Papers',
        lambda row: f"{row['Percent_Highly_Cited']:.1f}% of {row['Total_Papers']}",
        f"Top {top_n} Elements from {column_names} by Number of Highly Cited Papers",
        'Number of Highly Cited Papers',
        "top_elements_by_count_highly_cited.png",
    )

    # 3. Scatter plot of total papers vs highly cited papers
    plt.figure(figsize=(12, 10))

    # Point sizes scale linearly with total papers, between min_size and max_size
    max_size = 300
    min_size = 30
    sizes = min_size + (elements_df['Total_Papers'] / elements_df['Total_Papers'].max()) * (max_size - min_size)

    scatter = plt.scatter(
        elements_df['Total_Papers'],
        elements_df['Highly_Cited_Papers'],
        s=sizes,
        alpha=0.6,
        c=elements_df['Percent_Highly_Cited'],
        cmap='viridis'
    )

    cbar = plt.colorbar(scatter)
    cbar.set_label('Percentage of Highly Cited Papers', fontsize=12)

    # Label only the elements that appear in either top list
    top_combined = pd.concat([top_by_percent, top_by_count]).drop_duplicates()
    for _, row in top_combined.iterrows():
        plt.annotate(
            row['Element'],
            (row['Total_Papers'], row['Highly_Cited_Papers']),
            fontsize=9,
            xytext=(5, 5),
            textcoords='offset points'
        )

    plt.title(f"Relationship Between Total Papers and Highly Cited Papers by {column_names} Elements",
              fontsize=15)
    plt.xlabel('Total Number of Papers', fontsize=12)
    plt.ylabel('Number of Highly Cited Papers', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    save_plot(plt, "elements_citation_scatter.png", subject_main_dir, subject_type)

    # Return results for further analysis
    return {
        'elements_df': elements_df,
        'top_by_percent': top_by_percent,
        'top_by_count': top_by_count,
        'highly_cited_threshold': min_citation_count
    }

# Example usage (assumes `df` and `subject_main_dir` are already defined).
# Each call below analyzes a single list-like column with a 1000-citation threshold.
#
# Elements of the 'Domains' column (note: the result is stored in `topics_analysis`)
topics_analysis = analyze_citations_by_list_elements(df, subject_main_dir, 
                                                   list_columns=['Domains'],
                                                   min_citation_count=1000)

# Elements of the 'Topics' column (note: the result is stored in `domains_fields_analysis`)
domains_fields_analysis = analyze_citations_by_list_elements(df, subject_main_dir, 
                                                           list_columns=['Topics'],
                                                           min_citation_count=1000)

# Elements of the 'concept_dict' column (for dictionary values, the keys are the elements)
concepts_analysis = analyze_citations_by_list_elements(df, subject_main_dir, 
                                                     list_columns=['concept_dict'],
                                                     min_citation_count=1000)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def analyze_high_citation_text(df, subject_main_dir, text_column='Abstract', citation_column='Citation Count',
                             min_citation_count=500, analysis_type='general', comparison=True,
                             filter_language=True, target_language='en', remove_stopwords=True,
                             language='english', ner_model='en_core_web_sm',
                             min_entity_count=2, max_entities=20):
    """
    Analyze textual content of highly cited papers and optionally compare with regular papers

    Rows are split on ``citation_column``: those with at least ``min_citation_count``
    citations are "highly cited", the rest are "regular". Rows whose citation value
    cannot be parsed as a number, or whose text is missing, are ignored.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data (not modified)
    subject_main_dir (str/Path): Main directory for saving visualizations
    text_column (str): Column name containing text to analyze
    citation_column (str): Column name for citation counts
    min_citation_count (int): Minimum citation count threshold for "highly cited" papers
    analysis_type (str): Type of analysis to perform - 'general', 'entities', or 'both'.
        Any other value skips both analyses and only the counts are returned.
    comparison (bool): Whether to compare high-citation papers with regular papers
    filter_language (bool): Whether to filter by language
    target_language (str): Target language code to keep
    remove_stopwords (bool): Whether to remove stopwords in preprocessing
    language (str): Language for stopwords removal
    ner_model (str): spaCy model to use for NER
    min_entity_count (int): Minimum count for an entity to be included
    max_entities (int): Maximum number of entities to show in visualizations

    Returns:
    dict: Always contains 'highly_cited_count', 'regular_papers_count' and
        'analysis_type'. Depending on the options it may also contain
        'highly_cited_text_analysis', 'regular_papers_text_analysis',
        'highly_cited_entity_analysis' and 'regular_papers_entity_analysis'
        (whatever the underlying analysis helpers return).
    """
    # Subject types are passed to the analysis helpers; presumably they name
    # the output sub-folders - verify against analyze_text_column / save_plot.
    subject_type = f"high_citation_{text_column.lower().replace(' ', '_')}"
    compared_subject_type = f"{subject_type}_compared"

    # Compare on the numeric conversion itself so numeric strings are handled
    # and unparseable values (NaN) never match either side of the split.
    citations = pd.to_numeric(df[citation_column], errors='coerce')
    usable = citations.notna() & df[text_column].notna()
    highly_cited_df = df[usable & (citations >= min_citation_count)].copy()
    regular_papers_df = df[usable & (citations < min_citation_count)].copy()

    # Ensure the output directory exists
    (Path(subject_main_dir) / subject_type).mkdir(parents=True, exist_ok=True)

    results = {
        'highly_cited_count': len(highly_cited_df),
        'regular_papers_count': len(regular_papers_df),
        'analysis_type': analysis_type
    }

    run_general = analysis_type in ('general', 'both')
    run_entities = analysis_type in ('entities', 'both')

    # Draw the comparison sample once (up to twice the number of highly cited
    # papers) so the text and entity analyses look at the same regular papers.
    regular_sample_df = None
    if comparison and (run_general or run_entities):
        sample_size = min(len(highly_cited_df) * 2, len(regular_papers_df))
        regular_sample_df = regular_papers_df.sample(sample_size, random_state=42)

    if run_general:
        print(f"Running general text analysis on {len(highly_cited_df)} highly cited papers...")
        results['highly_cited_text_analysis'] = analyze_text_column(
            highly_cited_df,
            text_column,
            subject_main_dir,
            filter_language=filter_language,
            target_language=target_language,
            remove_stopwords=remove_stopwords,
            language=language,
            subject_type=subject_type
        )

        if regular_sample_df is not None:
            print(f"Running general text analysis on {len(regular_sample_df)} regular papers for comparison...")
            results['regular_papers_text_analysis'] = analyze_text_column(
                regular_sample_df,
                text_column,
                subject_main_dir,
                filter_language=filter_language,
                target_language=target_language,
                remove_stopwords=remove_stopwords,
                language=language,
                subject_type=compared_subject_type
            )

    if run_entities:
        print(f"Running entity analysis on {len(highly_cited_df)} highly cited papers...")
        results['highly_cited_entity_analysis'] = analyze_entities_in_column(
            highly_cited_df,
            text_column,
            subject_main_dir,
            min_count=min_entity_count,
            max_entities=max_entities,
            model=ner_model,
            preprocess=True,
            subject_type=subject_type
        )

        if regular_sample_df is not None:
            print(f"Running entity analysis on {len(regular_sample_df)} regular papers for comparison...")
            results['regular_papers_entity_analysis'] = analyze_entities_in_column(
                regular_sample_df,
                text_column,
                subject_main_dir,
                min_count=min_entity_count,
                max_entities=max_entities,
                model=ner_model,
                preprocess=True,
                subject_type=compared_subject_type
            )

    # Create comparison visualizations if we have both sets of text results
    if comparison and 'highly_cited_text_analysis' in results and 'regular_papers_text_analysis' in results:
        print("Creating comparison visualizations...")

        # Best-effort plot: it depends on analyze_text_column returning a
        # mapping with a 'document_lengths' entry, which is not guaranteed here,
        # so any failure is reported instead of aborting the analysis.
        try:
            high_lengths = results['highly_cited_text_analysis'].get('document_lengths', [])
            regular_lengths = results['regular_papers_text_analysis'].get('document_lengths', [])

            plt.figure(figsize=(12, 6))
            sns.histplot(high_lengths, kde=True, color='blue', alpha=0.5, label=f'Highly Cited Papers (≥{min_citation_count} citations)')
            sns.histplot(regular_lengths, kde=True, color='red', alpha=0.5, label='Regular Papers')
            plt.title(f'Document Length Comparison for {text_column}', fontsize=15)
            plt.xlabel('Document Length (characters)', fontsize=12)
            plt.ylabel('Count', fontsize=12)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            save_plot(plt, f"document_length_comparison.png", subject_main_dir, subject_type)

        except Exception as e:
            print(f"Couldn't create document length comparison: {e}")

    return results

# Run the high-citation text analysis on abstracts and then on titles.
# Relies on df and subject_main_dir being defined by earlier cells.
abstract_analysis = analyze_high_citation_text(
    df,
    subject_main_dir,
    text_column='Abstract',
    min_citation_count=500,
    analysis_type='general',
    comparison=True,
)

title_analysis = analyze_high_citation_text(
    df,
    subject_main_dir,
    text_column='Title',
    min_citation_count=500,
    analysis_type='general',
    comparison=True,
)

# Domain analysis

## Top 20 domains

In [None]:
# Flatten the per-paper domain collections into one list, then tally them.
# Assumes each 'Domains' entry is already a list - TODO confirm it was parsed
# from its CSV string form earlier in the notebook.
all_domains = []
for sublist in df['Domains']:
    all_domains.extend(sublist)
domain_counts = Counter(all_domains)


In [None]:
# Turn the domain tally into a two-column table (domain, count), most
# frequent first.
domain_counts_df = (
    pd.DataFrame
    .from_dict(domain_counts, orient='index', columns=['count'])
    .reset_index()
    .rename(columns={'index': 'domain'})
    .sort_values('count', ascending=False)
)

# Bar chart of the top_n most frequent domains
top_n = 20
top_domains = domain_counts_df.head(top_n)

plt.figure(figsize=(10, 6))
plt.bar(top_domains['domain'], top_domains['count'], color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.title(f"Top {top_n} Domains")
plt.xlabel("Domain")
plt.ylabel("Count")
plt.tight_layout()

# Save the figure before showing it.
# NOTE(review): the file name hard-codes "top20" and does not follow top_n.
# NOTE(review): processing_domain_dir is not defined in this part of the
# notebook - confirm it is created in an earlier cell.
analysis_filename = "domain_frequency_top20.png"
domain_plot_path = os.path.join(processing_domain_dir, analysis_filename)
plt.savefig(domain_plot_path)
plt.show()

## Most common domain over decades

In [None]:
def get_top_domain(group):
    # Flatten the list of domain lists for the group (decade)
    all_domains = [domain for domains in group['Domains'] for domain in domains]
    if all_domains:
        top_domain, count = Counter(all_domains).most_common(1)[0]
        return pd.Series({'top_domain': top_domain, 'count': count})
    else:
        return pd.Series({'top_domain': None, 'count': 0})

# One row per decade: the most common domain and how often it appears,
# ordered chronologically.
decade_top = (
    df.groupby('decade')
    .apply(get_top_domain)
    .reset_index()
    .sort_values('decade')
)
print(decade_top)

In [None]:
# Line chart: for each decade, the number of occurrences of that decade's
# most common domain.
plt.figure(figsize=(10, 6))
plt.plot(decade_top['decade'], decade_top['count'], marker='o', linestyle='-', color='blue')
# Annotate each point with the top domain name, placed 10 points above it.
for idx, row in decade_top.iterrows():
    plt.annotate(row['top_domain'],
                 (row['decade'], row['count']),
                 textcoords="offset points",
                 xytext=(0,10),
                 ha='center')
plt.title("Most Common Domain by Decade (Line Chart)")
plt.xlabel("Decade")
plt.ylabel("Count for Top Domain")
# One tick per decade present in the data
plt.xticks(decade_top['decade'], rotation=45)
plt.tight_layout()

# Save before show().
# NOTE(review): processing_domain_dir is not defined in this part of the
# notebook - confirm it is created in an earlier cell.
line_chart_filename = "top_domain_by_decade_line.png"
line_chart_path = os.path.join(processing_domain_dir, line_chart_filename)
plt.savefig(line_chart_path)
plt.show()


In [None]:
# Give each distinct top domain an integer code, in order of first appearance
# (dict.fromkeys de-duplicates while keeping that order), so the domain names
# can be plotted on a numeric y-axis.
unique_top_domains = list(dict.fromkeys(decade_top['top_domain']))
mapping = {domain: i for i, domain in enumerate(unique_top_domains)}
decade_top['domain_code'] = decade_top['top_domain'].map(mapping)

# =========================
#       LINE PLOT
# =========================
plt.figure(figsize=(10, 6))
plt.plot(decade_top['decade'], decade_top['domain_code'], marker='o', linestyle='-', color='blue')
plt.xticks(decade_top['decade'], rotation=45)
# Set y-axis ticks using the numeric codes and corresponding domain names.
plt.yticks(list(mapping.values()), list(mapping.keys()))
plt.xlabel("Decade")
plt.ylabel("Top Domain")
plt.title("Trend of Most Common Domain by Decade")
plt.tight_layout()

# Save and display the figure.
# NOTE(review): processing_domain_dir is not defined in this part of the
# notebook - confirm it is created in an earlier cell.
line_plot_filename = "top_domain_trend_line_plot.png"
line_plot_path = os.path.join(processing_domain_dir, line_plot_filename)
plt.savefig(line_plot_path)
plt.show()

## Graph

### Main domains co-appearances graph

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec

def create_domain_network(df, subject_main_dir):
    """
    Create and save a network visualization of domain co-occurrences

    Each distinct domain becomes a node sized by how often it appears; two
    domains are linked when they appear on the same paper, and the edge weight
    is the number of papers on which they co-occur.

    Parameters:
    df (pandas.DataFrame): DataFrame with a 'Domains' column; it is passed
        through preprocess_dataframe (defined elsewhere in the notebook) first
    subject_main_dir (str/Path): Directory to save the visualization

    Returns:
    networkx.Graph: Graph with a 'size' attribute on nodes and a 'weight'
        attribute on edges. When no domains are found an empty graph is
        returned and nothing is plotted.
    """
    # First preprocess the DataFrame
    df = preprocess_dataframe(df)

    # Tally individual domains (node sizing) and unordered domain pairs (edges)
    domain_counts = Counter()
    cooccurrence_counts = Counter()
    for domains in df['Domains']:
        domain_counts.update(domains)
        # sorted(set(...)) yields each pair once per paper, in a canonical order
        cooccurrence_counts.update(combinations(sorted(set(domains)), 2))

    G = nx.Graph()
    if not domain_counts:
        print("Warning: no domains found; skipping the co-occurrence network plot")
        return G

    # Add nodes with size normalized between 1000 and 5000
    max_count = max(domain_counts.values())
    for domain, count in domain_counts.items():
        G.add_node(domain, size=1000 + (count / max_count) * 4000)

    # Add edges with weights
    for (d1, d2), weight in cooccurrence_counts.items():
        G.add_edge(d1, d2, weight=weight)

    # Read the weights back in G.edges() order, which is the order in which
    # networkx draws them, so colours and widths line up with the edges.
    edge_weights = [G[d1][d2]['weight'] for d1, d2 in G.edges()]
    min_weight = min(edge_weights) if edge_weights else 0
    max_weight = max(edge_weights) if edge_weights else 0
    weight_span = max_weight - min_weight

    if weight_span > 0:
        # Logarithmic scale, shifted to start at 0.4, to enhance lower values
        log_span = np.log1p(weight_span)
        edge_colors = [0.4 + 0.6 * (np.log1p(w - min_weight) / log_span) for w in edge_weights]
        # Widths between 1 and 5 to maintain clarity
        edge_widths = [1 + 4 * (w - min_weight) / weight_span for w in edge_weights]
    else:
        # All edges share one weight (or there are none): use mid-range styling
        edge_colors = [0.5] * len(edge_weights)
        edge_widths = [3] * len(edge_weights)

    # Set up the figure with GridSpec: 19 columns for the network, 1 for the colorbar
    fig = plt.figure(figsize=(24, 22), facecolor='white')
    gs = gridspec.GridSpec(1, 20)
    ax_main = fig.add_subplot(gs[0, :19])
    ax_cbar = fig.add_subplot(gs[0, 19])

    # NOTE: these rcParams assignments are global and persist for figures
    # created after this call; kept as in the original notebook.
    plt.rcParams['figure.facecolor'] = 'white'
    plt.rcParams['axes.facecolor'] = 'white'
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3

    # Create layout with more spread; fixed seed keeps the picture reproducible
    pos = nx.spring_layout(G, k=1.5, iterations=100, seed=42)

    # Draw edges (skipped when no two domains ever co-occur)
    if edge_weights:
        nx.draw_networkx_edges(G, pos,
                               edge_color=edge_colors,
                               width=edge_widths,
                               edge_cmap=plt.cm.Blues,
                               alpha=0.8,
                               ax=ax_main)

    # Draw nodes
    node_sizes = [G.nodes[node]['size'] for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos,
                           node_color='lightblue',
                           node_size=node_sizes,
                           alpha=0.7,
                           linewidths=2,
                           edgecolors='white',
                           ax=ax_main)

    # Label font size grows with the square root of the node size
    label_sizes = {node: np.sqrt(size/1000) * 9 for node, size in zip(G.nodes(), node_sizes)}
    nx.draw_networkx_labels(G, pos, font_size=label_sizes, font_weight='bold', ax=ax_main)

    # Add title
    ax_main.set_title('Domain Co-occurrence Network', fontsize=24, pad=20, fontweight='bold')
    ax_main.axis('off')

    # Colorbar: ticks are evenly spaced on the colour axis and labelled with the
    # co-occurrence count obtained by inverting the log transformation above.
    tick_positions = np.linspace(0, 1, 5)
    if weight_span > 0:
        tick_values = [min_weight + np.expm1(p * np.log1p(weight_span)) for p in tick_positions]
        # Round instead of truncating so floating-point error cannot turn the
        # top tick into max_weight - 1.
        tick_labels = [f"{int(round(v))}" for v in tick_values]
    else:
        tick_labels = [f"{int(min_weight)}"] * 5

    sm = plt.cm.ScalarMappable(cmap=plt.cm.Blues, norm=plt.Normalize(0, 1))
    sm.set_array([])
    cbar = plt.colorbar(sm, cax=ax_cbar, ticks=tick_positions)
    cbar.set_ticklabels(tick_labels)
    cbar.set_label('Number of Co-occurrences', fontsize=16, fontweight='bold')

    # Adjust layout
    plt.tight_layout()

    # Save the plot
    save_plot(plt, "domain_cooccurrence_network.png", subject_main_dir, "domains")

    # Return the graph object for potential further analysis
    return G

# Function to analyze and print network statistics
def print_network_stats(G):
    """
    Print node/edge counts, density and the five nodes with the highest
    degree centrality for the given graph.
    """
    summary_lines = (
        "\nNetwork Statistics:",
        f"Number of nodes: {G.number_of_nodes()}",
        f"Number of edges: {G.number_of_edges()}",
        f"Network density: {nx.density(G):.3f}",
        "\nTop 5 domains by degree centrality:",
    )
    for line in summary_lines:
        print(line)

    # Rank node names by centrality, highest first; ties keep graph order
    centrality_by_node = nx.degree_centrality(G)
    ranked_nodes = sorted(centrality_by_node, key=centrality_by_node.get, reverse=True)
    for name in ranked_nodes[:5]:
        print(f"{name}: {centrality_by_node[name]:.3f}")

In [None]:
# Build, plot and save the domain co-occurrence network, then print its
# summary statistics.
G = create_domain_network(df, subject_main_dir)
print_network_stats(G)

### Community detection

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import community.community_louvain as community_louvain  # Updated import statement
import matplotlib.colors as mcolors

def analyze_domain_communities(df, subject_main_dir):
    """
    Analyze and visualize domain communities and cliques

    Builds the domain co-occurrence network via create_domain_network (which
    also draws and saves its own figure), detects communities with the Louvain
    method, lists maximal cliques of size 3 or more, prints a summary and
    saves a plot of the network with nodes coloured by community.

    Parameters:
    df (pandas.DataFrame): DataFrame with a 'Domains' column
    subject_main_dir (str/Path): Directory to save visualizations

    Returns:
    tuple: (G, communities, significant_cliques) where G is the networkx graph,
        communities maps node -> community id and significant_cliques is a
        list of node lists. For an empty network this is (G, {}, []).
    """
    # First get the network using our previous approach
    G = create_domain_network(df, subject_main_dir)
    if G.number_of_nodes() == 0:
        print("Warning: domain network is empty; skipping community analysis")
        return G, {}, []

    # Detect communities using Louvain method
    communities = community_louvain.best_partition(G)

    # Find all maximal cliques of size 3 or larger
    significant_cliques = [c for c in nx.find_cliques(G) if len(c) >= 3]

    # Print community and clique statistics
    n_communities = len(set(communities.values()))
    print("\nCommunity Statistics:")
    print(f"Number of communities: {n_communities}")

    print("\nLargest communities:")
    community_counts = Counter(communities.values())
    for comm_id, count in sorted(community_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
        members = [node for node, c_id in communities.items() if c_id == comm_id]
        print(f"\nCommunity {comm_id} (Size: {count}):")
        print(f"Members: {', '.join(members)}")

    print("\nClique Analysis:")
    print(f"Number of maximal cliques (size ≥ 3): {len(significant_cliques)}")
    print("\nLargest cliques:")
    for clique in sorted(significant_cliques, key=len, reverse=True)[:5]:
        print(f"Size {len(clique)}: {', '.join(clique)}")

    # Set up the figure with GridSpec: 19 columns for the network, 1 for the colorbar
    fig = plt.figure(figsize=(24, 22), facecolor='white')
    gs = gridspec.GridSpec(1, 20)
    ax_main = fig.add_subplot(gs[0, :19])
    ax_cbar = fig.add_subplot(gs[0, 19])

    # One colour per community, sampled evenly from the tab20 colormap
    community_colors = plt.cm.tab20(np.linspace(0, 1, n_communities))

    # Create layout with more spread
    pos = nx.spring_layout(G, k=1.5, iterations=100, seed=42)

    # Edge styling. Guard the degenerate cases: no edges at all, or every edge
    # having the same weight (which would otherwise divide by zero).
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    min_weight = min(edge_weights) if edge_weights else 0
    max_weight = max(edge_weights) if edge_weights else 0
    weight_span = max_weight - min_weight

    if weight_span > 0:
        log_span = np.log1p(weight_span)
        edge_colors = [0.4 + 0.6 * (np.log1p(w - min_weight) / log_span) for w in edge_weights]
        edge_widths = [1 + 4 * (w - min_weight) / weight_span for w in edge_weights]
    else:
        # Same fallback styling as create_domain_network uses
        edge_colors = [0.5] * len(edge_weights)
        edge_widths = [3] * len(edge_weights)

    if edge_weights:
        nx.draw_networkx_edges(G, pos,
                               edge_color=edge_colors,
                               width=edge_widths,
                               edge_cmap=plt.cm.Blues,
                               alpha=0.5,
                               ax=ax_main)

    # Draw nodes colored by community
    node_colors = [community_colors[communities[node]] for node in G.nodes()]
    node_sizes = [G.nodes[node]['size'] for node in G.nodes()]

    nx.draw_networkx_nodes(G, pos,
                           node_color=node_colors,
                           node_size=node_sizes,
                           alpha=0.7,
                           linewidths=2,
                           edgecolors='white',
                           ax=ax_main)

    # Label font size grows with the square root of the node size
    label_sizes = {node: np.sqrt(size/1000) * 9 for node, size in zip(G.nodes(), node_sizes)}
    nx.draw_networkx_labels(G, pos, font_size=label_sizes, font_weight='bold', ax=ax_main)

    # Add title with community information
    ax_main.set_title('Domain Co-occurrence Network\nColored by Communities',
                      fontsize=24, pad=20, fontweight='bold')
    ax_main.axis('off')

    # Colorbar for edge weights: tick labels invert the log transformation above
    sm = plt.cm.ScalarMappable(cmap=plt.cm.Blues, norm=plt.Normalize(0, 1))
    sm.set_array([])

    tick_positions = np.linspace(0, 1, 5)
    if weight_span > 0:
        tick_values = [min_weight + np.expm1(p * np.log1p(weight_span)) for p in tick_positions]
        # Round instead of truncating so the top tick shows max_weight
        tick_labels = [f"{int(round(v))}" for v in tick_values]
    else:
        tick_labels = [f"{int(min_weight)}"] * 5

    cbar = plt.colorbar(sm, cax=ax_cbar, ticks=tick_positions)
    cbar.set_ticklabels(tick_labels)
    cbar.set_label('Number of Co-occurrences', fontsize=16, fontweight='bold')

    plt.tight_layout()

    # Save the community visualization
    save_plot(plt, "domain_communities_network.png", subject_main_dir, "domains")

    return G, communities, significant_cliques

# Additional helper function for detailed community analysis
def analyze_community_characteristics(G, communities):
    """
    Summarize each community's members and edge structure.

    Parameters:
    G: Graph whose edges(data='weight') yields (u, v, weight) triples
    communities (dict): Mapping of node -> community id

    Returns:
    dict: community id -> {'nodes', 'internal_edges', 'external_edges',
        'total_weight'}; 'total_weight' sums the weights of internal edges only
    """
    stats_by_community = {}

    # Register every node under its community, creating the record on first sight
    for member, cid in communities.items():
        record = stats_by_community.setdefault(cid, {
            'nodes': [],
            'internal_edges': 0,
            'external_edges': 0,
            'total_weight': 0,
        })
        record['nodes'].append(member)

    # Classify each edge as internal (same community) or external (crossing)
    for src, dst, weight in G.edges(data='weight'):
        src_comm, dst_comm = communities[src], communities[dst]
        if src_comm != dst_comm:
            # A crossing edge counts once for each community it touches
            stats_by_community[src_comm]['external_edges'] += 1
            stats_by_community[dst_comm]['external_edges'] += 1
            continue
        bucket = stats_by_community[src_comm]
        bucket['internal_edges'] += 1
        bucket['total_weight'] += weight

    return stats_by_community

In [None]:
# Detect communities and cliques in the domain network (this also rebuilds and
# re-plots the base network), then compute per-community edge statistics.
G, communities, significant_cliques = analyze_domain_communities(df, subject_main_dir)
community_stats = analyze_community_characteristics(G, communities)

### Temporal graph analysis - domain trends over decades

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import community.community_louvain as community_louvain

def analyze_temporal_domains(df, subject_main_dir):
    """
    Analyze and visualize domain relationships across different decades

    For every decade a domain co-occurrence graph is built (nodes are domains,
    edge weights count the papers on which two domains co-occur), drawn in its
    own subplot, and summarized. Two figures are saved: the per-decade
    networks and the trend of the network metrics.

    Parameters:
    df (pandas.DataFrame): DataFrame with 'Domains' and 'Publication Year'
        columns; it is passed through preprocess_dataframe (defined elsewhere
        in the notebook) first
    subject_main_dir (str/Path): Directory to save visualizations

    Returns:
    dict: Lists with one entry per decade, in ascending decade order, under the
        keys 'n_nodes', 'n_edges', 'density', 'avg_clustering' and
        'top_domains' (up to five (domain, degree centrality) pairs).
    """
    # First preprocess the DataFrame
    df = preprocess_dataframe(df)

    # Create decade bins.
    # NOTE(review): this adds a 'Decade' column to whatever preprocess_dataframe
    # returned; if that is the caller's own frame, the caller sees the column.
    df['Decade'] = (df['Publication Year'] // 10) * 10
    decades = sorted(df['Decade'].unique())

    # Store network metrics for each decade
    temporal_metrics = {
        'n_nodes': [],
        'n_edges': [],
        'density': [],
        'avg_clustering': [],
        'top_domains': []
    }

    n_decades = len(decades)
    if n_decades == 0:
        # Nothing to plot; also avoids a zero-column subplot grid below
        print("Warning: no decades found; skipping temporal domain analysis")
        return temporal_metrics

    # Grid of subplots, at most three per row
    n_cols = min(3, n_decades)
    n_rows = (n_decades + n_cols - 1) // n_cols

    plt.figure(figsize=(8*n_cols, 8*n_rows))

    # Process each decade
    for idx, decade in enumerate(decades):
        decade_data = df[df['Decade'] == decade]

        # Tally domains and unordered domain pairs for this decade
        domain_counts = Counter()
        cooccurrence_counts = Counter()
        for domains in decade_data['Domains']:
            domain_counts.update(domains)
            cooccurrence_counts.update(combinations(sorted(set(domains)), 2))

        # Create network
        G = nx.Graph()

        # Add nodes with size based on frequency
        max_count = max(domain_counts.values()) if domain_counts else 1
        for domain, count in domain_counts.items():
            G.add_node(domain, size=1000 + (count / max_count) * 4000)

        # Add edges with weights
        for (d1, d2), weight in cooccurrence_counts.items():
            G.add_edge(d1, d2, weight=weight)

        has_nodes = G.number_of_nodes() > 0

        # Calculate metrics. average_clustering divides by the node count, so
        # an empty decade graph is reported as 0.0 instead of raising.
        temporal_metrics['n_nodes'].append(G.number_of_nodes())
        temporal_metrics['n_edges'].append(G.number_of_edges())
        temporal_metrics['density'].append(nx.density(G))
        temporal_metrics['avg_clustering'].append(nx.average_clustering(G) if has_nodes else 0.0)

        # Get top domains by degree centrality
        deg_cent = nx.degree_centrality(G)
        top_domains = sorted(deg_cent.items(), key=lambda x: x[1], reverse=True)[:5]
        temporal_metrics['top_domains'].append(top_domains)

        # Create subplot for this decade
        ax = plt.subplot(n_rows, n_cols, idx + 1)

        # Seeded layout so repeated runs give the same picture, matching the
        # other network plots in this notebook
        pos = nx.spring_layout(G, k=1.5, iterations=50, seed=42)

        # Draw edges with weight-based colors
        edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
        if edge_weights:
            max_weight = max(edge_weights)
            min_weight = min(edge_weights)
            weight_span = max_weight - min_weight
            if weight_span > 0:
                log_span = np.log1p(weight_span)
                edge_colors = [0.4 + 0.6 * (np.log1p(w - min_weight) / log_span)
                               for w in edge_weights]
            else:
                # All edges share one weight: avoid 0/0 producing NaN colours
                edge_colors = [0.5] * len(edge_weights)

            nx.draw_networkx_edges(G, pos,
                                   edge_color=edge_colors,
                                   edge_cmap=plt.cm.Blues,
                                   alpha=0.5,
                                   width=1,
                                   ax=ax)

        if has_nodes:
            # Draw nodes, scaled down to fit the smaller subplot
            node_sizes = [G.nodes[node]['size'] for node in G.nodes()]
            nx.draw_networkx_nodes(G, pos,
                                   node_color='lightblue',
                                   node_size=[s/4 for s in node_sizes],
                                   alpha=0.7,
                                   linewidths=1,
                                   edgecolors='white',
                                   ax=ax)

            # Label only the three most central domains for clarity
            top_domain_names = [d[0] for d in top_domains[:3]]
            labels = {node: node if node in top_domain_names else '' for node in G.nodes()}
            nx.draw_networkx_labels(G, pos, labels, font_size=8, font_weight='bold', ax=ax)

        ax.set_title(f'{decade}s\nNodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}')
        ax.axis('off')

    plt.tight_layout()
    save_plot(plt, "temporal_domain_networks.png", subject_main_dir, "domains")

    # Create trend visualization
    _, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    # Plot network metrics trends
    ax1.plot(decades, temporal_metrics['n_nodes'], 'o-', label='Number of Domains')
    ax1.plot(decades, temporal_metrics['n_edges'], 's-', label='Number of Co-Appearnces')
    ax1.set_xlabel('Decade')
    ax1.set_ylabel('Count')
    ax1.set_title('Network Size Evolution')
    ax1.legend()
    ax1.grid(True)

    # Plot network characteristics trends
    ax2.plot(decades, temporal_metrics['density'], 'o-', label='Network Density')
    ax2.plot(decades, temporal_metrics['avg_clustering'], 's-', label='Average Clustering')
    ax2.set_xlabel('Decade')
    ax2.set_ylabel('Value')
    ax2.set_title('Network Characteristics Evolution')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    save_plot(plt, "temporal_domain_metrics.png", subject_main_dir, "domains")

    return temporal_metrics

def print_temporal_analysis(temporal_metrics, decades):
    """
    Print detailed analysis of temporal trends

    Parameters:
    temporal_metrics (dict): Lists under 'n_nodes', 'n_edges', 'density',
        'avg_clustering' and 'top_domains', each aligned by position with
        ``decades``
    decades (iterable): Decade labels in the same order as the metric lists;
        any iterable works (list, tuple, numpy array, ...)
    """
    print("\nTemporal Analysis of Domain Networks:")
    print("\nNetwork Size Evolution:")
    per_decade = zip(decades, temporal_metrics['n_nodes'], temporal_metrics['n_edges'])
    # Use the running position rather than decades.index(...): it avoids a
    # linear search per lookup and does not require `decades` to be a list.
    for position, (decade, nodes, edges) in enumerate(per_decade):
        print(f"\n{decade}s:")
        print(f"  Nodes: {nodes}")
        print(f"  Edges: {edges}")
        print(f"  Density: {temporal_metrics['density'][position]:.3f}")
        print(f"  Avg Clustering: {temporal_metrics['avg_clustering'][position]:.3f}")
        print("  Top Domains:")
        # Only the three highest-ranked domains are shown
        for domain, centrality in temporal_metrics['top_domains'][position][:3]:
            print(f"    - {domain}: {centrality:.3f}")

In [None]:
# Build the per-decade domain networks and print their metrics.
temporal_metrics = analyze_temporal_domains(df, subject_main_dir)
# analyze_temporal_domains does not return its decade list, so it is rebuilt
# here with the same floor-to-decade formula.
# NOTE(review): this uses the raw df while the function bins the output of
# preprocess_dataframe(df); if preprocessing changes the set of publication
# years, the labels will not line up with the metrics - confirm.
decades = sorted(df['Publication Year'].apply(lambda x: (x // 10) * 10).unique())
print_temporal_analysis(temporal_metrics, decades)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import community.community_louvain as community_louvain
import seaborn as sns
from itertools import combinations

def safe_centrality_calculation(G, calc_function, default_value=None):
    """
    Safely calculate centrality metrics with error handling

    Parameters:
    G: Graph exposing number_of_edges() and nodes()
    calc_function (callable): Called as calc_function(G); expected to return
        the centrality mapping
    default_value: Value returned when calc_function fails; when None, a
        mapping of every node to 0 is returned instead

    Returns:
    The result of calc_function(G); a node -> 0 mapping when the graph has no
    edges; otherwise the fallback described above when the calculation fails.
    """
    if G.number_of_edges() == 0:
        return {node: 0 for node in G.nodes()}
    try:
        return calc_function(G)
    except Exception as exc:
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit still propagate, and report the failure instead of
        # hiding it.
        print(f"Warning: centrality calculation failed ({exc}); using fallback values")
        if default_value is None:
            return {node: 0 for node in G.nodes()}
        return default_value

def analyze_temporal_centrality_communities(df, subject_main_dir):
    """
    Analyze central domains and community evolution over decades with edge case handling

    For every decade a domain co-occurrence graph is built (one node per
    domain, one weighted edge per pair of domains listed on the same record),
    centrality metrics and Louvain communities are computed, and two line
    charts are saved through ``save_plot``.

    Parameters:
    df (pandas.DataFrame): Records with 'Publication Year' and 'Domains'
        columns; passed through ``preprocess_dataframe`` first (presumably
        that turns 'Domains' into lists - verify against its definition)
    subject_main_dir (str/Path): Directory handed to ``save_plot``

    Returns:
    tuple: (centrality_scores, community_evolution, persistent_communities)
        - centrality_scores: {domain: {decade: {'degree', 'betweenness', 'eigenvector'}}}
        - community_evolution: {decade: {'communities', 'modularity', 'num_communities', 'sizes'}}
        - persistent_communities: {domain: [(decade, community_id), ...]}
    """
    df = preprocess_dataframe(df)
    df['Decade'] = (df['Publication Year'] // 10) * 10
    decades = sorted(df['Decade'].unique())

    # Store metrics
    community_evolution = defaultdict(dict)
    persistent_communities = defaultdict(list)
    all_top_domains = set()
    centrality_scores = defaultdict(dict)

    # Analyze each decade
    for decade in decades:
        print(f"\nProcessing decade: {decade}s")  # Debug information
        decade_data = df[df['Decade'] == decade]

        # Count domains and unordered domain pairs. combinations() over a
        # sorted set already yields each pair in canonical (sorted) order.
        domain_counts = Counter()
        cooccurrence_counts = Counter()
        for domains in decade_data['Domains']:
            domain_counts.update(domains)
            cooccurrence_counts.update(combinations(sorted(set(domains)), 2))

        # Create network: nodes first so isolated domains are kept
        G = nx.Graph()
        G.add_nodes_from(domain_counts)
        for (d1, d2), weight in cooccurrence_counts.items():
            G.add_edge(d1, d2, weight=weight)

        print(f"Network for {decade}s - Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")

        # Skip empty graphs (edgeless graphs are still processed below)
        if G.number_of_nodes() == 0:
            print(f"Warning: Empty graph for decade {decade}")
            community_evolution[decade] = {
                'communities': {},
                'modularity': 0,
                'num_communities': 0,
                'sizes': Counter()
            }
            continue

        # Calculate centrality metrics with error handling
        degree_cent = nx.degree_centrality(G)
        betweenness_cent = safe_centrality_calculation(G, nx.betweenness_centrality)
        eigenvector_cent = safe_centrality_calculation(
            G,
            lambda g: nx.eigenvector_centrality(g, max_iter=1000)
        )

        # Store top domains (five highest degree centralities of this decade)
        top_domains = sorted(degree_cent.items(), key=lambda x: x[1], reverse=True)[:5]
        all_top_domains.update(domain for domain, _ in top_domains)

        # Store centrality scores
        for domain in G.nodes():
            centrality_scores[domain][decade] = {
                'degree': degree_cent.get(domain, 0),
                'betweenness': betweenness_cent.get(domain, 0),
                'eigenvector': eigenvector_cent.get(domain, 0)
            }

        # Community detection with error handling
        try:
            if G.number_of_edges() > 0:
                communities = community_louvain.best_partition(G)
                modularity = community_louvain.modularity(communities, G)
            else:
                # For graphs without edges, each node is its own community
                communities = {node: idx for idx, node in enumerate(G.nodes())}
                modularity = 0
                print(f"Warning: No edges in graph for decade {decade}, setting default community values")

            community_evolution[decade] = {
                'communities': communities,
                'modularity': modularity,
                'num_communities': len(set(communities.values())),
                'sizes': Counter(communities.values())
            }

            # Track community membership
            for domain, comm_id in communities.items():
                persistent_communities[domain].append((decade, comm_id))

        except Exception as e:
            # Best-effort fallback: treat the whole decade as one community.
            print(f"Warning: Community detection failed for decade {decade}: {str(e)}")
            community_evolution[decade] = {
                'communities': {node: 0 for node in G.nodes()},
                'modularity': 0,
                'num_communities': 1,
                'sizes': Counter([0] * G.number_of_nodes())
            }

    # Visualize centrality evolution for top domains.
    # The candidates are sorted so the (at most) ten plotted domains and their
    # legend order do not depend on set iteration order between runs.
    plt.figure(figsize=(15, 10))
    for domain in sorted(all_top_domains, key=str)[:10]:
        decades_present = []
        centrality_values = []
        for decade in decades:
            # Membership tests avoid creating entries in the defaultdict.
            if domain in centrality_scores and decade in centrality_scores[domain]:
                decades_present.append(decade)
                centrality_values.append(centrality_scores[domain][decade]['degree'])
        if decades_present:
            plt.plot(decades_present, centrality_values, 'o-', label=domain)

    plt.title('Evolution of Domain Centrality Over Time')
    plt.xlabel('Decade')
    plt.ylabel('Degree Centrality')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    save_plot(plt, "domain_centrality_evolution.png", subject_main_dir, "domains")

    # Visualize community evolution (only for decades with valid data)
    plt.figure(figsize=(15, 8))
    valid_decades = []
    modularity_values = []
    num_communities = []

    for decade in decades:
        if community_evolution[decade]['num_communities'] > 0:
            valid_decades.append(decade)
            modularity_values.append(community_evolution[decade]['modularity'])
            num_communities.append(community_evolution[decade]['num_communities'])

    if valid_decades:  # Only plot if we have valid data
        # Every collected count is > 0, so the maximum is a safe divisor.
        largest_count = max(num_communities)
        plt.plot(valid_decades, modularity_values, 'o-', label='Modularity')
        plt.plot(valid_decades,
                 [n / largest_count for n in num_communities],
                 's-',
                 label='Normalized Number of Communities')
        plt.title('Evolution of Community Structure')
        plt.xlabel('Decade')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        save_plot(plt, "community_evolution.png", subject_main_dir, "domains")

    return centrality_scores, community_evolution, persistent_communities

def print_temporal_centrality_analysis(centrality_scores, community_evolution, persistent_communities, decades):
    """
    Print detailed analysis of temporal patterns with edge case handling

    Parameters:
    centrality_scores (dict): {domain: {decade: {'degree': float, ...}}}
    community_evolution (dict): {decade: {'modularity', 'num_communities', 'sizes', ...}};
        decades missing from this mapping are reported as having no community structure
    persistent_communities (dict): {domain: [(decade, community_id), ...]}
    decades (sequence): Decades to report on, in print order
    """
    print("\nTemporal Analysis of Domain Centrality and Communities:")

    if not centrality_scores:
        print("No valid centrality scores found for any decade.")
        return

    # Analyze domain persistence: (number of decades present, mean degree centrality)
    print("\nMost Persistent Central Domains:")
    domain_persistence = {
        domain: (
            len(per_decade),
            np.mean([scores['degree'] for scores in per_decade.values()]),
        )
        for domain, per_decade in centrality_scores.items()
        if per_decade  # Only include domains that appear in at least one decade
    }

    if domain_persistence:
        ranked = sorted(
            domain_persistence.items(),
            key=lambda x: (x[1][0], x[1][1]),
            reverse=True
        )
        for domain, (appearances, avg_cent) in ranked[:10]:
            print(f"{domain}: Present in {appearances} decades, Avg. Centrality: {avg_cent:.3f}")
    else:
        print("No persistent domains found.")

    # Analyze community stability
    print("\nCommunity Structure Evolution:")
    for decade in decades:
        # .get() instead of [] so a decade absent from the mapping neither
        # raises KeyError nor inserts an empty entry into a defaultdict.
        info = community_evolution.get(decade) or {}
        community_total = info.get('num_communities', 0)
        print(f"\n{decade}s:")
        print(f"Number of communities: {community_total}")
        if community_total > 0:
            print(f"Modularity: {info['modularity']:.3f}")

            # Find largest communities
            top_communities = sorted(info['sizes'].items(), key=lambda x: x[1], reverse=True)[:3]
            print("Largest communities sizes:", [size for _, size in top_communities])
        else:
            print("No valid community structure for this decade")

    # Analyze stable domain groups
    if persistent_communities:
        print("\nStable Domain Groups (domains that frequently appear together in communities):")
        # Domains are grouped by their exact sequence of community ids.
        stable_groups = defaultdict(list)
        for domain, history in persistent_communities.items():
            if len(history) >= len(decades) * 0.5:  # Present in at least half of the decades
                community_pattern = tuple(comm_id for _, comm_id in history)
                stable_groups[community_pattern].append(domain)

        if stable_groups:
            largest_groups = sorted(stable_groups.values(), key=len, reverse=True)[:5]
            for domains in largest_groups:
                print(f"\nStable group with {len(domains)} domains:")
                print(", ".join(str(domain) for domain in domains))
        else:
            print("No stable domain groups found")
    else:
        print("\nNo persistent communities found")

In [None]:
# Run the per-decade centrality/community analysis, then print a report for
# every decade (publication year floored to a multiple of ten) in the dataset.
(
    centrality_scores,
    community_evolution,
    persistent_communities,
) = analyze_temporal_centrality_communities(df, subject_main_dir)
decades = sorted(df['Publication Year'].map(lambda year: year // 10 * 10).unique())
print_temporal_centrality_analysis(
    centrality_scores,
    community_evolution,
    persistent_communities,
    decades,
)

# Graph analysis

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import ast

def preprocess_column(df, column_name):
    """
    Preprocess a column to ensure it contains lists.
    For concept_dict, extract keys as a list.

    Conversion rules, applied per cell:
    - 'concept_dict' column: a dict (or the string repr of one) becomes the
      list of its keys; anything else becomes []
    - string shaped like '[...]': parsed with ast.literal_eval
    - any other string: split on commas, whitespace stripped, empty tokens dropped
    - list: kept as is
    - missing scalar (None / NaN / NA): []
    - anything else: [str(value)]
    Cells that fail to parse become [].

    Parameters:
    df (pandas.DataFrame): The DataFrame
    column_name (str): Name of the column to process

    Returns:
    pandas.Series: Series with list values, sharing df's index
    """
    def _to_list(value):
        # Handle concept_dict specially - extract keys
        if column_name == 'concept_dict':
            parsed = ast.literal_eval(value) if isinstance(value, str) else value
            # NaN/None or a non-dict literal has no keys to extract.
            return list(parsed.keys()) if isinstance(parsed, dict) else []

        if isinstance(value, str):
            # Handle string representations of lists
            if value.startswith('[') and value.endswith(']'):
                return ast.literal_eval(value)
            # Handle comma-separated strings; skip empty tokens so '' or
            # 'a,,b' never yields an empty-string item.
            stripped_tokens = (token.strip() for token in value.split(','))
            return [token for token in stripped_tokens if token]

        # Handle already-list values
        if isinstance(value, list):
            return value

        # Missing values must not turn into a literal 'nan'/'None' item.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []

        # Handle other cases
        return [str(value)]

    # Collect values positionally and build the Series once: label-based
    # assignment (series[idx] = ...) is ambiguous when df.index has
    # duplicate labels.
    converted = []
    for value in df[column_name]:
        try:
            converted.append(_to_list(value))
        except (ValueError, SyntaxError, TypeError):
            # Default to empty list for problematic entries
            converted.append([])

    return pd.Series(converted, index=df.index, dtype=object)

def create_network_visualization(df, column_name, subject_main_dir, top_n=None, min_occurrences=2):
    """
    Create and save a network visualization of item co-occurrences
    for any list column (including concept_dict)

    Nodes are the items of ``column_name`` (sized by how often they occur);
    an edge joins two items that appear on the same record and is weighted
    by the number of such records.

    Parameters:
    df (pandas.DataFrame): The DataFrame
    column_name (str): Name of the column to visualize
    subject_main_dir (str/Path): Directory to save the visualization
    top_n (int, optional): Limit to top N most frequent items
    min_occurrences (int): Minimum number of occurrences to include an item

    Returns:
    networkx.Graph: The network graph object; nodes carry 'size' and 'count'
    attributes, edges carry 'weight'
    """
    # Get subject type from column name (first '_'-separated token, pluralised)
    subject_type = column_name.lower().split('_')[0]
    if subject_type == 'concept':
        subject_type = 'concepts'
    elif not subject_type.endswith('s'):
        subject_type = f"{subject_type}s"

    # Preprocess the column to get lists
    items_series = preprocess_column(df, column_name)

    # Count item occurrences (node sizing) and unordered pair co-occurrences.
    # combinations() over a sorted set already yields each pair in sorted order.
    item_counts = Counter()
    cooccurrence_counts = Counter()
    for items in items_series:
        if not items:  # Skip empty lists
            continue
        item_counts.update(items)
        cooccurrence_counts.update(combinations(sorted(set(items)), 2))

    # Filter items by minimum occurrences
    item_counts = {item: count for item, count in item_counts.items()
                   if count >= min_occurrences}

    # Optionally filter to top N items
    if top_n is not None and len(item_counts) > top_n:
        top_items = {item for item, _ in Counter(item_counts).most_common(top_n)}
        item_counts = {item: count for item, count in item_counts.items()
                       if item in top_items}

    # Create network
    G = nx.Graph()

    # Add nodes with size based on frequency
    max_count = max(item_counts.values()) if item_counts else 1
    for item, count in item_counts.items():
        # Node size grows linearly with frequency, up to 5000
        node_size = 1000 + (count / max_count) * 4000
        G.add_node(item, size=node_size, count=count)

    # Add weighted edges, but only between items that survived the filters
    edge_weights = []
    for (i1, i2), weight in cooccurrence_counts.items():
        if i1 in G.nodes and i2 in G.nodes:
            G.add_edge(i1, i2, weight=weight)
            edge_weights.append(weight)

    # Calculate edge width and color ranges
    if edge_weights:
        max_weight = max(edge_weights)
        min_weight = min(edge_weights)
    else:
        max_weight = min_weight = 1
    has_weight_range = max_weight > min_weight

    # Per-edge colour value (log scale, so low weights stay distinguishable)
    # and line width (linear, 1 to 5).
    # NOTE(review): draw_networkx_edges is called without edge_vmin/edge_vmax,
    # so these values are presumably rescaled to the full colormap - confirm.
    edge_colors = []
    edge_widths = []
    for i1, i2 in G.edges():
        weight = G[i1][i2]['weight']
        if has_weight_range:
            color_val = 0.4 + 0.6 * (np.log1p(weight - min_weight) / np.log1p(max_weight - min_weight))
            width = 1 + 4 * (weight - min_weight) / (max_weight - min_weight)
        else:
            color_val = 0.5
            width = 3
        edge_colors.append(color_val)
        edge_widths.append(width)

    # Set up the figure with GridSpec
    fig = plt.figure(figsize=(24, 22), facecolor='white')
    gs = gridspec.GridSpec(1, 20)  # 1 row, 20 columns for fine control

    # Main plot area (using 19 columns)
    ax_main = fig.add_subplot(gs[0, :19])

    # Colorbar area (using 1 column)
    ax_cbar = fig.add_subplot(gs[0, 19])

    # Set custom style parameters
    # NOTE: these are process-wide matplotlib settings and stay in effect
    # for plots created after this function returns.
    plt.rcParams['figure.facecolor'] = 'white'
    plt.rcParams['axes.facecolor'] = 'white'
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3

    # Create layout with more spread (fixed seed for a reproducible layout)
    pos = nx.spring_layout(G, k=1.5, iterations=100, seed=42)

    # Draw edges
    nx.draw_networkx_edges(G, pos,
                           edge_color=edge_colors,
                           width=edge_widths,
                           edge_cmap=plt.cm.Blues,
                           alpha=0.8,
                           ax=ax_main)

    # Draw nodes
    node_sizes = [G.nodes[node]['size'] for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos,
                           node_color='lightblue',
                           node_size=node_sizes,
                           alpha=0.7,
                           linewidths=2,
                           edgecolors='white',
                           ax=ax_main)

    # Add labels with slightly reduced size to prevent overlap
    label_sizes = {node: np.sqrt(size / 1000) * 9 for node, size in zip(G.nodes(), node_sizes)}
    nx.draw_networkx_labels(G, pos, font_size=label_sizes, font_weight='bold', ax=ax_main)

    # Add title - make it more descriptive based on column
    title = f"{column_name.replace('_', ' ').title()} Co-occurrence Network"
    if top_n:
        title += f" (Top {top_n})"
    ax_main.set_title(title, fontsize=24, pad=20, fontweight='bold')
    ax_main.axis('off')

    # Colorbar: five evenly spaced ticks, labelled with the weight that the
    # inverse of the log transformation above maps to each position.
    tick_positions = np.linspace(0, 1, 5)
    if has_weight_range:
        tick_values = [min_weight + (np.exp(p * np.log1p(max_weight - min_weight)) - 1) for p in tick_positions]
        tick_labels = [f"{int(v)}" for v in tick_values]
    else:
        tick_labels = [f"{int(min_weight)}"] * 5

    norm = plt.Normalize(0, 1)  # We'll use our custom mapping
    sm = plt.cm.ScalarMappable(cmap=plt.cm.Blues, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, cax=ax_cbar, ticks=tick_positions)
    cbar.set_ticklabels(tick_labels)
    cbar.set_label('Number of Co-occurrences', fontsize=16, fontweight='bold')

    # Adjust layout
    plt.tight_layout()

    # Create filename from column name
    filename = f"{column_name.lower()}_cooccurrence_network"
    if top_n:
        filename += f"_top_{top_n}"
    filename += ".png"

    # Save the plot
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Return the graph object for potential further analysis
    return G

def get_network_stats(G):
    """
    Summarise a graph as a dictionary of basic statistics.

    Parameters:
    G (networkx.Graph): The network graph object

    Returns:
    dict: 'nodes', 'edges', 'density', 'top_by_degree' (up to five
    {"node", "centrality"} entries, highest degree centrality first)
    and 'avg_degree' (0 for a graph without nodes)
    """
    node_total = G.number_of_nodes()
    edge_total = G.number_of_edges()

    # Rank all nodes by degree centrality, best first
    ranked_nodes = sorted(
        nx.degree_centrality(G).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    stats = {
        "nodes": node_total,
        "edges": edge_total,
        "density": nx.density(G),
        "top_by_degree": [
            {"node": name, "centrality": score}
            for name, score in ranked_nodes[:5]
        ],
    }

    # Every edge contributes to the degree of two nodes
    stats["avg_degree"] = 2 * edge_total / node_total if node_total > 0 else 0

    return stats

def print_network_stats(G):
    """
    Print the statistics computed by get_network_stats for a graph.

    Parameters:
    G (networkx.Graph): The network graph object
    """
    stats = get_network_stats(G)

    # Headline figures first
    summary = [
        "\nNetwork Statistics:",
        f"Number of nodes: {stats['nodes']}",
        f"Number of edges: {stats['edges']}",
        f"Network density: {stats['density']:.3f}",
        f"Average degree: {stats['avg_degree']:.3f}",
    ]
    print(*summary, sep="\n")

    # Then one line per top-ranked node
    print("\nTop 5 items by degree centrality:")
    for entry in stats["top_by_degree"]:
        name, score = entry['node'], entry['centrality']
        print(f"{name}: {score:.3f}")

## Community detection and analysis

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import community.community_louvain as community_louvain
import matplotlib.colors as mcolors

def analyze_network_communities(G, subject_main_dir, subject_type, min_clique_size=3, top_communities=5, top_cliques=5):
    """
    Analyze and visualize communities and cliques in any network graph

    Communities come from the Louvain method, cliques from maximal-clique
    enumeration. A summary is printed, a community-coloured network plot is
    saved through ``save_plot``, and per-community statistics are computed
    with ``analyze_community_characteristics``.

    Parameters:
    G (networkx.Graph): The network graph to analyze; edges without a
        'weight' attribute are treated as weight 1, and a per-node 'size'
        attribute is used for drawing when every node has one
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts", "fields")
    min_clique_size (int): Minimum size for significant cliques
    top_communities (int): Number of top communities to report
    top_cliques (int): Number of top cliques to report

    Returns:
    tuple: (communities dict, significant cliques list, community stats dict)
    """
    # Detect communities using Louvain method
    communities = community_louvain.best_partition(G)

    # Find all maximal cliques of specified minimum size or larger
    significant_cliques = [c for c in nx.find_cliques(G) if len(c) >= min_clique_size]

    # Print community and clique statistics
    n_communities = len(set(communities.values()))
    print(f"\n{subject_type.title()} Community Statistics:")
    print(f"Number of communities: {n_communities}")

    print(f"\nLargest {subject_type} communities:")
    community_counts = Counter(communities.values())
    for comm_id, count in sorted(community_counts.items(), key=lambda x: x[1], reverse=True)[:top_communities]:
        # str() so graphs with non-string node labels can be reported too
        members = ', '.join(str(node) for node, c_id in communities.items() if c_id == comm_id)
        print(f"\nCommunity {comm_id} (Size: {count}):")
        print(f"Members: {members}")

    print(f"\n{subject_type.title()} Clique Analysis:")
    print(f"Number of maximal cliques (size ≥ {min_clique_size}): {len(significant_cliques)}")
    print("\nLargest cliques:")
    for clique in sorted(significant_cliques, key=len, reverse=True)[:top_cliques]:
        clique_members = ', '.join(str(node) for node in clique)
        print(f"Size {len(clique)}: {clique_members}")

    # Create visualization with communities
    # Set up the figure with GridSpec (19 columns for the graph, 1 for the colorbar)
    fig = plt.figure(figsize=(24, 22), facecolor='white')
    gs = gridspec.GridSpec(1, 20)
    ax_main = fig.add_subplot(gs[0, :19])
    ax_cbar = fig.add_subplot(gs[0, 19])

    # Generate colors for communities (one row per community id)
    community_colors = plt.cm.tab20(np.linspace(0, 1, n_communities))

    # Create layout with more spread (fixed seed for a reproducible layout)
    pos = nx.spring_layout(G, k=1.5, iterations=100, seed=42)

    # Draw edges with style based on weight
    has_edges = G.number_of_edges() > 0
    if has_edges:
        # Missing weights default to 1, matching analyze_community_characteristics.
        edge_weights = [data.get('weight', 1) for _, _, data in G.edges(data=True)]
        max_weight = max(edge_weights)
        min_weight = min(edge_weights)
        has_weight_range = max_weight > min_weight

        edge_colors = []
        edge_widths = []
        for weight in edge_weights:
            if has_weight_range:
                # Log scale for colour, linear 1-5 for width
                color_val = 0.4 + 0.6 * (np.log1p(weight - min_weight) / np.log1p(max_weight - min_weight))
                width = 1 + 4 * (weight - min_weight) / (max_weight - min_weight)
            else:
                color_val = 0.5
                width = 3
            edge_colors.append(color_val)
            edge_widths.append(width)

        nx.draw_networkx_edges(G, pos,
                               edge_color=edge_colors,
                               width=edge_widths,
                               edge_cmap=plt.cm.Blues,
                               alpha=0.5,
                               ax=ax_main)

    # Draw nodes colored by community
    node_colors = [community_colors[communities[node]] for node in G.nodes()]

    # Use the 'size' attribute when every node has one, otherwise fall back
    # to sizes derived from node degree (checked once, used for nodes and labels).
    nodes_have_size = all('size' in G.nodes[node] for node in G.nodes())
    if nodes_have_size:
        node_sizes = [G.nodes[node]['size'] for node in G.nodes()]
        label_sizes = {node: np.sqrt(size / 1000) * 9 for node, size in zip(G.nodes(), node_sizes)}
    else:
        node_sizes = [300 + 100 * G.degree(node) for node in G.nodes()]
        label_sizes = {node: 9 + 2 * np.sqrt(G.degree(node)) for node in G.nodes()}

    nx.draw_networkx_nodes(G, pos,
                           node_color=node_colors,
                           node_size=node_sizes,
                           alpha=0.7,
                           linewidths=2,
                           edgecolors='white',
                           ax=ax_main)

    nx.draw_networkx_labels(G, pos, font_size=label_sizes, font_weight='bold', ax=ax_main)

    # Add title with community information
    title = f"{subject_type.title()} Co-occurrence Network\nColored by Communities"
    ax_main.set_title(title, fontsize=24, pad=20, fontweight='bold')
    ax_main.axis('off')

    # Add colorbar for edge weights if edges exist
    if has_edges:
        norm = plt.Normalize(0, 1)
        sm = plt.cm.ScalarMappable(cmap=plt.cm.Blues, norm=norm)
        sm.set_array([])

        # Tick labels invert the log transformation used for the edge colours
        tick_positions = np.linspace(0, 1, 5)
        if has_weight_range:
            tick_values = [min_weight + (np.exp(p * np.log1p(max_weight - min_weight)) - 1)
                           for p in tick_positions]
        else:
            tick_values = [min_weight] * 5
        tick_labels = [f"{int(v)}" for v in tick_values]

        cbar = plt.colorbar(sm, cax=ax_cbar, ticks=tick_positions)
        cbar.set_ticklabels(tick_labels)
        cbar.set_label('Number of Co-occurrences', fontsize=16, fontweight='bold')
    else:
        # Nothing to show: hide the otherwise empty colorbar axes
        ax_cbar.axis('off')

    plt.tight_layout()

    # Create filename based on subject type
    filename = f"{subject_type.lower()}_communities_network.png"

    # Save the community visualization
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Calculate detailed community statistics
    community_stats = analyze_community_characteristics(G, communities)

    return communities, significant_cliques, community_stats

def analyze_community_characteristics(G, communities):
    """
    Compute per-community structure statistics.

    Parameters:
    G (networkx.Graph): The network graph
    communities (dict): Community assignment dictionary ({node: community id})

    Returns:
    dict: {community id: stats} where stats holds 'nodes', 'internal_edges',
    'external_edges', 'total_weight' (summed over internal edges only),
    'cohesion', 'size' and 'central_nodes'
    """
    community_stats = {}

    # Group nodes by community, creating each record on first sight
    for member, community in communities.items():
        record = community_stats.setdefault(community, {
            'nodes': [],
            'internal_edges': 0,
            'external_edges': 0,
            'total_weight': 0,
            'cohesion': 0,
            'size': 0,
            'central_nodes': [],
        })
        record['nodes'].append(member)
        record['size'] = len(record['nodes'])

    # Classify every edge as internal (both ends in one community) or
    # external (counted once for each of the two communities it touches)
    for source, target, attrs in G.edges(data=True):
        edge_weight = attrs.get('weight', 1)  # unweighted edges count as 1
        source_comm = communities[source]
        target_comm = communities[target]
        if source_comm != target_comm:
            community_stats[source_comm]['external_edges'] += 1
            community_stats[target_comm]['external_edges'] += 1
            continue
        community_stats[source_comm]['internal_edges'] += 1
        community_stats[source_comm]['total_weight'] += edge_weight

    for record in community_stats.values():
        # Cohesion: share of the possible within-community edges that exist
        member_total = record['size']
        max_internal = member_total * (member_total - 1) / 2
        if max_internal > 0:
            record['cohesion'] = record['internal_edges'] / max_internal
        else:
            record['cohesion'] = 0

        # Central nodes: up to three members with the highest degree inside
        # the community's own subgraph
        members = record['nodes']
        inner_degrees = dict(G.subgraph(members).degree())
        ranked = sorted(inner_degrees.items(), key=lambda pair: pair[1], reverse=True)
        record['central_nodes'] = ranked[:min(3, len(members))]

    return community_stats

def visualize_community_stats(community_stats, subject_main_dir, subject_type):
    """
    Plot community statistics and save the figures through save_plot.

    Two figures are produced: size and cohesion bar charts for the 15 largest
    communities, and a grouped bar chart of internal vs. external connections
    for the 10 largest.

    Parameters:
    community_stats (dict): Dictionary of community statistics
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    """
    # Pull the plotted columns out of the per-community records
    comm_ids = list(community_stats.keys())
    records = list(community_stats.values())
    sizes = [record['size'] for record in records]
    cohesion = [record['cohesion'] for record in records]
    internal_edges = [record['internal_edges'] for record in records]
    external_edges = [record['external_edges'] for record in records]

    # Community positions ordered from largest to smallest
    by_size_desc = np.argsort(sizes)[::-1]

    def pick(values, positions):
        """Select values at the given positions, preserving their order."""
        return [values[i] for i in positions]

    # --- Figure 1: size and cohesion of the 15 largest communities ---
    leading_15 = by_size_desc[:15]
    top_ids = pick(comm_ids, leading_15)
    tick_names = [f'Comm {i}' for i in top_ids]
    bar_slots = range(len(top_ids))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    panels = [
        # (axes, heights, color, y label, title, label offset, label formatter)
        (ax1, pick(sizes, leading_15), 'skyblue', 'Number of Nodes',
         f'Top 15 {subject_type.title()} Communities by Size', 0.5, str),
        (ax2, pick(cohesion, leading_15), 'lightgreen', 'Cohesion (0-1)',
         f'Cohesion of Top 15 {subject_type.title()} Communities', 0.02,
         lambda v: f'{v:.2f}'),
    ]
    for axis, heights, color, y_label, panel_title, offset, fmt in panels:
        axis.bar(bar_slots, heights, color=color)
        axis.set_xticks(bar_slots)
        axis.set_xticklabels(tick_names, rotation=45)
        axis.set_ylabel(y_label)
        axis.set_title(panel_title)
        # Value label just above each bar
        for slot, height in enumerate(heights):
            axis.text(slot, height + offset, fmt(height), ha='center')

    plt.tight_layout()
    save_plot(plt, f"{subject_type.lower()}_community_stats.png", subject_main_dir, subject_type)

    # --- Figure 2: internal vs. external connections, 10 largest communities ---
    plt.figure(figsize=(14, 8))

    leading_10 = by_size_desc[:10]
    top_10_ids = pick(comm_ids, leading_10)
    x = np.arange(len(top_10_ids))
    width = 0.35  # each group holds two bars of this width, side by side

    plt.bar(x - width/2, pick(internal_edges, leading_10), width,
            label='Internal Connections', color='#5DA5DA')
    plt.bar(x + width/2, pick(external_edges, leading_10), width,
            label='External Connections', color='#FAA43A')

    plt.xlabel('Community ID')
    plt.ylabel('Number of Connections')
    plt.title(f'Internal vs. External Connections for Top 10 {subject_type.title()} Communities')
    plt.xticks(x, [f'Comm {i}' for i in top_10_ids])
    plt.legend()

    plt.tight_layout()
    save_plot(plt, f"{subject_type.lower()}_community_connections.png", subject_main_dir, subject_type)

def find_community_bridges(G, communities, subject_main_dir, subject_type, top_n=10):
    """
    Identify and visualize bridge nodes between communities

    Every node is scored as
    ``betweenness * (1 + number of neighbors in a different community)``.
    The ``top_n`` highest scoring nodes are printed, drawn as a horizontal
    bar chart (saved through ``save_plot``) and returned.

    Parameters:
    G (networkx.Graph): The network graph
    communities (dict): Community assignment dictionary (node -> community id).
        Nodes missing from the mapping are treated as members of a single
        "unassigned" community instead of raising KeyError.
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    top_n (int): Number of top bridge nodes to report

    Returns:
    list: ``(node, scores)`` tuples in descending bridge-score order, where
        ``scores`` is a dict with the keys 'betweenness',
        'cross_comm_connections' and 'bridge_score'
    """
    # NOTE(review): NetworkX interprets `weight` as a distance when it computes
    # the shortest paths behind betweenness. If the edge weights of G are
    # co-occurrence counts, strongly related nodes are treated as far apart --
    # confirm this is the intended reading of the weights.
    betweenness = nx.betweenness_centrality(G, weight='weight')

    # Placeholder community for nodes the mapping does not cover
    unassigned = -1

    node_scores = {}
    for node in G.nodes():
        node_comm = communities.get(node, unassigned)
        cross_comm_connections = sum(
            1 for neighbor in G.neighbors(node)
            if communities.get(neighbor, unassigned) != node_comm
        )

        # Bridge score combines betweenness and cross-community connections
        node_scores[node] = {
            'betweenness': betweenness[node],
            'cross_comm_connections': cross_comm_connections,
            'bridge_score': betweenness[node] * (1 + cross_comm_connections)
        }

    # Rank nodes by bridge score, best first
    top_bridges = sorted(node_scores.items(),
                         key=lambda entry: entry[1]['bridge_score'],
                         reverse=True)[:top_n]

    # Print top bridge nodes
    print(f"\nTop {top_n} {subject_type.title()} Bridge Nodes Between Communities:")
    for node, scores in top_bridges:
        print(f"{node}: Bridge Score={scores['bridge_score']:.4f}, "
              f"Betweenness={scores['betweenness']:.4f}, "
              f"Cross-Comm. Connections={scores['cross_comm_connections']}")

    # Visualize top bridge nodes
    plt.figure(figsize=(12, 8))

    # barh stacks bars bottom-up, so plot in ascending order to put the
    # strongest bridge at the top of the chart
    ascending = top_bridges[::-1]
    bar_names = [node for node, _ in ascending]
    bar_values = [scores['bridge_score'] for _, scores in ascending]

    plt.barh(bar_names, bar_values, color='#8CD17D')
    plt.xlabel('Bridge Score')
    plt.title(f'Top {subject_type.title()} Bridge Nodes Between Communities')

    # Add scores next to bars
    for row, score in enumerate(bar_values):
        plt.text(score + 0.01, row, f'{score:.3f}', va='center')

    plt.tight_layout()

    # Save the plot
    filename = f"{subject_type.lower()}_bridge_nodes.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    return top_bridges

## Temporal community detection and analysis

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import community.community_louvain as community_louvain
import matplotlib.colors as mcolors
import ast

def _temporal_build_decade_graph(decade_items, min_occurrence, top_n):
    """Build the weighted co-occurrence graph for the records of one decade."""
    item_counts = Counter()
    pair_counts = Counter()

    for items in decade_items:
        if not items:  # Skip empty lists
            continue

        # Count individual items
        item_counts.update(items)

        # combinations() over a sorted set yields every unordered pair exactly
        # once, already in sorted order, so the tuple can be used as the key
        pair_counts.update(combinations(sorted(set(items)), 2))

    # Filter items by minimum occurrences
    kept = {item: count for item, count in item_counts.items()
            if count >= min_occurrence}

    # Optionally keep only the top N items (first-seen order is preserved so
    # node insertion order does not depend on the ranking)
    if top_n is not None and len(kept) > top_n:
        most_frequent = {item for item, _ in Counter(kept).most_common(top_n)}
        kept = {item: count for item, count in kept.items()
                if item in most_frequent}

    G = nx.Graph()

    # Node size is normalized between 1000 and 5000 by relative frequency
    max_count = max(kept.values()) if kept else 1
    for item, count in kept.items():
        G.add_node(item, size=1000 + (count / max_count) * 4000, count=count)

    # Only add edges between nodes that survived the filters
    for (first, second), weight in pair_counts.items():
        if first in G and second in G:
            G.add_edge(first, second, weight=weight)

    return G


def _temporal_rank_bridges(G, communities, limit=5):
    """Return the `limit` nodes with the highest bridge score, best first."""
    # NOTE(review): NetworkX reads `weight` as a distance here, while the edge
    # weights are co-occurrence counts -- confirm this is intended.
    betweenness = nx.betweenness_centrality(G, weight='weight')

    scored = {}
    for node in G.nodes():
        if communities:
            node_comm = communities[node]
            cross_comm_connections = sum(
                1 for neighbor in G.neighbors(node)
                if communities.get(neighbor, -1) != node_comm
            )
        else:
            cross_comm_connections = 0

        # Bridge score combines betweenness and cross-community connections
        scored[node] = {
            'betweenness': betweenness[node],
            'cross_comm_connections': cross_comm_connections,
            'bridge_score': betweenness[node] * (1 + cross_comm_connections)
        }

    ranked = sorted(scored.items(), key=lambda x: x[1]['bridge_score'], reverse=True)
    return ranked[:limit]


def _temporal_measure_graph(G, min_clique_size):
    """Compute the metrics of one non-empty decade graph.

    Returns a ``(record, degree_centrality)`` pair; ``record`` has one value
    for every per-decade list of the temporal metrics dictionary.
    """
    record = {
        'n_nodes': G.number_of_nodes(),
        'n_edges': G.number_of_edges(),
        'density': nx.density(G),
        'graph_objects': G,
    }

    # The three metrics below are best effort: a failure is recorded as 0
    # rather than aborting the whole analysis. `except Exception` (instead of
    # a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        record['avg_clustering'] = nx.average_clustering(G)
    except Exception:
        record['avg_clustering'] = 0

    try:
        # Only calculate for the largest connected component
        largest_cc = max(nx.connected_components(G), key=len)
        record['avg_path_length'] = nx.average_shortest_path_length(G.subgraph(largest_cc))
    except Exception:
        record['avg_path_length'] = 0

    # Top items by degree centrality
    deg_cent = nx.degree_centrality(G)
    record['top_items'] = sorted(deg_cent.items(), key=lambda x: x[1], reverse=True)[:5]

    # Detect communities (called without a random_state, so the partition may
    # differ between runs)
    communities = community_louvain.best_partition(G)
    record['communities'] = communities

    try:
        record['modularity'] = community_louvain.modularity(communities, G)
    except Exception:
        record['modularity'] = 0

    # Maximal cliques of at least the requested size
    record['cliques'] = [clique for clique in nx.find_cliques(G)
                         if len(clique) >= min_clique_size]

    record['bridge_nodes'] = _temporal_rank_bridges(G, communities)

    return record, deg_cent


def _temporal_draw_decade(ax, G, communities, labelled_nodes, decade):
    """Draw one decade's network on `ax`, labelling only `labelled_nodes`."""
    pos = nx.spring_layout(G, k=1.5, iterations=50, seed=42)

    edge_weights = [data['weight'] for _, _, data in G.edges(data=True)]
    if edge_weights:
        min_weight = min(edge_weights)
        weight_span = max(edge_weights) - min_weight

        if weight_span > 0:
            log_span = np.log1p(weight_span)
            edge_colors = [0.4 + 0.6 * np.log1p(w - min_weight) / log_span
                           for w in edge_weights]
            edge_widths = [1 + 3 * (w - min_weight) / weight_span
                           for w in edge_weights]
        else:
            # All weights equal: the normalization would be 0/0 (NaN), so use
            # one mid-range colour and the minimum width instead
            edge_colors = [0.7] * len(edge_weights)
            edge_widths = [1] * len(edge_weights)

        # Pin the colour scale to [0, 1]; otherwise NetworkX rescales to the
        # min/max of edge_colors and the 0.4 floor above has no effect
        nx.draw_networkx_edges(G, pos,
                               edge_color=edge_colors,
                               width=edge_widths,
                               edge_cmap=plt.cm.Blues,
                               edge_vmin=0.0,
                               edge_vmax=1.0,
                               alpha=0.5,
                               ax=ax)

    # Node sizes are scaled down for the subplot
    node_sizes = [G.nodes[node]['size'] / 4 for node in G.nodes()]

    if communities:
        # One colour per community
        n_communities = len(set(communities.values()))
        palette = plt.cm.tab20(np.linspace(0, 1, n_communities))
        node_colors = [palette[communities[node]] for node in G.nodes()]
    else:
        n_communities = 0
        node_colors = 'lightblue'

    nx.draw_networkx_nodes(G, pos,
                           node_color=node_colors,
                           node_size=node_sizes,
                           alpha=0.7,
                           linewidths=1,
                           edgecolors='white',
                           ax=ax)

    # Minimal labels: everything except the highlighted nodes gets ''
    labels = {node: node if node in labelled_nodes else '' for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels, font_size=8, font_weight='bold', ax=ax)

    # Title with decade and key metrics
    title = f'{decade}s\nNodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}'
    if n_communities > 0:
        title += f'\nCommunities: {n_communities}'
    ax.set_title(title)
    ax.axis('off')


def analyze_temporal_networks(df, column_name, subject_main_dir, decade_col='decade', 
                             min_occurrence=2, top_n=None, min_clique_size=3):
    """
    Analyze and visualize network relationships across different decades for any column type

    For every decade a co-occurrence graph is built from the list values of
    `column_name`, measured (size, density, clustering, path length,
    communities, cliques, bridge nodes) and drawn in a grid of subplots.
    Follow-up charts are produced by the visualize_* helpers of this file.

    Parameters:
    df (pandas.DataFrame): The DataFrame
    column_name (str): Name of the column to analyze
    subject_main_dir (str/Path): Directory to save visualizations
    decade_col (str): Name of the column containing decade information.
        Rows whose decade is missing (NaN) are ignored.
    min_occurrence (int): Minimum occurrence for an item to be included
    top_n (int, optional): Limit to top N most frequent items per decade
    min_clique_size (int): Minimum size for significant cliques

    Returns:
    dict: Temporal metrics and analysis results with the keys
        'metrics' (per-decade lists), 'persistent_items',
        'persistent_connections' and 'item_centrality'

    Raises:
    ValueError: If `decade_col` is not a column of `df`
    """
    # Get subject type from column name (pluralized, e.g. 'concept' -> 'concepts')
    subject_type = column_name.lower().split('_')[0]
    if not subject_type.endswith('s'):
        subject_type = f"{subject_type}s"

    # Validate before doing any work
    if decade_col not in df.columns:
        raise ValueError(f"Decade column '{decade_col}' not found in DataFrame")

    # Preprocess the column to get lists
    # (assumes preprocess_column returns a Series aligned with df.index -- TODO confirm)
    items_series = preprocess_column(df, column_name)

    # Get sorted decades; NaN would otherwise become an always-empty decade
    decades = sorted(df[decade_col].dropna().unique())

    # Store network metrics for each decade
    temporal_metrics = {
        'decades': decades,
        'n_nodes': [],
        'n_edges': [],
        'density': [],
        'avg_clustering': [],
        'communities': [],
        'cliques': [],
        'top_items': [],
        'bridge_nodes': [],
        'graph_objects': [],
        'modularity': [],
        'avg_path_length': []
    }

    # Used to track item persistence across decades
    item_decade_presence = defaultdict(set)
    item_decade_centrality = defaultdict(dict)
    persistent_connections = defaultdict(int)

    n_decades = len(decades)
    if n_decades == 0:
        # Nothing to analyze; the subplot grid below would divide by zero
        return {
            'metrics': temporal_metrics,
            'persistent_items': {},
            'persistent_connections': {},
            'item_centrality': item_decade_centrality
        }

    # Create subplots for network visualization (at most 3 per row)
    n_cols = min(3, n_decades)
    n_rows = (n_decades + n_cols - 1) // n_cols
    fig = plt.figure(figsize=(8*n_cols, 8*n_rows))

    # Process each decade
    for idx, decade in enumerate(decades):
        decade_rows = df[df[decade_col] == decade]
        G = _temporal_build_decade_graph(items_series[decade_rows.index],
                                         min_occurrence, top_n)

        # Track item presence and connections across decades
        for item in G.nodes():
            item_decade_presence[item].add(decade)
        for u, v in G.edges():
            persistent_connections[tuple(sorted((u, v)))] += 1

        if G.number_of_nodes() == 0:
            # Empty decade: record neutral values and leave its subplot blank
            record = {
                'n_nodes': 0, 'n_edges': 0, 'density': 0, 'avg_clustering': 0,
                'communities': {}, 'cliques': [], 'top_items': [],
                'bridge_nodes': [], 'graph_objects': G, 'modularity': 0,
                'avg_path_length': 0
            }
            for key, value in record.items():
                temporal_metrics[key].append(value)
            continue

        record, deg_cent = _temporal_measure_graph(G, min_clique_size)
        for key, value in record.items():
            temporal_metrics[key].append(value)

        # Store centrality values for tracking over time
        for item, centrality in deg_cent.items():
            item_decade_centrality[item][decade] = centrality

        # Label only the top 3 central items and the top 2 bridge nodes
        important_nodes = {name for name, _ in record['top_items'][:3]}
        important_nodes.update(name for name, _ in record['bridge_nodes'][:2])

        decade_ax = fig.add_subplot(n_rows, n_cols, idx + 1)
        _temporal_draw_decade(decade_ax, G, record['communities'], important_nodes, decade)

    plt.tight_layout()

    # Create filename
    filename = f"temporal_{subject_type}_networks.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Create trend visualizations
    visualize_network_metric_trends(temporal_metrics, subject_main_dir, subject_type)

    # "Persistent" means present in at least 1/3 of all decades (minimum 2).
    # The threshold is computed once from the full decade list so that the
    # comprehension variables below cannot shadow it.
    min_decades_present = max(2, n_decades // 3)

    persistent_items = {item: present for item, present in item_decade_presence.items()
                        if len(present) >= min_decades_present}

    # Visualize persistent items' centrality trends (nothing to draw if empty)
    if persistent_items:
        visualize_centrality_trends(item_decade_centrality, decades, persistent_items, 
                                   subject_main_dir, subject_type)

    # Analyze persistent connections
    persistent_connections = {pair: count for pair, count in persistent_connections.items() 
                             if count >= min_decades_present}

    # Visualize persistent connections
    visualize_persistent_connections(persistent_connections, item_decade_presence, 
                                    decades, subject_main_dir, subject_type)

    # Visualize community evolution (only if some decade produced communities)
    if any(temporal_metrics['communities']):
        visualize_community_evolution(temporal_metrics, subject_main_dir, subject_type)

    # Return all metrics for further analysis
    return {
        'metrics': temporal_metrics,
        'persistent_items': persistent_items,
        'persistent_connections': persistent_connections,
        'item_centrality': item_decade_centrality
    }

def visualize_network_metric_trends(temporal_metrics, subject_main_dir, subject_type):
    """
    Plot how the per-decade network metrics evolve

    Draws three stacked panels -- network size, network characteristics and
    community structure -- and saves the figure through ``save_plot``.

    Parameters:
    temporal_metrics (dict): The temporal metrics dictionary; the lists under
        'n_nodes', 'n_edges', 'density', 'avg_clustering', 'communities' and
        'cliques' are plotted against 'decades' ('modularity' and
        'avg_path_length' are added when present and not all zero)
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    """
    decades = temporal_metrics['decades']
    subject_label = subject_type.title()

    def _style_axis(axis, y_label, heading):
        # Cosmetics shared by all three panels; called after the panel's lines
        # exist so the legend picks them up
        axis.set_xlabel('Decade', fontsize=12)
        axis.set_ylabel(y_label, fontsize=12)
        axis.set_title(heading, fontsize=16)
        axis.legend(fontsize=10)
        axis.grid(True, alpha=0.3)
        axis.tick_params(axis='x', rotation=45)

    _, (size_ax, traits_ax, structure_ax) = plt.subplots(3, 1, figsize=(15, 18))

    # --- Panel 1: network size ---
    node_counts = temporal_metrics['n_nodes']
    edge_counts = temporal_metrics['n_edges']

    size_ax.plot(decades, node_counts, 'o-', color='#4878D0', linewidth=2,
                 label=f'Number of {subject_label}')
    size_ax.plot(decades, edge_counts, 's-', color='#EE854A', linewidth=2,
                 label='Number of Co-occurrences')
    _style_axis(size_ax, 'Count', f'Network Size Evolution for {subject_label}')

    # Write each value just above its marker
    for position, decade in enumerate(decades):
        if position >= len(node_counts):
            break
        for series in (node_counts, edge_counts):
            size_ax.annotate(f"{series[position]}",
                             (decade, series[position]),
                             textcoords="offset points",
                             xytext=(0,10),
                             ha='center')

    # --- Panel 2: network characteristics ---
    traits_ax.plot(decades, temporal_metrics['density'], 'o-', color='#55A868', linewidth=2,
                   label='Network Density')
    traits_ax.plot(decades, temporal_metrics['avg_clustering'], 's-', color='#C44E52', linewidth=2,
                   label='Average Clustering')

    # Modularity is optional and skipped when every value is zero
    if 'modularity' in temporal_metrics and any(temporal_metrics['modularity']):
        traits_ax.plot(decades, temporal_metrics['modularity'], '^-', color='#8172B3', linewidth=2,
                       label='Modularity')

    _style_axis(traits_ax, 'Value', f'Network Characteristics Evolution for {subject_label}')

    # --- Panel 3: community structure ---
    comm_counts = []
    for partition in temporal_metrics['communities']:
        comm_counts.append(len(set(partition.values())) if partition else 0)
    clique_counts = [len(decade_cliques) for decade_cliques in temporal_metrics['cliques']]

    structure_ax.plot(decades, comm_counts, 'o-', color='#8172B3', linewidth=2,
                      label='Number of Communities')
    structure_ax.plot(decades, clique_counts, 's-', color='#937860', linewidth=2,
                      label='Number of Significant Cliques')

    # Average path length lives on its own (right-hand) y-axis
    if 'avg_path_length' in temporal_metrics and any(temporal_metrics['avg_path_length']):
        path_ax = structure_ax.twinx()
        path_ax.plot(decades, temporal_metrics['avg_path_length'], '^-', color='#DA8BC3', linewidth=2,
                     label='Avg. Path Length')
        path_ax.set_ylabel('Average Path Length', fontsize=12)
        path_ax.legend(fontsize=10, loc='upper right')

    _style_axis(structure_ax, 'Count', f'Community Structure Evolution for {subject_label}')

    plt.tight_layout()
    save_plot(plt, f"temporal_{subject_type}_metrics.png", subject_main_dir, subject_type)

def visualize_centrality_trends(item_centrality, decades, persistent_items, subject_main_dir, subject_type, top_n=10):
    """
    Visualize trends in item centrality over time for persistent items

    Items are ranked by the sum of their centrality over all decades (a
    decade in which the item has no value counts as 0). The top `top_n` items
    are drawn as a line chart and as a heatmap, both saved through
    ``save_plot``. Nothing is drawn when there is no item to show.

    Parameters:
    item_centrality (dict): Dictionary of item centrality values by decade
        (item -> {decade: centrality})
    decades (list): List of decades
    persistent_items (dict): Dictionary of persistent items and their decades;
        only the keys are used
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    top_n (int): Number of top persistent items to visualize
    """
    # One centrality row per persistent item, using 0 where the item is absent
    centrality_rows = {
        item: [item_centrality[item].get(decade, 0) for decade in decades]
        for item in persistent_items
        if item in item_centrality
    }

    # Get top N items by persistence score (sum of centrality across decades)
    top_item_names = sorted(centrality_rows,
                            key=lambda item: sum(centrality_rows[item]),
                            reverse=True)[:top_n]

    # An empty matrix cannot be rendered as a heatmap, so stop here -- same
    # early exit as visualize_persistent_connections uses for empty input
    if not top_item_names or len(decades) == 0:
        return

    # Create trend visualization
    plt.figure(figsize=(15, 10))

    # Plot centrality evolution for each top item
    for item in top_item_names:
        plt.plot(decades, centrality_rows[item], 'o-', linewidth=2, label=item)

    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Degree Centrality', fontsize=12)
    plt.title(f'Centrality Evolution of Top {top_n} Persistent {subject_type.title()}', fontsize=16)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    filename = f"temporal_{subject_type}_centrality_trends.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Create heatmap visualization (rows: items, columns: decades)
    plt.figure(figsize=(15, 10))

    heatmap_data = [centrality_rows[item] for item in top_item_names]

    sns.heatmap(heatmap_data, annot=True, fmt=".3f", 
               xticklabels=[f"{d}s" for d in decades],
               yticklabels=top_item_names,
               cmap="YlOrRd")

    plt.xlabel('Decade', fontsize=12)
    plt.ylabel(f'{subject_type.title()}', fontsize=12)
    plt.title(f'Centrality Heatmap of Top {top_n} Persistent {subject_type.title()}', fontsize=16)
    plt.tight_layout()

    filename = f"temporal_{subject_type}_centrality_heatmap.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

In [None]:
def visualize_persistent_connections(persistent_connections, item_decade_presence, 
                                    decades, subject_main_dir, subject_type, top_n=15):
    """
    Visualize persistent connections between items over time

    Produces two figures, both saved through ``save_plot``: a timeline with
    one row per connection, and a network containing only the top
    connections. Returns immediately when `persistent_connections` is empty.

    Note that the timeline marks the decades in which *both endpoints* are
    present according to `item_decade_presence`; this function has no
    per-decade record of the pair itself.

    Parameters:
    persistent_connections (dict): Dictionary of persistent connections
        ((item1, item2) -> persistence count)
    item_decade_presence (dict): Dictionary tracking item presence across decades
        (item -> set of decades)
    decades (list): List of decades
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    top_n (int): Number of top persistent connections to visualize
    """
    if not persistent_connections:
        return

    # Sort connections by persistence count and keep the strongest ones
    top_connections = sorted(persistent_connections.items(),
                             key=lambda x: x[1], reverse=True)[:top_n]

    # --- Figure 1: timeline, one horizontal row per connection ---
    plt.figure(figsize=(15, 10))

    connection_labels = []
    for row, ((item1, item2), _) in enumerate(top_connections):
        present1 = item_decade_presence.get(item1, set())
        present2 = item_decade_presence.get(item2, set())

        # Decades where both items appear
        decades_together = sorted({decade for decade in decades
                                   if decade in present1 and decade in present2})

        label = f"{item1} — {item2}"
        connection_labels.append(label)

        # Only the first 10 rows get a legend entry to avoid overcrowding
        plt.plot(decades_together, [row] * len(decades_together), 'o-', linewidth=2,
                 label=label if row < 10 else "_nolegend_")

    plt.yticks(range(len(connection_labels)), connection_labels)
    plt.xlabel('Decade', fontsize=12)
    plt.title(f'Temporal Evolution of Top {top_n} Persistent {subject_type.title()} Connections', fontsize=16)
    plt.grid(True, alpha=0.3, axis='x')

    # Legend sits below the axes
    plt.legend(fontsize=10, loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

    plt.tight_layout()

    filename = f"temporal_{subject_type}_persistent_connections.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # --- Figure 2: graph showing only the persistent connections ---
    G_persistent = nx.Graph()

    # Add all items involved in persistent connections
    all_persistent_items = set()
    for (item1, item2), _ in top_connections:
        all_persistent_items.add(item1)
        all_persistent_items.add(item2)
    G_persistent.add_nodes_from(all_persistent_items)

    # Add edges with weight equal to persistence count
    for (item1, item2), count in top_connections:
        G_persistent.add_edge(item1, item2, weight=count)

    plt.figure(figsize=(15, 15))

    # Create layout
    pos = nx.spring_layout(G_persistent, k=0.5, seed=42)

    # Draw edges with weight-based width
    edge_weights = [G_persistent[u][v]['weight'] for u, v in G_persistent.edges()]
    max_weight = max(edge_weights) if edge_weights else 1
    min_weight = min(edge_weights) if edge_weights else 1

    # Normalize edge widths between 1 and 8 (flat 3 when all weights are equal)
    edge_widths = [1 + 7 * (w - min_weight) / (max_weight - min_weight) 
                  if max_weight > min_weight else 3 for w in edge_weights]

    nx.draw_networkx_edges(G_persistent, pos,
                         width=edge_widths,
                         alpha=0.7,
                         edge_color='#5DA5DA')

    # Draw nodes, larger for better connected items
    node_sizes = [300 + 200 * G_persistent.degree(node) for node in G_persistent.nodes()]

    nx.draw_networkx_nodes(G_persistent, pos,
                         node_size=node_sizes,
                         node_color='#FAA43A',
                         alpha=0.8,
                         linewidths=2,
                         edgecolors='white')

    # Add labels whose font grows with degree (capped at 18). Each label is
    # drawn with its own scalar font_size: a per-node dict is only accepted by
    # newer NetworkX releases, while a scalar works on every version.
    for node in G_persistent.nodes():
        nx.draw_networkx_labels(G_persistent, pos,
                                labels={node: node},
                                font_size=min(12 + G_persistent.degree(node), 18),
                                font_weight='bold')

    plt.title(f'Persistent {subject_type.title()} Connections Network', fontsize=20)
    plt.axis('off')
    plt.tight_layout()

    filename = f"temporal_{subject_type}_persistent_network.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

In [None]:
def visualize_community_evolution(temporal_metrics, subject_main_dir, subject_type, top_n=5):
    """
    Visualize the evolution of communities over time

    For every decade the `top_n` largest communities are collected. Two kinds
    of figures are saved through ``save_plot``:

    * a heatmap of the Jaccard similarity between the member sets of
      communities from different decades (communities of the same decade are
      given similarity 0, a community with itself 1), and
    * one figure per decade with a bar chart of the (up to 10) most central
      members of each tracked community.

    Returns without drawing when no decade has any community.

    Parameters:
    temporal_metrics (dict): The temporal metrics dictionary; uses the
        'decades', 'communities' and 'graph_objects' lists
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Type of subject (e.g., "domains", "concepts")
    top_n (int): Number of top communities to track per decade
    """
    decades = temporal_metrics['decades']
    communities = temporal_metrics['communities']

    # Collect the largest communities of every decade. zip() stops at the
    # shorter list, so decades without a community entry are skipped.
    tracked = []
    for decade_idx, (decade, partition) in enumerate(zip(decades, communities)):
        if not partition:
            continue

        # Group nodes by community id in a single pass
        members_by_comm = {}
        for node, comm_id in partition.items():
            members_by_comm.setdefault(comm_id, []).append(node)

        # most_common() already returns the communities largest first
        for comm_id, size in Counter(partition.values()).most_common(top_n):
            tracked.append({
                'decade': decade,
                # Position in `decades`, kept so the matching graph can be
                # looked up later without searching the list again
                'decade_idx': decade_idx,
                'comm_id': comm_id,
                'size': size,
                'members': members_by_comm[comm_id],
            })

    # Nothing to compare: an empty matrix cannot be rendered as a heatmap
    if not tracked:
        return

    # Similarity matrix between communities across decades
    member_sets = [set(entry['members']) for entry in tracked]

    def _similarity(row, col):
        if row == col:
            return 1.0
        # Communities of the same decade are disjoint by construction
        if tracked[row]['decade_idx'] == tracked[col]['decade_idx']:
            return 0.0
        union = len(member_sets[row] | member_sets[col])
        return len(member_sets[row] & member_sets[col]) / union if union > 0 else 0

    n_tracked = len(tracked)
    heatmap_data = [[_similarity(row, col) for col in range(n_tracked)]
                    for row in range(n_tracked)]

    # Labels are built from the stored fields rather than parsed back out of
    # a combined key, so decade labels containing '_' stay intact
    heatmap_labels = [f"{entry['decade']}s - comm{entry['comm_id']}" for entry in tracked]

    plt.figure(figsize=(15, 12))

    sns.heatmap(heatmap_data, annot=True, fmt=".2f", 
               xticklabels=heatmap_labels,
               yticklabels=heatmap_labels,
               cmap="YlGnBu")

    plt.xlabel('Community', fontsize=12)
    plt.ylabel('Community', fontsize=12)
    plt.title(f'Similarity Between {subject_type.title()} Communities Across Decades', fontsize=16)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    filename = f"temporal_{subject_type}_community_similarity.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Group the tracked communities by decade for the per-decade figures
    by_decade = {}
    for entry in tracked:
        by_decade.setdefault(entry['decade_idx'], []).append(entry)

    graphs = temporal_metrics['graph_objects']

    for decade_idx, entries in by_decade.items():
        decade = decades[decade_idx]

        # Create visualization for this decade
        plt.figure(figsize=(15, len(entries) * 2.5))

        # Centrality is a property of the decade's graph, so compute it once
        # per decade instead of once per community
        G = graphs[decade_idx] if decade_idx < len(graphs) else None
        centrality = None
        if G is not None and G.number_of_nodes() > 0:
            centrality = nx.degree_centrality(G)

        if centrality is not None:
            for row, entry in enumerate(entries):
                # Most central members of this community
                comm_centrality = {node: centrality[node] for node in entry['members']
                                   if node in centrality}
                top_items = sorted(comm_centrality.items(),
                                   key=lambda x: x[1], reverse=True)[:10]

                ax = plt.subplot(len(entries), 1, row + 1)
                if not top_items:
                    continue

                items = [name for name, _ in top_items]
                values = [value for _, value in top_items]

                # Horizontal bar chart
                ax.barh(range(len(items)), values, color=plt.cm.Set3(row % 12))
                ax.set_yticks(range(len(items)))
                ax.set_yticklabels(items)
                ax.set_title(f"Community {entry['comm_id']} - Size: {entry['size']}")
                ax.set_xlabel("Centrality")

                # Add values
                for position, value in enumerate(values):
                    ax.text(value, position, f"{value:.3f}", va='center')

        plt.tight_layout()

        filename = f"temporal_{subject_type}_communities_{decade}s.png"
        save_plot(plt, filename, subject_main_dir, subject_type)

# Multi-Column Graph

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
from itertools import combinations
import seaborn as sns
from matplotlib import gridspec
import community.community_louvain as community_louvain
import matplotlib.colors as mcolors
import ast
import matplotlib.patches as mpatches

def create_multi_column_network(df, list_columns, dict_column=None, subject_main_dir=None,
                               min_occurrences=2, top_n_per_column=None, overall_top_n=None):
    """
    Create a co-occurrence network combining values from multiple list columns
    and optionally a dict column.

    Every value that survives the frequency filters becomes a node tagged with
    the column it came from. Two nodes are linked when their values appear in
    the same DataFrame row; the edge weight is the number of such rows.

    Parameters:
    df (pandas.DataFrame): The DataFrame
    list_columns (list): List of column names containing list values
    dict_column (str, optional): Name of a column containing dictionaries (e.g. concept_dict)
    subject_main_dir (str/Path): Accepted for signature parity with the other helpers;
        this function does not read it
    min_occurrences (int): Minimum occurrence for a value to be included
    top_n_per_column (dict, optional): Dict of {column_name: n} to limit top N values per column
    overall_top_n (int, optional): Overall limit on number of nodes (applied after column filters)

    Returns:
    tuple: (G, all_columns, subject_type, node_column_map) where
        G (networkx.Graph): nodes carry 'column', 'size' and 'count' attributes,
            edges carry a 'weight' attribute (number of co-occurrences)
        all_columns (list): list_columns plus dict_column (when given)
        subject_type (str): first '_'-separated token of each column name, joined by '_'
        node_column_map (dict): node value -> column name

    Raises:
    ValueError: If any requested column is missing from df
    """
    # Work on a private list so the caller's sequence is never mutated
    # (list() also accepts tuples, unlike .copy()).
    all_columns = list(list_columns)
    if dict_column:
        all_columns.append(dict_column)

    for col in all_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    # Create subject type from combined column names
    subject_type = "_".join(col.lower().split('_')[0] for col in all_columns)

    # Process each column (list columns first, then the dict column) into per-row values.
    # NOTE(review): assumes preprocess_column returns a Series-like object that shares
    # df's index and whose entries are iterables of hashable values - confirm.
    column_values = {col: preprocess_column(df, col) for col in all_columns}

    # Count occurrences of all values by column and apply the per-column filters
    column_counts = {}
    for col, values_series in column_values.items():
        counts = Counter()
        for value_list in values_series:
            counts.update(value_list)

        # Filter by minimum occurrences
        filtered_counts = {value: count for value, count in counts.items()
                           if count >= min_occurrences}

        # Apply top N filter per column if specified
        if top_n_per_column and col in top_n_per_column:
            n = top_n_per_column[col]
            filtered_counts = dict(Counter(filtered_counts).most_common(n))

        column_counts[col] = filtered_counts

    # Sets of values allowed as nodes, per column
    column_value_sets = {col: set(counts) for col, counts in column_counts.items()}

    # Track co-occurrences between values (within and across columns)
    cooccurrence_dict = Counter()

    for idx in df.index:
        # Distinct surviving values of this row across all columns
        row_values = set()
        for col in all_columns:
            allowed = column_value_sets[col]
            row_values.update(v for v in column_values[col][idx] if v in allowed)

        # combinations() over a sorted sequence already yields ordered pairs,
        # so each unordered pair maps to exactly one key.
        for pair in combinations(sorted(row_values), 2):
            cooccurrence_dict[pair] += 1

    # Create combined value counts for overall filtering.
    # NOTE(review): a value present in several columns keeps only the count (and,
    # below, the node attributes) of the last column processed.
    all_value_counts = {}
    for counts in column_counts.values():
        all_value_counts.update(counts)

    # Apply overall top N filter if specified
    if overall_top_n is not None and overall_top_n < len(all_value_counts):
        top_values = {value for value, _ in Counter(all_value_counts).most_common(overall_top_n)}

        for col in all_columns:
            column_value_sets[col] = column_value_sets[col] & top_values

        cooccurrence_dict = {(i1, i2): count for (i1, i2), count in cooccurrence_dict.items()
                             if i1 in top_values and i2 in top_values}

    # Create network graph
    G = nx.Graph()
    node_column_map = {}  # Returned so callers can look up a node's column

    for col in all_columns:
        counts = column_counts[col]
        # Sizes are normalised per column; the maximum is the same for every node of the column
        max_count = max(counts.values()) if counts else 1
        for value in column_value_sets[col]:
            count = counts[value]
            node_size = 1000 + (count / max_count) * 4000

            G.add_node(value, column=col, size=node_size, count=count)
            node_column_map[value] = col

    # Add weighted edges, but only between nodes that exist in the filtered graph
    for (i1, i2), weight in cooccurrence_dict.items():
        if i1 in G.nodes and i2 in G.nodes:
            G.add_edge(i1, i2, weight=weight)

    print(f"Created multi-column network with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    print("Node distribution by column:")
    for col in all_columns:
        col_count = sum(1 for _, attr in G.nodes(data=True) if attr.get('column') == col)
        print(f"  {col}: {col_count} nodes")

    return G, all_columns, subject_type, node_column_map

In [None]:
def visualize_multi_column_graph(G, columns, subject_main_dir, subject_type):
    """
    Visualize a multi-column graph with different shapes for different columns

    Nodes are drawn per column with a column-specific marker shape and colour,
    edges are shaded and sized by their 'weight' attribute, and only the most
    central nodes (top degree, plus top betweenness on graphs with more than
    10 nodes) are labelled.

    Parameters:
    G (networkx.Graph): The network graph; nodes need 'column' and 'size'
        attributes, edges need a 'weight' attribute
    columns (list): List of column names
    subject_main_dir (str/Path): Directory to save visualizations; nothing is
        saved when this is falsy
    subject_type (str): Combined subject type

    Returns:
    tuple: (figure, axis) matplotlib objects
    """
    # Set up the figure with GridSpec: wide main panel plus a narrow legend strip
    fig = plt.figure(figsize=(20, 16), facecolor='white')
    gs = gridspec.GridSpec(1, 20)
    ax_main = fig.add_subplot(gs[0, :19])
    ax_legend = fig.add_subplot(gs[0, 19])

    # One marker shape and one colour per column (both cycle if there are many columns)
    shapes = ['o', '^', 's', 'd', 'p', 'h', '8']
    column_shapes = {col: shapes[i % len(shapes)] for i, col in enumerate(columns)}

    color_list = list(mcolors.TABLEAU_COLORS.values())
    column_colors = {col: color_list[i % len(color_list)] for i, col in enumerate(columns)}

    # Create layout with more spread (seeded so the picture is reproducible)
    pos = nx.spring_layout(G, k=0.3, iterations=100, seed=42)

    # Draw edges with weight-based colors and widths
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    if edge_weights:
        min_weight = min(edge_weights)
        weight_span = max(edge_weights) - min_weight

        if weight_span > 0:
            # Log-scaled colour in [0.4, 1.0]; linear width in [1, 5]
            log_span = np.log1p(weight_span)
            edge_colors = [0.4 + 0.6 * (np.log1p(w - min_weight) / log_span)
                           for w in edge_weights]
            edge_widths = [1 + 4 * (w - min_weight) / weight_span for w in edge_weights]
        else:
            edge_colors = [0.5] * len(edge_weights)
            edge_widths = [3] * len(edge_weights)

        # edge_vmin/edge_vmax pin the colormap to [0, 1]. Without them networkx
        # rescales to the min/max of edge_colors, which cancels the 0.4 floor and
        # renders the weakest edges almost white.
        nx.draw_networkx_edges(G, pos,
                               edge_color=edge_colors,
                               width=edge_widths,
                               edge_cmap=plt.cm.Blues,
                               edge_vmin=0.0,
                               edge_vmax=1.0,
                               alpha=0.6,
                               ax=ax_main)

    # Draw nodes by column types
    for col in columns:
        col_nodes = [node for node, attr in G.nodes(data=True) if attr.get('column') == col]
        if not col_nodes:
            continue

        shape = column_shapes[col]

        # For non-circular shapes, adjust size for visual consistency
        size_multiplier = 0.5 if shape != 'o' else 1

        nx.draw_networkx_nodes(G, pos,
                               nodelist=col_nodes,
                               node_size=[G.nodes[node]['size'] * size_multiplier
                                          for node in col_nodes],
                               node_color=column_colors[col],
                               node_shape=shape,
                               alpha=0.8,
                               linewidths=1,
                               edgecolors='white',
                               ax=ax_main)

    # Pick the nodes to label: top 30 by degree centrality ...
    n_nodes = len(G.nodes())
    degree_cent = nx.degree_centrality(G)
    sorted_degree = sorted(degree_cent.items(), key=lambda x: x[1], reverse=True)
    important_nodes = {node for node, _ in sorted_degree[:30]}

    # ... plus top 15 by (sampled) betweenness on larger graphs. The sampling is
    # seeded so the labelled set is stable between runs, like the layout.
    if n_nodes > 10:
        betweenness_cent = nx.betweenness_centrality(G, k=min(50, n_nodes),
                                                     normalized=True, seed=42)
        sorted_betweenness = sorted(betweenness_cent.items(), key=lambda x: x[1], reverse=True)
        important_nodes.update(node for node, _ in sorted_betweenness[:15])

    # Add labels for important nodes only
    labels = {node: node if node in important_nodes else '' for node in G.nodes()}

    # Adjust font size based on graph size
    if n_nodes > 100:
        font_size = 8
    elif n_nodes > 50:
        font_size = 10
    else:
        font_size = 12

    nx.draw_networkx_labels(G, pos, labels=labels, font_size=font_size,
                            font_weight='bold', font_color='black', ax=ax_main)

    # Create legend for column types. The patches below are positioned in data
    # coordinates, so make the 0..1 range they assume explicit.
    ax_legend.axis('off')
    ax_legend.set_xlim(0, 1)
    ax_legend.set_ylim(0, 1)

    # marker -> (vertex count, rotation). RegularPolygon puts a vertex on top at
    # rotation 0, so the diamond needs no rotation; the '8' marker is flat-topped.
    polygon_specs = {
        '^': (3, 0.0),
        'd': (4, 0.0),
        'p': (5, 0.0),
        'h': (6, 0.0),
        '8': (8, np.pi / 8),
    }

    for i, col in enumerate(columns):
        shape = column_shapes[col]
        y_pos = 0.9 - (i * 0.1)
        patch_style = dict(facecolor=column_colors[col], edgecolor='white', alpha=0.8,
                           transform=ax_legend.transData)

        if shape in polygon_specs:
            n_vertices, rotation = polygon_specs[shape]
            patch = mpatches.RegularPolygon((0.2, y_pos), n_vertices, radius=0.06,
                                            orientation=rotation, **patch_style)
        elif shape == 's':
            patch = mpatches.Rectangle((0.15, y_pos - 0.05), 0.1, 0.1, **patch_style)
        else:
            # 'o' and any unknown marker
            patch = mpatches.Circle((0.2, y_pos), radius=0.05, **patch_style)

        ax_legend.add_patch(patch)

        # Text label next to the patch
        col_name = col.replace('_', ' ').title()
        ax_legend.text(0.4, y_pos, col_name, va='center', fontsize=12)

    # Add title with node and edge counts
    num_columns = len(columns)
    title = f"Multi-Column Network: {num_columns} Entity Types\n"
    title += f"Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}"

    ax_main.set_title(title, fontsize=16, pad=20)
    ax_main.axis('off')

    plt.tight_layout()

    # Create filename from the first token of each column name
    columns_string = "_".join(col.lower().split('_')[0] for col in columns)
    filename = f"multi_column_network_{columns_string}.png"

    # Save the visualization
    if subject_main_dir:
        save_plot(plt, filename, subject_main_dir, subject_type)

    return fig, ax_main

In [None]:
def analyze_multi_column_network(G, columns, subject_main_dir, subject_type, min_clique_size=3):
    """
    Analyze a multi-column network using community detection,
    clique analysis, and bridge node identification

    Prints a textual report, then delegates to analyze_column_interactions and
    visualize_multi_column_communities.

    Parameters:
    G (networkx.Graph): The multi-column network; every node needs a 'column' attribute
    columns (list): List of column names
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Combined subject type
    min_clique_size (int): Minimum size for significant cliques

    Returns:
    dict: Analysis results with keys
        'communities' (dict node -> community id),
        'cliques' (list of maximal cliques with at least min_clique_size members),
        'bridge_nodes' (top 10 (node, score dict) pairs by bridge score),
        'column_interactions' (result of analyze_column_interactions).
        All values are None when the graph has fewer than 3 nodes or 2 edges.
    """
    results = {
        'communities': None,
        'cliques': None,
        'bridge_nodes': None,
        'column_interactions': None
    }

    # Skip analysis if graph is too small
    if G.number_of_nodes() < 3 or G.number_of_edges() < 2:
        print("Graph is too small for meaningful analysis")
        return results

    def column_distribution(nodes):
        """Format 'column: count' pairs for a group of nodes."""
        per_column = Counter(G.nodes[node]['column'] for node in nodes)
        return ", ".join(f"{col}: {count}" for col, count in per_column.items())

    # Detect communities
    communities = community_louvain.best_partition(G)
    results['communities'] = communities

    # Group nodes by community once instead of rescanning per community
    members_by_community = {}
    for node, comm_id in communities.items():
        members_by_community.setdefault(comm_id, []).append(node)
    community_counts = Counter(communities.values())

    print(f"\n{subject_type.title()} Community Analysis:")
    print(f"Number of communities: {len(community_counts)}")

    print("\nLargest communities:")
    for comm_id, count in sorted(community_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
        members = members_by_community[comm_id]

        # str() keeps the report working when node labels are not strings;
        # the ellipsis is shown only if members were actually left out.
        sample = ', '.join(map(str, members[:5]))
        if len(members) > 5:
            sample += "..."

        print(f"\nCommunity {comm_id} (Size: {count}):")
        print(f"Distribution: {column_distribution(members)}")
        print(f"Sample members: {sample}")

    # Find cliques
    significant_cliques = [c for c in nx.find_cliques(G) if len(c) >= min_clique_size]
    results['cliques'] = significant_cliques

    print("\nClique Analysis:")
    print(f"Number of maximal cliques (size ≥ {min_clique_size}): {len(significant_cliques)}")

    print("\nLargest cliques:")
    for clique in sorted(significant_cliques, key=len, reverse=True)[:5]:
        print(f"\nSize {len(clique)}:")
        print(f"Distribution: {column_distribution(clique)}")
        print(f"Members: {', '.join(map(str, clique))}")

    # Find bridge nodes.
    # networkx interprets the betweenness weight as a DISTANCE, while 'weight'
    # here is a co-occurrence count (a strength). Use the reciprocal so strongly
    # co-occurring pairs count as close rather than far apart. The copy keeps the
    # caller's graph free of the helper attribute.
    distance_graph = G.copy()
    for _, _, edge_data in distance_graph.edges(data=True):
        strength = edge_data.get('weight', 1)
        edge_data['distance'] = 1.0 / strength if strength > 0 else 1.0
    betweenness = nx.betweenness_centrality(distance_graph, weight='distance')

    bridge_scores = {}
    for node in G.nodes():
        node_comm = communities[node]
        cross_comm_connections = sum(1 for neighbor in G.neighbors(node)
                                     if communities.get(neighbor, -1) != node_comm)

        # Bridge score combines betweenness and cross-community connections
        bridge_scores[node] = {
            'betweenness': betweenness[node],
            'cross_comm_connections': cross_comm_connections,
            'bridge_score': betweenness[node] * (1 + cross_comm_connections),
            'column': G.nodes[node]['column']
        }

    # Keep the ten highest bridge scores
    sorted_bridges = sorted(bridge_scores.items(), key=lambda x: x[1]['bridge_score'], reverse=True)
    top_bridges = sorted_bridges[:10]
    results['bridge_nodes'] = top_bridges

    print("\nTop Bridge Nodes Between Communities:")
    for node, scores in top_bridges:
        print(f"{node} ({G.nodes[node]['column']}): Bridge Score={scores['bridge_score']:.4f}, "
              f"Betweenness={scores['betweenness']:.4f}, "
              f"Cross-Comm. Connections={scores['cross_comm_connections']}")

    # Analyze column interactions
    column_interactions = analyze_column_interactions(G, columns)
    results['column_interactions'] = column_interactions

    # Visualize community structure
    visualize_multi_column_communities(G, communities, columns, subject_main_dir, subject_type)

    return results

def analyze_column_interactions(G, columns):
    """
    Summarise how the column types of a multi-column network are linked

    Edge weights are summed per column (both endpoints in the same column) and
    per column pair (endpoints in different columns), then divided by the
    number of node pairs that could exist. Because weights are summed, a
    "density" can exceed 1. A summary is printed as a side effect.

    Parameters:
    G (networkx.Graph): The multi-column network; every edge endpoint needs a
        'column' attribute that is listed in columns
    columns (list): List of column names

    Returns:
    dict: {'interactions': {'within': {col: total}, 'between': {(colA, colB): total}},
           'densities': same layout with totals divided by potential pairs,
           'node_counts': {col: number of nodes}}
          Keys of the 'between' dicts are pairs with colA < colB (string order).
    """
    # Zero-initialised accumulators
    within_totals = {name: 0 for name in columns}
    between_totals = {}
    for first in columns:
        for second in columns:
            if first < second:
                between_totals[(first, second)] = 0
    interactions = {'within': within_totals, 'between': between_totals}

    # Accumulate edge weights by the column types of both endpoints
    for source, target, edge_attrs in G.edges(data=True):
        source_col = G.nodes[source]['column']
        target_col = G.nodes[target]['column']
        edge_weight = edge_attrs.get('weight', 1)

        if source_col != target_col:
            # Normalise to the (smaller, larger) key used at initialisation
            if source_col < target_col:
                key = (source_col, target_col)
            else:
                key = (target_col, source_col)
            between_totals[key] += edge_weight
        else:
            within_totals[source_col] += edge_weight

    # Number of nodes per requested column
    node_counts = {name: 0 for name in columns}
    for _, node_attrs in G.nodes(data=True):
        owner = node_attrs.get('column')
        if owner in node_counts:
            node_counts[owner] += 1

    # Totals relative to the number of possible node pairs
    within_density = {}
    for name in columns:
        size = node_counts[name]
        possible_pairs = size * (size - 1) / 2
        within_density[name] = within_totals[name] / possible_pairs if possible_pairs > 0 else 0

    between_density = {}
    for pair, total in between_totals.items():
        possible_pairs = node_counts[pair[0]] * node_counts[pair[1]]
        between_density[pair] = total / possible_pairs if possible_pairs > 0 else 0

    densities = {'within': within_density, 'between': between_density}

    # Print interaction summary
    print("\nColumn Interaction Analysis:")

    print("\nNode counts by column:")
    for name, size in node_counts.items():
        print(f"  {name}: {size} nodes")

    print("\nWithin-column interactions:")
    for name, total in within_totals.items():
        if node_counts[name] <= 1:
            continue  # a single node cannot interact with itself
        print(f"  {name}: {total} interactions (density: {within_density[name]:.4f})")

    print("\nBetween-column interactions:")
    for (left, right), total in between_totals.items():
        if node_counts[left] > 0 and node_counts[right] > 0:
            print(f"  {left} <-> {right}: {total} interactions "
                  f"(density: {between_density[(left, right)]:.4f})")

    return {
        'interactions': interactions,
        'densities': densities,
        'node_counts': node_counts
    }

def visualize_multi_column_communities(G, communities, columns, subject_main_dir, subject_type):
    """
    Visualize communities in a multi-column network with improved legend

    Node shape encodes the source column, node colour encodes the community.
    The legend panel lists the column shapes first and then the largest
    communities (at most 10, fewer if the panel runs out of vertical space).

    Parameters:
    G (networkx.Graph): The multi-column network; nodes need 'column' and 'size'
        attributes, edges need a 'weight' attribute
    communities (dict): Community assignments (node -> community id)
    columns (list): List of column names
    subject_main_dir (str/Path): Directory to save visualizations; nothing is
        saved when this is falsy
    subject_type (str): Combined subject type

    Returns:
    tuple: (figure, axis) matplotlib objects
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    import matplotlib.colors as mcolors
    import matplotlib.gridspec as gridspec
    import networkx as nx
    import numpy as np
    from collections import Counter

    # Set up the figure with GridSpec - allocate more space for the legend
    fig = plt.figure(figsize=(22, 16), facecolor='white')
    gs = gridspec.GridSpec(1, 24)
    ax_main = fig.add_subplot(gs[0, :19])  # Main graph takes 19/24 of width
    ax_legend = fig.add_subplot(gs[0, 19:])  # Legend takes 5/24 of width

    # Define node shapes and colors for each column
    shapes = ['o', '^', 's', 'd', 'p', 'h', '8']
    column_shapes = {col: shapes[i % len(shapes)] for i, col in enumerate(columns)}

    # Define distinct colors for each column (for legend)
    type_colors = list(mcolors.TABLEAU_COLORS.values())
    column_colors = {col: type_colors[i % len(type_colors)] for i, col in enumerate(columns)}

    # One colour per community, keyed by community id. Keying by id (instead of
    # indexing the colour array with it) also works when the ids are not 0..n-1;
    # for contiguous ids the assignment is the same as positional indexing.
    community_ids = sorted(set(communities.values()))
    n_communities = len(community_ids)
    palette = plt.cm.tab20(np.linspace(0, 1, n_communities))
    community_colors = dict(zip(community_ids, palette))

    # Create layout
    pos = nx.spring_layout(G, k=0.3, iterations=100, seed=42)

    # Draw edges in a fixed, faint colour; only the width reflects the weight
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    if edge_weights:
        min_weight = min(edge_weights)
        weight_span = max(edge_weights) - min_weight

        if weight_span > 0:
            edge_widths = [1 + 4 * (w - min_weight) / weight_span for w in edge_weights]
        else:
            edge_widths = [1.5] * len(edge_weights)

        nx.draw_networkx_edges(G, pos,
                               width=edge_widths,
                               edge_color='gray',
                               alpha=0.4,
                               ax=ax_main)

    # Draw nodes by column types with community colors
    for col in columns:
        shape = column_shapes[col]
        # For non-circular shapes, adjust size for visual consistency
        size_multiplier = 0.5 if shape != 'o' else 1

        for comm_id in community_ids:
            nodes = [node for node, attr in G.nodes(data=True)
                     if attr.get('column') == col and communities[node] == comm_id]
            if not nodes:
                continue

            nx.draw_networkx_nodes(G, pos,
                                   nodelist=nodes,
                                   node_size=[G.nodes[node]['size'] * size_multiplier
                                              for node in nodes],
                                   node_color=[community_colors[comm_id]] * len(nodes),
                                   node_shape=shape,
                                   alpha=0.8,
                                   linewidths=1,
                                   edgecolors='white',
                                   ax=ax_main)

    # Label only the 30 nodes with the highest degree centrality
    n_nodes = len(G.nodes())
    degree_cent = nx.degree_centrality(G)
    sorted_degree = sorted(degree_cent.items(), key=lambda x: x[1], reverse=True)
    top_degree_nodes = {node for node, _ in sorted_degree[:30]}
    labels = {node: node if node in top_degree_nodes else '' for node in G.nodes()}

    # Adjust font size based on graph size
    if n_nodes > 100:
        font_size = 8
    elif n_nodes > 50:
        font_size = 10
    else:
        font_size = 12

    nx.draw_networkx_labels(G, pos, labels=labels, font_size=font_size,
                            font_weight='bold', font_color='black', ax=ax_main)

    # ----------- IMPROVED LEGEND SECTION -----------

    # Clean up legend axis
    ax_legend.axis('off')
    ax_legend.set_xlim(0, 1)
    ax_legend.set_ylim(0, 1)

    # Add title for column section
    ax_legend.text(0.5, 0.98, "Column Types", ha='center', fontsize=14, fontweight='bold')

    # Adaptive spacing based on number of columns (guarded against an empty list)
    spacing = min(0.08, 0.7 / max(len(columns), 1))
    row_height = spacing * 1.5

    # marker -> (vertex count, rotation). RegularPolygon puts a vertex on top at
    # rotation 0, so the diamond needs no rotation; the '8' marker is flat-topped.
    polygon_specs = {
        '^': (3, 0.0),
        'd': (4, 0.0),
        'p': (5, 0.0),
        'h': (6, 0.0),
        '8': (8, np.pi / 8),
    }

    for i, col in enumerate(columns):
        shape = column_shapes[col]
        y_pos = 0.93 - (i * row_height)
        patch_style = dict(facecolor=column_colors[col], edgecolor='black', alpha=0.8)

        if shape in polygon_specs:
            n_vertices, rotation = polygon_specs[shape]
            patch = mpatches.RegularPolygon((0.15, y_pos), n_vertices, radius=0.035,
                                            orientation=rotation, **patch_style)
        elif shape == 's':
            patch = mpatches.Rectangle((0.12, y_pos - 0.03), 0.06, 0.06, **patch_style)
        else:
            # 'o' and any unknown marker
            patch = mpatches.Circle((0.15, y_pos), radius=0.03, **patch_style)

        ax_legend.add_patch(patch)

        # Format column name for legend (clean up and ensure reasonable length)
        col_name = col.replace('_', ' ').title()
        if len(col_name) > 20:  # Truncate long column names
            col_name = col_name[:18] + '...'

        # Add text label with more horizontal spacing
        ax_legend.text(0.3, y_pos, col_name, va='center', fontsize=12)

    # Add divider line between column types and communities
    community_section_start = 0.93 - (len(columns) * row_height) - 0.1
    ax_legend.axhline(y=community_section_start + 0.05, xmin=0.05, xmax=0.95,
                      color='gray', linestyle='-', linewidth=1)

    # Add title for community section
    ax_legend.text(0.5, community_section_start, "Communities", ha='center',
                   fontsize=14, fontweight='bold')

    # Communities ordered from largest to smallest
    community_counts = Counter(communities.values())
    top_communities = sorted(community_counts.items(), key=lambda x: x[1], reverse=True)

    # Show at most 10 communities, limited by the remaining vertical space.
    # Clamp at 0: with many columns the space estimate goes negative, and a
    # negative slice bound would select "all but the last k" instead of nothing.
    rows_that_fit = int((community_section_start - 0.05) / row_height)
    max_communities_to_show = max(0, min(10, len(top_communities), rows_that_fit))
    top_communities = top_communities[:max_communities_to_show]

    # Add community entries with better spacing
    for i, (comm_id, count) in enumerate(top_communities):
        y_pos = community_section_start - 0.08 - (i * row_height)

        patch = mpatches.Circle((0.15, y_pos), radius=0.03,
                                facecolor=community_colors[comm_id],
                                edgecolor='black', alpha=0.8)
        ax_legend.add_patch(patch)

        # Add text label with count
        ax_legend.text(0.3, y_pos, f"Community {comm_id} ({count})", va='center', fontsize=11)

    # If there are more communities than shown, add an indication
    if len(community_counts) > max_communities_to_show:
        remaining = len(community_counts) - max_communities_to_show
        y_pos = (community_section_start - 0.08
                 - (max_communities_to_show * row_height) - (spacing * 0.5))
        ax_legend.text(0.5, y_pos, f"+ {remaining} more communities",
                       ha='center', va='center', fontsize=10, fontstyle='italic', color='gray')

    # Add title with community information
    title = f"{subject_type.title()} Network\nColored by Communities ({n_communities} communities)"
    ax_main.set_title(title, fontsize=16)
    ax_main.axis('off')

    plt.tight_layout()

    # Create filename
    filename = f"multi_column_communities_{subject_type}.png"

    # Save the plot
    if subject_main_dir:
        save_plot(plt, filename, subject_main_dir, subject_type)

    return fig, ax_main

In [None]:
def visualize_column_interactions(column_interactions, columns, subject_main_dir, subject_type):
    """
    Visualize interactions between different column types

    Two figures are created and handed to save_plot:
    1. a symmetric heatmap of interaction densities (within-column values on
       the diagonal, between-column values off the diagonal), and
    2. a graph with one node per column, edges weighted by the number of
       between-column interactions and self-loops for within-column ones.

    Parameters:
    column_interactions (dict): The column interaction analysis results; must
        provide 'interactions' and 'densities' (each holding 'within' and
        'between' mappings) and 'node_counts'
    columns (list): List of column names
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Combined subject type

    Returns:
    bool: Always True
    """
    interactions = column_interactions['interactions']
    densities = column_interactions['densities']
    node_counts = column_interactions['node_counts']

    # list() rather than .copy() so tuples and other sequences are accepted too
    all_columns = list(columns)
    n_cols = len(all_columns)
    display_names = [col.replace('_', ' ').title() for col in all_columns]

    # 1. Create a heatmap of interaction densities
    plt.figure(figsize=(12, 10))

    density_matrix = np.zeros((n_cols, n_cols))
    within_density = densities['within']
    between_density = densities['between']

    # Diagonal: density inside each column
    for i, col in enumerate(all_columns):
        density_matrix[i, i] = within_density.get(col, 0)

    # Off-diagonal: density between each unordered pair of columns.
    # The pair is looked up in both key orders because the relation is symmetric.
    for i, col1 in enumerate(all_columns):
        for j in range(i + 1, n_cols):
            col2 = all_columns[j]
            value = between_density.get(
                (col1, col2), between_density.get((col2, col1), 0)
            )
            density_matrix[i, j] = value
            density_matrix[j, i] = value  # Mirror

    sns.heatmap(density_matrix, annot=True, fmt='.4f',
                xticklabels=display_names,
                yticklabels=display_names,
                cmap='YlGnBu')

    plt.title('Interaction Density Between Entity Types', fontsize=14)
    plt.tight_layout()

    # Save the plot
    filename = f"column_interaction_density_{subject_type}.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # 2. Create a network visualization of column interactions
    plt.figure(figsize=(12, 10))

    G_cols = nx.Graph()

    # One node per column, annotated with its number of entities
    for col in all_columns:
        G_cols.add_node(col, size=node_counts.get(col, 0))

    # Edges: interactions between columns
    for (col1, col2), weight in interactions['between'].items():
        if weight > 0:
            G_cols.add_edge(col1, col2, weight=weight)

    # Self-loops: interactions within a column
    for col, weight in interactions['within'].items():
        if weight > 0:
            G_cols.add_edge(col, col, weight=weight)

    # Few columns read better on a circle; otherwise use a seeded spring layout
    if len(all_columns) <= 3:
        pos = nx.circular_layout(G_cols)
    else:
        pos = nx.spring_layout(G_cols, k=0.9, iterations=50, seed=42)

    # Node sizes grow with entity count
    node_sizes = [1000 + (node_counts.get(col, 0) * 20) for col in G_cols.nodes()]

    # Edge widths are scaled between 1 and 10 relative to the heaviest edge
    weighted_edges = list(G_cols.edges(data='weight'))
    edge_weights = [weight for _, _, weight in weighted_edges]
    max_weight = max(edge_weights) if edge_weights else 1
    edge_widths = [1 + 9 * (weight / max_weight) for weight in edge_weights]

    # Draw the graph
    nx.draw_networkx_nodes(G_cols, pos,
                           node_size=node_sizes,
                           node_color='lightblue',
                           alpha=0.8,
                           edgecolors='white',
                           linewidths=2)

    nx.draw_networkx_edges(G_cols, pos,
                           width=edge_widths,
                           alpha=0.7,
                           edge_color='gray',
                           style='solid')

    # Edge labels show the interaction count, abbreviated with a "k" suffix from 1000 up
    edge_labels = {}
    for u, v, weight in weighted_edges:
        if weight >= 1000:
            edge_labels[(u, v)] = f"{weight/1000:.1f}k"
        else:
            edge_labels[(u, v)] = f"{weight}"

    nx.draw_networkx_edge_labels(G_cols, pos,
                                 edge_labels=edge_labels,
                                 font_size=10)

    # Node labels (column names)
    labels = {col: col.replace('_', ' ').title() for col in G_cols.nodes()}
    nx.draw_networkx_labels(G_cols, pos,
                            labels=labels,
                            font_size=12,
                            font_weight='bold')

    plt.title('Entity Type Interaction Network', fontsize=14)
    plt.axis('off')
    plt.tight_layout()

    # Save the plot
    filename = f"column_interaction_network_{subject_type}.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # Community composition is not plotted here: it needs the community
    # assignments, which this function does not receive. No empty placeholder
    # figure is opened for it any more.

    return True

def analyze_cross_column_communities(G, communities, columns):
    """
    Analyze how communities span across different column types

    Parameters:
    G (networkx.Graph): The multi-column network; every node listed in
        `communities` must carry a 'column' attribute
    communities (dict): Community assignments (node -> community id)
    columns (list): List of column names

    Returns:
    dict: Analysis of cross-column community structure with the keys
        'community_composition' (community id -> {column: node count}),
        'percentage_by_column' (community id -> {column: % of that column's
        nodes that fall in the community}),
        'cross_column_communities' (communities in which at least two columns
        reach 10% or more, sorted by number of such columns, descending) and
        'total_by_column' (column -> number of nodes in G)

    Raises:
    KeyError: If a node in `communities` is missing from G or has no 'column'
        attribute
    """
    # One tally per community, created in ascending id order so the result
    # dictionaries keep a stable, sorted key order
    community_composition = {comm_id: {} for comm_id in sorted(set(communities.values()))}

    # Single pass over the assignments (instead of re-scanning them once per community)
    for node, comm_id in communities.items():
        col = G.nodes[node]['column']
        col_counts = community_composition[comm_id]
        col_counts[col] = col_counts.get(col, 0) + 1

    # Count total nodes by column in one pass over the graph
    total_by_column = {col: 0 for col in columns}
    for _, attr in G.nodes(data=True):
        col = attr.get('column')
        if col in total_by_column:
            total_by_column[col] += 1

    # Calculate what percentage of each column's nodes are in each community
    percentage_by_column = {}
    for comm_id, col_counts in community_composition.items():
        col_percentages = {}
        for col in columns:
            total = total_by_column.get(col, 0)
            # Avoid division by zero for columns without any node in the graph
            if total > 0:
                col_percentages[col] = (col_counts.get(col, 0) / total) * 100
            else:
                col_percentages[col] = 0
        percentage_by_column[comm_id] = col_percentages

    # Find communities with good cross-column representation
    cross_column_communities = []
    for comm_id, col_percentages in percentage_by_column.items():
        # Count columns with significant representation (10% or more)
        significant_columns = sum(1 for pct in col_percentages.values() if pct >= 10)

        # Keep communities where at least two columns reach the threshold
        if significant_columns >= 2:
            cross_column_communities.append({
                'community_id': comm_id,
                'significant_columns': significant_columns,
                'column_percentages': col_percentages
            })

    # Sort by number of significant columns (stable, so ties stay in id order)
    cross_column_communities.sort(key=lambda x: x['significant_columns'], reverse=True)

    return {
        'community_composition': community_composition,
        'percentage_by_column': percentage_by_column,
        'cross_column_communities': cross_column_communities,
        'total_by_column': total_by_column
    }

def visualize_community_composition(community_analysis, columns, subject_main_dir, subject_type):
    """
    Visualize how communities are composed of different column types

    Up to three figures are created and handed to save_plot: a stacked bar
    chart of the ten largest communities, a heatmap of column percentages for
    those communities, and (only when cross-column communities exist) a
    heatmap for the first eight of them.

    Parameters:
    community_analysis (dict): The community composition analysis results
        (keys 'community_composition', 'percentage_by_column' and
        'cross_column_communities')
    columns (list): List of column names
    subject_main_dir (str/Path): Directory to save visualizations
    subject_type (str): Combined subject type

    Returns:
    bool: True when the figures were produced, False when there were no
        communities to draw
    """
    community_composition = community_analysis['community_composition']
    percentage_by_column = community_analysis['percentage_by_column']
    cross_column_communities = community_analysis['cross_column_communities']

    # Nothing to plot: bail out instead of feeding an empty matrix to the heatmap
    if not community_composition:
        print("No communities to visualize")
        return False

    # Imported locally so the function does not rely on an import from another
    # notebook cell (mcolors is not among the imports at the top of the file)
    import matplotlib.colors as mcolors

    # Get top communities by size
    community_sizes = {comm_id: sum(col_counts.values())
                       for comm_id, col_counts in community_composition.items()}

    top_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)[:10]
    top_comm_ids = [comm_id for comm_id, _ in top_communities]
    comm_labels = [f"Comm {comm_id}" for comm_id in top_comm_ids]
    column_labels = [col.replace('_', ' ').title() for col in columns]

    # 1. Create a stacked bar chart of community composition
    plt.figure(figsize=(14, 8))

    # Running top of each bar; every column is stacked on the previous ones
    bottoms = np.zeros(len(top_comm_ids))

    # Define colors for columns (cycled when there are more columns than colors)
    color_list = list(mcolors.TABLEAU_COLORS.values())
    column_colors = {col: color_list[i % len(color_list)] for i, col in enumerate(columns)}

    for col in columns:
        # Node counts of this column across the top communities
        values = [community_composition[comm_id].get(col, 0) for comm_id in top_comm_ids]

        # Skip columns that do not appear in any top community
        if sum(values) == 0:
            continue

        plt.bar(comm_labels, values, bottom=bottoms, label=col.replace('_', ' ').title(),
                color=column_colors[col])

        # Update bottoms for next layer
        bottoms += values

    plt.title('Entity Type Composition of Top Communities', fontsize=14)
    plt.xlabel('Community')
    plt.ylabel('Number of Entities')
    plt.legend(title='Entity Type')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the plot
    filename = f"community_composition_{subject_type}.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # 2. Create a heatmap of column percentages in communities
    plt.figure(figsize=(14, 10))

    heatmap_data = [
        [percentage_by_column[comm_id].get(col, 0) for col in columns]
        for comm_id in top_comm_ids
    ]

    sns.heatmap(heatmap_data, annot=True, fmt='.1f',
                xticklabels=column_labels,
                yticklabels=comm_labels,
                cmap='YlGnBu')

    plt.title('Percentage of Each Entity Type in Communities', fontsize=14)
    plt.tight_layout()

    # Save the plot
    filename = f"community_percentages_{subject_type}.png"
    save_plot(plt, filename, subject_main_dir, subject_type)

    # 3. Highlight cross-column communities
    if cross_column_communities:
        plt.figure(figsize=(14, 8))

        # Limit to the first 8 cross-column communities (slicing already
        # copes with shorter lists)
        top_cross = cross_column_communities[:8]

        cross_labels = [f"Comm {c['community_id']}" for c in top_cross]
        cross_data = [
            [c['column_percentages'].get(col, 0) for col in columns]
            for c in top_cross
        ]

        sns.heatmap(cross_data, annot=True, fmt='.1f',
                    xticklabels=column_labels,
                    yticklabels=cross_labels,
                    cmap='YlGnBu')

        plt.title('Communities with Strong Cross-Entity Type Representation', fontsize=14)
        plt.tight_layout()

        # Save the plot
        filename = f"cross_column_communities_{subject_type}.png"
        save_plot(plt, filename, subject_main_dir, subject_type)

    return True

In [None]:
def _flatten_nested(items):
    """Return the non-list elements of *items* as one flat list, at any nesting depth."""
    flat = []
    for item in items:
        if isinstance(item, (list, tuple)):
            flat.extend(_flatten_nested(item))
        else:
            flat.append(item)
    return flat


def flatten_countries_column(df):
    """
    Convert the nested list structure in the 'Countries' column to a flat list.

    The result is written to a new 'Countries_flat' column; the input
    DataFrame and its 'Countries' column are left untouched. Lists nested to
    any depth are flattened, missing rows (None/NaN) become empty lists and a
    bare scalar is wrapped into a one-element list.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the 'Countries' column

    Returns:
    pandas.DataFrame: DataFrame with flattened 'Countries' column
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Get the Countries column as processed lists
    countries_lists = preprocess_column(df_copy, 'Countries')

    flattened_countries = []
    for countries in countries_lists:
        if isinstance(countries, (list, tuple)):
            flattened_countries.append(_flatten_nested(countries))
        elif countries is None or (isinstance(countries, float) and countries != countries):
            # Missing value (None or NaN): no countries for this row
            flattened_countries.append([])
        else:
            # A bare value such as a single country string: keep it as one
            # entry instead of iterating over its characters
            flattened_countries.append([countries])

    # Update the DataFrame with the flattened lists
    df_copy['Countries_flat'] = flattened_countries

    # Print some statistics
    total_countries = sum(len(countries) for countries in flattened_countries)
    unique_countries = len({country for countries in flattened_countries for country in countries})

    print(f"Flattened 'Countries' column: {total_countries} total country mentions")
    print(f"Found {unique_countries} unique countries")

    return df_copy

# Build the flattened copy once; the network cells below read 'Countries_flat' from it
df_flattened = flatten_countries_column(df)


In [None]:
# Institutions + Countries network, built from df_flattened ('Countries_flat' column).
# NOTE: results are stored in the *_at variables, which the next pipeline in
# this cell rebinds.
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df_flattened,
    list_columns=['Institutions', 'Countries_flat'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Institutions': 10, 
        'Countries_flat': 10
    },
    overall_top_n=20
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at, 
    columns_at, 
    subject_main_dir, 
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)

# Authors + Institutions + Countries network, built from df_flattened.
# NOTE: rebinds the *_at variables.
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df_flattened,
    list_columns=['Authors', 'Institutions', 'Countries_flat'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Authors': 10,
        'Institutions': 10, 
        'Countries_flat': 5
    },
    overall_top_n=25
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at, 
    columns_at, 
    subject_main_dir, 
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)



In [None]:
# Authors + Topics network, built from df.
# NOTE: results are stored in the *_at variables, which the next pipeline in
# this cell rebinds.
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df,
    list_columns=['Authors', 'Topics'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Authors': 10,
        'Topics': 10
    },
    overall_top_n=20 
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at, 
    columns_at, 
    subject_main_dir, 
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)

# Authors + Fields network, built from df.
# NOTE: rebinds the *_at variables.
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df,
    list_columns=['Authors', 'Fields'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Authors': 10, 
        'Fields': 10
    },
    overall_top_n=20 
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at, 
    columns_at, 
    subject_main_dir, 
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)

In [None]:
# EXAMPLE: Analyzing Institutions with Fields and Sub-fields
# The banner below labels this run's output; results are kept in the *_ifs variables.
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: INSTITUTIONS + FIELDS + SUB-FIELDS")
print("="*50)

# Create multi-column network (no overall_top_n is passed for this run)
G_ifs, columns_ifs, subject_type_ifs, node_column_map_ifs = create_multi_column_network(
    df,
    list_columns=['Institutions', 'Fields', 'Sub-fields'],  # List columns
    dict_column=None,                                      # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                                     # Only include items with at least 3 occurrences
    top_n_per_column={                                     # Limit items per column
        'Institutions': 10,
        'Fields': 5,
        'Sub-fields': 5
    }
)

# Visualize the multi-column network (the returned figure/axes are not kept)
visualize_multi_column_graph(G_ifs, columns_ifs, subject_main_dir, subject_type_ifs)

# Analyze the multi-column network
analysis_ifs = analyze_multi_column_network(
    G_ifs, 
    columns_ifs, 
    subject_main_dir, 
    subject_type_ifs,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_ifs = analyze_cross_column_communities(
    G_ifs,
    analysis_ifs['communities'],
    columns_ifs
)

# Visualize community composition
visualize_community_composition(
    community_analysis_ifs,
    columns_ifs,
    subject_main_dir,
    subject_type_ifs
)

In [None]:
# Example usage of the multi-column network analysis functions

# 1. EXAMPLE: Analyzing Fields, Domains, and Topics together
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: FIELDS + DOMAINS + TOPICS")
print("="*50)

# Create multi-column network
G_cfd, columns_cfd, subject_type_cfd, node_column_map_cfd = create_multi_column_network(
    df,
    list_columns=['Fields', 'Domains', 'Topics'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=5,                   # Only include items with at least 5 occurrences
    top_n_per_column={                   # Limit items per column
        'Fields': 5,
        'Domains': 5,
        'Topics': 5
    },
    overall_top_n=20                     # Overall limit on total nodes
)

# Visualize the multi-column network
fig, ax_main = visualize_multi_column_graph(G_cfd, columns_cfd, subject_main_dir, subject_type_cfd)
# legend_ax = fix_legend_display(fig, ax_main)

# Analyze the multi-column network
analysis_cfd = analyze_multi_column_network(
    G_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd,
    min_clique_size=3
)

# Analyze cross-column communities
community_analysis_cfd = analyze_cross_column_communities(
    G_cfd,
    analysis_cfd['communities'],
    columns_cfd
)

# Visualize community composition
visualize_community_composition(
    community_analysis_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd
)

# 2. EXAMPLE: Analyzing Authors and Concepts
# This run combines the 'Authors' list column with the 'concept_dict'
# dictionary column, so the banner names concepts rather than topics.
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: AUTHORS + CONCEPTS")
print("="*50)

# Create multi-column network
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df,
    list_columns=['Authors'],            # List columns
    dict_column='concept_dict',          # Dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Authors': 10
    },
    overall_top_n=20
)

# Visualize the multi-column network
fig, ax_main = visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at,
    columns_at,
    subject_main_dir,
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)

# 3. EXAMPLE: Analyzing Institutions with Topics
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: INSTITUTIONS + TOPICS")
print("="*50)

# Create multi-column network
G_ifs, columns_ifs, subject_type_ifs, node_column_map_ifs = create_multi_column_network(
    df,
    list_columns=['Institutions', 'Topics'],  # List columns
    dict_column=None,                         # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                        # Only include items with at least 3 occurrences
    top_n_per_column={                        # Limit items per column
        'Institutions': 10,
        'Topics': 10
    }
)

# Visualize the multi-column network
visualize_multi_column_graph(G_ifs, columns_ifs, subject_main_dir, subject_type_ifs)

# Analyze the multi-column network
analysis_ifs = analyze_multi_column_network(
    G_ifs,
    columns_ifs,
    subject_main_dir,
    subject_type_ifs,
    min_clique_size=3
)

# Analyze cross-column communities
community_analysis_ifs = analyze_cross_column_communities(
    G_ifs,
    analysis_ifs['communities'],
    columns_ifs
)

# Visualize community composition
visualize_community_composition(
    community_analysis_ifs,
    columns_ifs,
    subject_main_dir,
    subject_type_ifs
)

# 4. EXAMPLE: Linking Institutions and Countries
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: INSTITUTIONS + COUNTRIES")
print("="*50)

# Create multi-column network
G_aic, columns_aic, subject_type_aic, node_column_map_aic = create_multi_column_network(
    df_flattened,
    list_columns=['Institutions', 'Countries_flat'],  # List columns
    dict_column=None,                         # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                        # Only include items with at least 3 occurrences
    top_n_per_column={                        # Limit items per column
        'Institutions': 10,
        'Countries_flat': 10
    },
    overall_top_n=100                         # Overall limit on total nodes
)

# Visualize the multi-column network
visualize_multi_column_graph(G_aic, columns_aic, subject_main_dir, subject_type_aic)

# Analyze the multi-column network
analysis_aic = analyze_multi_column_network(
    G_aic,
    columns_aic,
    subject_main_dir,
    subject_type_aic,
    min_clique_size=3
)

# 5. CUSTOM ANALYSIS: Finding bridging concepts between fields
# TODO: only the banner is printed here; no bridging-concept analysis follows it yet.
print("\n" + "="*50)
print("CUSTOM ANALYSIS: BRIDGING CONCEPTS BETWEEN FIELDS")
print("="*50)

# 6. EXAMPLE: Analyzing Fields and Sub-fields together
# NOTE: rebinds the *_cfd variables.
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: FIELDS + SUB-FIELDS")
print("="*50)

# Create multi-column network
G_cfd, columns_cfd, subject_type_cfd, node_column_map_cfd = create_multi_column_network(
    df,
    list_columns=['Fields', 'Sub-fields'],  # List columns
    dict_column=None,          # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=5,                   # Only include items with at least 5 occurrences
    top_n_per_column={                   # Limit items per column
        'Fields': 10,
        'Sub-fields': 10
    },
    overall_top_n=20                  # Overall limit on total nodes
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_cfd, columns_cfd, subject_main_dir, subject_type_cfd)
# legend_ax = fix_legend_display(fig, ax_main)

# Analyze the multi-column network
analysis_cfd = analyze_multi_column_network(
    G_cfd, 
    columns_cfd, 
    subject_main_dir, 
    subject_type_cfd,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_cfd = analyze_cross_column_communities(
    G_cfd,
    analysis_cfd['communities'],
    columns_cfd
)

# Visualize community composition
visualize_community_composition(
    community_analysis_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd
)

In [None]:
# Example usage of the multi-column network analysis functions

# 1. EXAMPLE: Analyzing Fields, Domains, and Topics together
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: FIELDS + DOMAINS + TOPICS")
print("="*50)

# Create multi-column network
G_cfd, columns_cfd, subject_type_cfd, node_column_map_cfd = create_multi_column_network(
    df,
    list_columns=['Fields', 'Domains', 'Topics'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=5,                   # Only include items with at least 5 occurrences
    top_n_per_column={                   # Limit items per column
        'Fields': 5,
        'Domains': 5,
        'Topics': 5
    },
    overall_top_n=20                     # Overall limit on total nodes
)

# Visualize the multi-column network
fig, ax_main = visualize_multi_column_graph(G_cfd, columns_cfd, subject_main_dir, subject_type_cfd)
# legend_ax = fix_legend_display(fig, ax_main)

# Analyze the multi-column network
analysis_cfd = analyze_multi_column_network(
    G_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd,
    min_clique_size=3
)

# Analyze cross-column communities
community_analysis_cfd = analyze_cross_column_communities(
    G_cfd,
    analysis_cfd['communities'],
    columns_cfd
)

# Visualize community composition
visualize_community_composition(
    community_analysis_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd
)

# 2. EXAMPLE: Analyzing Authors and Topics
# Results are kept in the *_at variables; no overall_top_n is passed for this run.
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: AUTHORS + TOPICS")
print("="*50)

# Create multi-column network
G_at, columns_at, subject_type_at, node_column_map_at = create_multi_column_network(
    df,
    list_columns=['Authors', 'Topics'],  # List columns
    dict_column=None,                    # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                   # Only include items with at least 3 occurrences
    top_n_per_column={                   # Limit items per column
        'Authors': 10,
        'Topics': 10
    }
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_at, columns_at, subject_main_dir, subject_type_at)
# legend_ax = fix_legend_display(fig, ax_main)
# Analyze the multi-column network
analysis_at = analyze_multi_column_network(
    G_at, 
    columns_at, 
    subject_main_dir, 
    subject_type_at,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_at = analyze_cross_column_communities(
    G_at,
    analysis_at['communities'],
    columns_at
)

# Visualize community composition
visualize_community_composition(
    community_analysis_at,
    columns_at,
    subject_main_dir,
    subject_type_at
)

# 3. EXAMPLE: Analyzing Institutions with Topics
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: INSTITUTIONS + TOPICS")
print("="*50)

# Create multi-column network
G_ifs, columns_ifs, subject_type_ifs, node_column_map_ifs = create_multi_column_network(
    df,
    list_columns=['Institutions', 'Topics'],  # List columns
    dict_column=None,                         # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                        # Only include items with at least 3 occurrences
    top_n_per_column={                        # Limit items per column
        'Institutions': 10,
        'Topics': 10
    }
)

# Visualize the multi-column network
visualize_multi_column_graph(G_ifs, columns_ifs, subject_main_dir, subject_type_ifs)

# Analyze the multi-column network
analysis_ifs = analyze_multi_column_network(
    G_ifs,
    columns_ifs,
    subject_main_dir,
    subject_type_ifs,
    min_clique_size=3
)

# Analyze cross-column communities
community_analysis_ifs = analyze_cross_column_communities(
    G_ifs,
    analysis_ifs['communities'],
    columns_ifs
)

# Visualize community composition
visualize_community_composition(
    community_analysis_ifs,
    columns_ifs,
    subject_main_dir,
    subject_type_ifs
)

# 4. EXAMPLE: Linking Institutions and Countries
# (banner corrected to name the columns that are actually analyzed)
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: INSTITUTIONS + COUNTRIES")
print("="*50)

# Create multi-column network
G_aic, columns_aic, subject_type_aic, node_column_map_aic = create_multi_column_network(
    df_flattened,
    list_columns=['Institutions', 'Countries_flat'],  # List columns
    dict_column=None,                         # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=3,                        # Only include items with at least 3 occurrences
    top_n_per_column={                        # Limit items per column
        'Institutions': 10,
        'Countries_flat': 10
    },
    overall_top_n=100                         # Overall limit on total nodes
)

# Visualize the multi-column network
visualize_multi_column_graph(G_aic, columns_aic, subject_main_dir, subject_type_aic)

# Analyze the multi-column network
analysis_aic = analyze_multi_column_network(
    G_aic,
    columns_aic,
    subject_main_dir,
    subject_type_aic,
    min_clique_size=3
)

# 5. CUSTOM ANALYSIS: Finding bridging concepts between fields
# TODO: only the banner is printed here; no bridging-concept analysis follows it yet.
print("\n" + "="*50)
print("CUSTOM ANALYSIS: BRIDGING CONCEPTS BETWEEN FIELDS")
print("="*50)

# 6. EXAMPLE: Analyzing Fields and Sub-fields together
# NOTE: rebinds the *_cfd variables.
print("\n" + "="*50)
print("MULTI-COLUMN NETWORK ANALYSIS: FIELDS + SUB-FIELDS")
print("="*50)

# Create multi-column network
G_cfd, columns_cfd, subject_type_cfd, node_column_map_cfd = create_multi_column_network(
    df,
    list_columns=['Fields', 'Sub-fields'],  # List columns
    dict_column=None,          # No dictionary column
    subject_main_dir=subject_main_dir,
    min_occurrences=5,                   # Only include items with at least 5 occurrences
    top_n_per_column={                   # Limit items per column
        'Fields': 10,
        'Sub-fields': 10
    },
    overall_top_n=20                  # Overall limit on total nodes
)

# Visualize the multi-column network
fig, ax_main=visualize_multi_column_graph(G_cfd, columns_cfd, subject_main_dir, subject_type_cfd)
# legend_ax = fix_legend_display(fig, ax_main)

# Analyze the multi-column network
analysis_cfd = analyze_multi_column_network(
    G_cfd, 
    columns_cfd, 
    subject_main_dir, 
    subject_type_cfd,
    min_clique_size=3
)

# Analyze cross-column communities (uses the 'communities' entry of the analysis result)
community_analysis_cfd = analyze_cross_column_communities(
    G_cfd,
    analysis_cfd['communities'],
    columns_cfd
)

# Visualize community composition
visualize_community_composition(
    community_analysis_cfd,
    columns_cfd,
    subject_main_dir,
    subject_type_cfd
)

# Concepts analysis

In [None]:
def analyze_top_concepts(df, column='concept_dict', n=20, subject_main_dir=subject_main_dir):
    """
    Analyze and visualize the top n common concepts from the concept dictionary column.

    Each cell of ``df[column]`` may be a dict or the string representation of a
    dict. Cells that cannot be turned into a dict (malformed strings, NaN,
    lists, ...) are skipped instead of aborting the whole analysis.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data
    column (str): The name of the column containing concept dictionaries
    n (int): Number of top concepts to display
    subject_main_dir (str/Path): Main directory for all subject analyses

    Returns:
    pandas.Series: Series containing the counts of the top n concepts
        (empty when no concept could be extracted; no plot is saved then)
    """
    import pandas as pd
    import ast
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Extract subject type from column name
    subject_type = column.lower().split('_')[0] + 's'  # Assuming concept_dict -> concepts

    # Collect every concept name (dict key) across all rows
    all_concepts = []
    for cell in df[column]:
        if isinstance(cell, str):
            try:
                # Convert string representation of dict to actual dict
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Skip strings that are not valid Python literals
                continue

        # Missing values read from CSV arrive as float NaN; those and any other
        # non-dict values have no keys to count.
        if not isinstance(cell, dict):
            continue

        all_concepts.extend(cell.keys())

    # Count the occurrences of each concept (explicit dtype avoids the
    # empty-Series dtype warning on older pandas)
    concept_counts = pd.Series(all_concepts, dtype=object).value_counts()

    # Get the top n concepts
    top_concepts = concept_counts.head(n)

    # Nothing to draw: return the empty result rather than plotting empty data
    if top_concepts.empty:
        print(f"No concepts found in column '{column}'")
        return top_concepts

    # Create the visualization
    plt.figure(figsize=(12, 8))

    # Create a horizontal bar chart
    ax = sns.barplot(x=top_concepts.values, y=top_concepts.index, palette='viridis')

    # Add labels and title
    plt.title(f'Top {n} Common Concepts', fontsize=16)
    plt.xlabel('Count', fontsize=12)
    plt.ylabel('Concept', fontsize=12)

    # Add count values to the end of each bar
    for i, v in enumerate(top_concepts.values):
        ax.text(v + 0.5, i, str(v), va='center')

    # Save the plot
    save_plot(plt, f"top_{n}_common_concepts.png", subject_main_dir, subject_type)

    # Return the data for further analysis if needed
    return top_concepts

# Example usage: plot the 20 most frequent concepts of the loaded dataset and
# keep their counts for later inspection
top_concepts = analyze_top_concepts(df, column='concept_dict', n=20, subject_main_dir=subject_main_dir)

## General graph

In [None]:
# Build the concept network from the 'concept_dict' column and print its
# statistics
concepts_graph = create_network_visualization(
    df, 
    column_name='concept_dict', 
    subject_main_dir=subject_main_dir,
    top_n=50,  # Limit to top 50 most frequent concepts
    min_occurrences=5  # Only include concepts that appear at least 5 times
)
print_network_stats(concepts_graph)

In [None]:
# Community analysis of the concept network built in the previous cell
print("\n" + "="*50)
print("CONCEPTS COMMUNITY ANALYSIS")
print("="*50)
# The call returns three values: communities, cliques and community statistics
concept_communities, concept_cliques, concept_comm_stats = analyze_network_communities(
    concepts_graph, 
    subject_main_dir, 
    "concepts", 
    min_clique_size=3
)

# Visualize concept community statistics
visualize_community_stats(concept_comm_stats, subject_main_dir, "concepts")

# Find concept bridge nodes
concept_bridges = find_community_bridges(concepts_graph, concept_communities, subject_main_dir, "concepts")

In [None]:
# Temporal analysis of Concepts
print("\n" + "="*50)
print("TEMPORAL ANALYSIS OF CONCEPTS")
print("="*50)

# The result is indexed below by the keys 'persistent_items' and
# 'persistent_connections'
concept_temporal_results = analyze_temporal_networks(
    df,
    column_name='concept_dict',
    subject_main_dir=subject_main_dir,
    decade_col='decade',  # Using the existing decade column
    min_occurrence=3,     # Only include concepts that appear at least 3 times in a decade
    top_n=50,             # Limit to top 50 most frequent concepts per decade
    min_clique_size=3     # Consider cliques of size 3 or larger as significant
)

# Print some insights from the temporal analysis
print(f"\nFound {len(concept_temporal_results['persistent_items'])} concepts that persisted across multiple decades")
print(f"Found {len(concept_temporal_results['persistent_connections'])} persistent concept connections")

# Get the most persistent concepts (present in most decades):
# 'persistent_items' is used as a mapping item -> collection of decades, so
# rank items by the size of that collection and keep the first 10
most_persistent_concepts = sorted(
    [(item, len(decades)) for item, decades in concept_temporal_results['persistent_items'].items()],
    key=lambda x: x[1], 
    reverse=True
)[:10]

print("\nMost persistent concepts across decades:")
for concept, num_decades in most_persistent_concepts:
    print(f"  {concept}: present in {num_decades} decades")


## Score-based graph

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict

def visualize_concept_network(df, concept_dict_column, 
                             top_n=20, subject_main_dir="output", 
                             filename_prefix="concept_network"):
    """
    Visualize a network of top concepts based on their scores and co-appearances

    Every cell of ``df[concept_dict_column]`` is read as a dict mapping
    concept -> score, or as the string representation of such a dict (parsed
    with ``ast.literal_eval``, never ``eval``). Cells that do not yield a dict
    are skipped. Node size/colour encode the summed score of a concept, edge
    width/opacity encode how many rows contain both endpoints.

    Parameters:
    df (DataFrame): DataFrame containing the concept dictionaries
    concept_dict_column (str): Column in df containing dictionaries of concepts and their scores
    top_n (int): Number of top concepts to include in the visualization
    subject_main_dir (str/Path): Main directory for saving visualizations
    filename_prefix (str): Prefix for saved files

    Returns:
    nx.Graph: The created network graph (empty, and nothing is plotted, when
        no concept could be extracted)
    """
    # Local import: this cell's own import list does not provide ast
    import ast

    # Step 1: Parse every row once; the parsed dicts feed both the score
    # aggregation and the co-appearance count below
    concept_rows = []
    for raw_value in df[concept_dict_column]:
        if isinstance(raw_value, str):
            try:
                # literal_eval only accepts Python literals, so text loaded
                # from a CSV cannot execute arbitrary code here
                raw_value = ast.literal_eval(raw_value)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue

        # Skip if not a dictionary (e.g. NaN for missing values)
        if isinstance(raw_value, dict):
            concept_rows.append(raw_value)

    # Sum the scores of each concept across all rows
    aggregated_concepts = defaultdict(float)
    for concept_dict in concept_rows:
        for concept, score in concept_dict.items():
            aggregated_concepts[concept] += score

    # Get top N concepts by aggregated score
    top_concepts = sorted(aggregated_concepts.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Create graph
    G = nx.Graph()

    # Without any concept there is nothing to normalise or draw
    if not top_concepts:
        print("No concepts found; skipping concept network visualization")
        return G

    # Add nodes with sizes based on scores
    top_scores = [score for _, score in top_concepts]
    max_score = max(top_scores)
    min_score = min(top_scores)
    # Fall back to 1 so a single distinct score does not divide by zero
    score_range = max_score - min_score if max_score != min_score else 1

    for concept, score in top_concepts:
        # Normalize scores to node sizes (between 500 and 3000)
        normalized_score = 500 + ((score - min_score) / score_range) * 2500
        G.add_node(concept, score=score, size=normalized_score)

    # Calculate co-appearances from the concept dictionaries
    # Create set of top concept names for efficient lookup
    top_concept_names = {concept for concept, _ in top_concepts}

    # Count co-appearances: one count per row for every pair of top concepts
    # that both occur in that row
    co_appearances = defaultdict(int)
    for concept_dict in concept_rows:
        row_concepts = [c for c in concept_dict if c in top_concept_names]

        for i, concept1 in enumerate(row_concepts):
            for concept2 in row_concepts[i+1:]:
                # Create a sorted tuple to ensure consistent key ordering
                pair = tuple(sorted([concept1, concept2]))
                co_appearances[pair] += 1

    # Add edges to graph (pairs only ever contain top-N concepts)
    for (concept1, concept2), weight in co_appearances.items():
        G.add_edge(concept1, concept2, weight=weight)

    # Draw and save the network visualization
    plt.figure(figsize=(16, 14))

    # Set positions using spring layout (fixed seed for reproducible layout)
    pos = nx.spring_layout(G, k=0.3, seed=42)

    # Draw nodes with sizes based on scores
    node_sizes = [G.nodes[node]['size'] for node in G.nodes()]

    # Use a color gradient for nodes based on score
    node_colors = []
    for node in G.nodes():
        # Normalize score between 0 and 1
        norm_score = (G.nodes[node]['score'] - min_score) / score_range
        # Use blue gradient (lighter blue for lower scores, darker for higher)
        node_colors.append(plt.cm.Blues(0.5 + norm_score/2))  # Start from middle of colormap

    nx.draw_networkx_nodes(G, pos, 
                          node_size=node_sizes, 
                          node_color=node_colors, 
                          alpha=0.8,
                          edgecolors='white',
                          linewidths=1.5)

    # Draw edges with weights determining thickness and color intensity
    weights = [data['weight'] for _, _, data in G.edges(data=True)]

    if weights:
        max_weight = max(weights)
        min_weight = min(weights)
        weight_range = max_weight - min_weight if max_weight != min_weight else 1

        for (u, v, data) in G.edges(data=True):
            # Normalize weight for visual properties
            norm_weight = (data['weight'] - min_weight) / weight_range
            # Width between 1 and 5
            width = 1 + norm_weight * 4
            # Alpha between 0.3 and 0.9
            alpha = 0.3 + norm_weight * 0.6

            # Drawn one edge at a time so each edge gets its own width/alpha
            nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], 
                                  width=width, 
                                  alpha=alpha,
                                  edge_color='gray')

    # Draw labels with font sizes scaled by node size (from 9 to 14)
    for node in G.nodes():
        size_ratio = (G.nodes[node]['size'] - 500) / 2500  # Normalized between 0 and 1
        plt.text(pos[node][0], pos[node][1], node,
                 fontsize=9 + size_ratio * 5,
                 ha='center', va='center',
                 bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', boxstyle='round,pad=0.3'))

    # Add legend for node sizes
    sizes = [500, 1500, 3000]  # Small, medium, large
    size_labels = [
        f"Low Score ({min_score:.2f})",
        "Medium Score",
        f"High Score ({max_score:.2f})"
    ]

    # Create legend handles
    legend_handles = []
    for size, label in zip(sizes, size_labels):
        # node_size is an area in points^2 while markersize is a diameter in
        # points, so sqrt(size) is the on-plot diameter; divide by 3 to keep
        # the legend compact but still visible (the previous /50 produced
        # markers of roughly one point or less)
        handle = plt.Line2D([0], [0], marker='o', color='w', 
                          label=label, 
                          markerfacecolor=plt.cm.Blues(0.7),
                          markersize=np.sqrt(size) / 3)
        legend_handles.append(handle)

    # Add legend for edge weights if we have co-appearance data
    if weights:
        edge_weights = [min_weight, (min_weight + max_weight)/2, max_weight]
        edge_labels = [
            f"Few Co-appearances ({min_weight})",
            "Medium Co-appearances",
            f"Many Co-appearances ({max_weight})"
        ]

        # Add line handles to legend, styled exactly like the drawn edges
        for weight, label in zip(edge_weights, edge_labels):
            norm_weight = (weight - min_weight) / weight_range
            handle = plt.Line2D([0], [0], color='gray', 
                              linewidth=1 + norm_weight * 4, 
                              alpha=0.3 + norm_weight * 0.6,
                              label=label)
            legend_handles.append(handle)

    plt.legend(handles=legend_handles, loc='upper right', fontsize=10)

    plt.title(f"Top {top_n} Concepts Network by Score", fontsize=16)
    plt.axis('off')

    # Save the visualization
    save_plot(plt, f"{filename_prefix}.png", subject_main_dir, "concepts")

    return G

# Draw the score-based network of the top 20 concepts; the returned graph is
# not stored here
visualize_concept_network(
    df=df,
    concept_dict_column="concept_dict",
    top_n=20,
    subject_main_dir=subject_main_dir
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
import networkx as nx

def analyze_concept_trends_by_decade(df, concept_dict_column, decade_column, 
                                    top_n=20, subject_main_dir="output"):
    """
    Analyze temporal trends of concept scores and co-appearances across decades

    The top ``top_n`` concepts are chosen by their score summed over the whole
    dataset. For each decade the function then sums the scores of those
    concepts and counts, per row, every pair of top concepts that appear
    together. Cells of ``concept_dict_column`` may be dicts or string
    representations of dicts (parsed with ``ast.literal_eval``); anything else
    is skipped.

    Parameters:
    df (DataFrame): DataFrame containing concept dictionaries and decade information
    concept_dict_column (str): Column containing dictionaries of concepts and scores
    decade_column (str): Column containing decade information (e.g., "1980s", "1990s")
    top_n (int): Number of top concepts to analyze
    subject_main_dir (str/Path): Directory to save visualizations

    Returns:
    tuple: (concept_trends_df, co_appearance_trends_df), both indexed by the
        sorted decades; columns are concepts and "A & B" pair labels
    """
    # Local import: this cell's own import list does not provide ast
    import ast

    # Make a copy to avoid modifying original
    df = df.copy()

    # Ensure decade column is properly formatted
    if df[decade_column].dtype != 'object':
        df[decade_column] = df[decade_column].astype(str)

    # Get sorted unique decades; missing decades are dropped because NaN
    # cannot be ordered against strings and never matched a row anyway
    all_decades = sorted(df[decade_column].dropna().unique())
    decade_position = {decade: idx for idx, decade in enumerate(all_decades)}
    n_decades = len(all_decades)

    # Step 1: Parse every row once. The overall aggregation uses all parsed
    # rows; rows are also bucketed per decade (in original row order) for the
    # per-decade passes below.
    aggregated_concepts = defaultdict(float)
    rows_by_decade = [[] for _ in all_decades]

    for decade, raw_value in zip(df[decade_column], df[concept_dict_column]):
        if isinstance(raw_value, str):
            try:
                # literal_eval only accepts Python literals, unlike eval
                raw_value = ast.literal_eval(raw_value)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue

        # Skip if not a dictionary
        if not isinstance(raw_value, dict):
            continue

        for concept, score in raw_value.items():
            aggregated_concepts[concept] += score

        position = decade_position.get(decade)
        if position is not None:
            rows_by_decade[position].append(raw_value)

    # Get top N concepts by aggregated score
    top_concepts = [concept for concept, _ in 
                   sorted(aggregated_concepts.items(), 
                         key=lambda x: x[1], reverse=True)[:top_n]]
    top_concept_set = set(top_concepts)  # O(1) membership tests in the loops

    # Steps 2-3: Per-decade score sums and co-appearance counts.
    # Every series is pre-sized to one slot per decade and written by decade
    # index, so a value can never end up under the wrong decade.
    concept_trends = {concept: [0.0] * n_decades for concept in top_concepts}
    co_appearance_trends = {}

    for position, decade_rows in enumerate(rows_by_decade):
        for concept_dict in decade_rows:
            # Top concepts present in this row, in the row's own key order
            row_concepts = []
            for concept, score in concept_dict.items():
                if concept in top_concept_set:
                    concept_trends[concept][position] += score
                    row_concepts.append(concept)

            # Count co-appearances for each pair
            for i, concept1 in enumerate(row_concepts):
                for concept2 in row_concepts[i+1:]:
                    # Create a sorted tuple to ensure consistent key ordering
                    pair = tuple(sorted([concept1, concept2]))
                    if pair not in co_appearance_trends:
                        co_appearance_trends[pair] = [0] * n_decades
                    co_appearance_trends[pair][position] += 1

    # Create dataframe from trend data
    trend_df = pd.DataFrame(concept_trends, index=all_decades)

    # Create dataframe from co-appearance data, using "A & B" as column names
    co_appearance_df = pd.DataFrame(
        {f"{pair[0]} & {pair[1]}": counts for pair, counts in co_appearance_trends.items()},
        index=all_decades
    )

    # Nothing to plot when no concept could be extracted
    if not top_concepts:
        print("No concepts found; skipping trend visualizations")
        return trend_df, co_appearance_df

    # Step 4: Visualize the results

    # 4.1: Visualize concept score trends
    visualize_concept_trends(trend_df, subject_main_dir, top_n)

    # 4.2: Visualize co-appearance trends for top pairs
    visualize_co_appearance_trends(co_appearance_df, subject_main_dir, top_n)

    # 4.3: Create network evolution visualization
    if len(all_decades) > 1:
        visualize_network_evolution(df, concept_dict_column, decade_column, 
                                  all_decades, top_concepts, subject_main_dir)

    return trend_df, co_appearance_df

def visualize_concept_trends(trend_df, subject_main_dir, top_n=20):
    """
    Save four charts describing how concept scores change across decades.

    ``trend_df`` is read with decades as the index and one column per concept.
    All files are written to ``<subject_main_dir>/concepts``: a line plot, a
    stacked area chart and a peak-normalized line plot for at most 10 concepts
    with the highest column totals, plus a heatmap of the first ``top_n``
    concepts ranked by the same totals.
    """
    out_dir = Path(subject_main_dir) / "concepts"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Rank concepts by total score over all decades; line-style charts are
    # capped at 10 series to stay readable
    ranked_totals = trend_df.sum().sort_values(ascending=False)
    display_limit = min(10, len(trend_df.columns))
    leading = ranked_totals.index[:display_limit]

    def _finish(title, ylabel, filename, with_legend_and_grid=True):
        # Shared tail of every chart: labels, optional legend/grid, save, close
        plt.title(title, fontsize=16)
        plt.xlabel('Decade', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        if with_legend_and_grid:
            plt.legend(title='Concepts', bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(out_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    # 1. Raw score of each leading concept, one line per concept
    plt.figure(figsize=(14, 8))
    for name in leading:
        plt.plot(trend_df.index, trend_df[name], marker='o', linewidth=2, label=name)
    _finish(f'Top {display_limit} Concept Scores Across Decades',
            'Aggregated Score',
            f'concept_trends_line_top{display_limit}.png')

    # 2. Heatmap (concepts on rows, decades on columns) of the top_n concepts
    plt.figure(figsize=(16, 10))
    heat_source = trend_df[ranked_totals.index[:top_n]]
    sns.heatmap(heat_source.T, cmap='YlOrRd', annot=False, 
               linewidths=0.5, cbar_kws={'label': 'Score'})
    _finish(f'Heatmap of Top {top_n} Concept Scores Across Decades',
            'Concept',
            f'concept_trends_heatmap_top{top_n}.png',
            with_legend_and_grid=False)

    # 3. Stacked area chart of the leading concepts
    plt.figure(figsize=(14, 8))
    stacked_source = trend_df[leading]
    layers = [stacked_source[name] for name in leading]
    plt.stackplot(stacked_source.index, layers, labels=leading, alpha=0.8)
    _finish(f'Relative Importance of Top {display_limit} Concepts Across Decades',
            'Aggregated Score',
            f'concept_trends_area_top{display_limit}.png')

    # 4. Each leading concept as a percentage of its own peak, so trends are
    #    comparable regardless of volume
    plt.figure(figsize=(14, 8))
    scaled = trend_df[leading].copy()
    for name in leading:
        peak = scaled[name].max()
        if peak > 0:  # a non-positive peak is left unscaled (no division by zero)
            scaled[name] = (scaled[name] / peak) * 100
    for name in leading:
        plt.plot(scaled.index, scaled[name], marker='o', linewidth=2, label=name)
    _finish(f'Normalized Trends of Top {display_limit} Concepts (% of Peak Value)',
            'Percentage of Peak Value',
            f'concept_trends_normalized_top{display_limit}.png')

def visualize_co_appearance_trends(co_appearance_df, subject_main_dir, top_n=20):
    """
    Save three charts describing concept-pair co-appearances across decades.

    ``co_appearance_df`` is read with decades as the index and one column per
    concept pair. Files go to ``<subject_main_dir>/concepts``: a line plot of
    at most 8 pairs with the highest totals, then a heatmap and a stacked bar
    chart of at most ``top_n`` pairs. An empty frame only prints a message.
    """
    out_dir = Path(subject_main_dir) / "concepts"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to draw without any pair
    if co_appearance_df.empty:
        print("No co-appearance data to visualize")
        return

    # Pairs ranked by their total count over all decades
    ranked_pairs = co_appearance_df.sum().sort_values(ascending=False)
    display_limit = min(8, len(co_appearance_df.columns))

    def _save(filename):
        # Common tail of every chart
        plt.tight_layout()
        plt.savefig(out_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    # 1. One line per leading pair
    plt.figure(figsize=(14, 8))
    for label in ranked_pairs.index[:display_limit]:
        plt.plot(co_appearance_df.index, co_appearance_df[label], marker='o', 
                linewidth=2, label=label)
    plt.title(f'Top {display_limit} Concept Co-appearance Trends Across Decades', fontsize=16)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Co-appearance Count', fontsize=12)
    plt.legend(title='Concept Pairs', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    _save(f'coappearance_trends_line_top{display_limit}.png')

    # 2. Heatmap (pairs on rows, decades on columns), capped at top_n pairs
    plt.figure(figsize=(16, 12))
    heatmap_pairs = min(top_n, len(ranked_pairs))
    heatmap_data = co_appearance_df[ranked_pairs.index[:heatmap_pairs]]
    sns.heatmap(heatmap_data.T, cmap='YlGnBu', annot=False, 
               linewidths=0.5, cbar_kws={'label': 'Co-appearance Count'})
    plt.title(f'Heatmap of Top {heatmap_pairs} Concept Co-appearances Across Decades', fontsize=16)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Concept Pair', fontsize=12)
    _save(f'coappearance_trends_heatmap_top{heatmap_pairs}.png')

    # 3. Stacked bars per decade over the same pairs as the heatmap
    plt.figure(figsize=(14, 8))
    stacked = heatmap_data.copy()
    x_labels = stacked.index
    running_total = np.zeros(len(x_labels))
    for label in stacked.columns:
        # Each pair's bar starts where the previous pairs ended
        plt.bar(x_labels, stacked[label], bottom=running_total, label=label)
        running_total = running_total + stacked[label].values
    plt.title('Stacked Co-appearance Patterns Across Decades', fontsize=16)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Co-appearance Count', fontsize=12)
    plt.legend(title='Concept Pairs', bbox_to_anchor=(1.05, 1), loc='upper left')
    _save('coappearance_stacked_bar.png')

def visualize_network_evolution(df, concept_dict_column, decade_column, 
                               decades, top_concepts, subject_main_dir):
    """
    Create network visualizations showing concept relationships across decades

    For every decade with at least one row, a graph is built whose nodes are
    all ``top_concepts`` (with per-decade ``count`` and summed ``score``
    attributes) and whose edges are weighted by the number of rows in which
    both concepts appear. Each graph is plotted with
    ``visualize_decade_network``; when more than one decade produced a graph,
    ``visualize_network_comparison`` is called on the full list.

    Parameters:
    df (DataFrame): Data holding the concept dictionaries and decade labels
    concept_dict_column (str): Column with dicts (or their string repr) of concept -> score
    decade_column (str): Column with the decade label of each row
    decades (iterable): Decade labels to process, in display order
    top_concepts (iterable): Concepts to include as nodes
    subject_main_dir (str/Path): Base directory; output goes to
        ``<subject_main_dir>/concepts/network_evolution``
    """
    # Local import: this cell's own import list does not provide ast
    import ast

    # Create concepts/networks subfolder
    network_dir = Path(subject_main_dir) / "concepts" / "network_evolution"
    network_dir.mkdir(parents=True, exist_ok=True)

    # Keep node order as given, but test membership against a set
    top_concepts = list(top_concepts)
    top_concept_set = set(top_concepts)

    # We'll create individual decade networks and a summary visualization
    decade_networks = []

    # Process each decade
    for decade in decades:
        # Filter data for this decade
        decade_df = df[df[decade_column] == decade]

        # Skip if no data for this decade
        if decade_df.empty:
            continue

        # Create network for this decade; every top concept is a node even if
        # it does not occur in this decade (count and score stay 0)
        G = nx.Graph()
        for concept in top_concepts:
            G.add_node(concept, count=0, score=0)

        # Initialize co-appearance counter
        co_appearances = defaultdict(int)

        # Process each row to extract concept relationships
        for concept_dict in decade_df[concept_dict_column]:
            # Handle string representation if needed
            if isinstance(concept_dict, str):
                try:
                    # literal_eval only accepts Python literals, unlike eval
                    concept_dict = ast.literal_eval(concept_dict)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    continue

            # Skip if not a dictionary
            if not isinstance(concept_dict, dict):
                continue

            # Update node data and collect this row's top concepts
            row_concepts = []
            for concept, score in concept_dict.items():
                if concept in top_concept_set:
                    row_concepts.append(concept)
                    G.nodes[concept]['count'] += 1
                    G.nodes[concept]['score'] += score

            # Count co-appearances
            for i, concept1 in enumerate(row_concepts):
                for concept2 in row_concepts[i+1:]:
                    # Sorted tuple so (a, b) and (b, a) share one key
                    pair = tuple(sorted([concept1, concept2]))
                    co_appearances[pair] += 1

        # Add edges based on co-appearances
        for (concept1, concept2), weight in co_appearances.items():
            G.add_edge(concept1, concept2, weight=weight)

        # Store the network
        decade_networks.append((decade, G))

        # Create visualization for this decade
        visualize_decade_network(G, decade, network_dir)

    # Create summary visualization comparing networks across decades
    if len(decade_networks) > 1:
        visualize_network_comparison(decade_networks, network_dir)

def visualize_decade_network(G, decade, output_dir):
    """
    Draw one decade's concept graph and save it as ``network_<decade>.png``.

    Node size is ``score * 10`` with a floor of 300; the blue channel of the
    node colour rises with ``count`` and saturates at 10. Edge width (1-5) and
    opacity (0.3-0.9) scale with the edge ``weight`` relative to the lightest
    and heaviest edge in the graph. ``output_dir`` is joined with ``/``.
    """
    plt.figure(figsize=(14, 12))

    # Per-node visual attributes, in graph node order
    node_attrs = [(attrs.get('score', 0), attrs.get('count', 0))
                  for _, attrs in G.nodes(data=True)]
    node_sizes = [max(300, score * 10) for score, _ in node_attrs]
    node_colors = [(0.1, 0.1, 0.5 + 0.5 * min(1.0, count / 10))
                   for _, count in node_attrs]

    # Fixed seed keeps the layout reproducible between runs
    layout = nx.spring_layout(G, k=0.3, seed=42)

    nx.draw_networkx_nodes(G, layout, 
                          node_size=node_sizes, 
                          node_color=node_colors, 
                          alpha=0.8)

    # Edges are drawn one by one so each gets its own width and opacity
    all_weights = [G[a][b]['weight'] for a, b in G.edges()]
    if all_weights:
        heaviest = max(all_weights)
        lightest = min(all_weights)
        # Use 1 when all weights are equal to avoid dividing by zero
        span = heaviest - lightest if heaviest != lightest else 1

        for a, b, attrs in G.edges(data=True):
            relative = (attrs['weight'] - lightest) / span if span > 0 else 0.5
            nx.draw_networkx_edges(G, layout, edgelist=[(a, b)], 
                                 width=1 + relative * 4, 
                                 alpha=0.3 + relative * 0.6,
                                 edge_color='gray')

    # Labels on a translucent white box so they stay legible over edges
    label_box = dict(facecolor='white', alpha=0.7, edgecolor='none', 
                     boxstyle='round,pad=0.3')
    nx.draw_networkx_labels(G, layout, font_size=10, font_weight='bold',
                          bbox=label_box)

    plt.title(f'Concept Network for Decade: {decade}', fontsize=16)
    plt.axis('off')

    # Save the plot
    plt.savefig(output_dir / f'network_{decade}.png', dpi=300, bbox_inches='tight')
    plt.close()

def visualize_network_comparison(decade_networks, output_dir):
    """
    Plot how the concept networks change from one decade to the next

    Three figures are saved into output_dir:
    - concept_score_evolution.png: score per decade for the 8 concepts with
      the highest total score
    - relationship_evolution.png: edge weight per decade for the 6 concept
      pairs with the highest total weight
    - network_metrics_evolution.png: 2x2 panel with density, average degree,
      edge count and number of concepts whose 'count' attribute is > 0

    Parameters:
    decade_networks (iterable): (decade, graph) pairs in plotting order. The
        node attributes 'score'/'count' and the edge attribute 'weight' are
        read, each defaulting to 0 when missing.
    output_dir (Path): Directory for the PNG files; must support the `/`
        operator (e.g. pathlib.Path)

    Returns:
    None
    """
    # Materialize once: the sequence is traversed several times below.
    networks = list(decade_networks)
    decades = [decade for decade, _ in networks]
    n_decades = len(decades)

    # 1. Track changes in node importance (score) over time.
    # Every series is pre-sized to one slot per decade and written by index,
    # so a concept that is missing from some decade still lines up with
    # `decades` (its slot simply stays 0).
    concept_scores = {}
    for idx, (_, G) in enumerate(networks):
        for node in G.nodes():
            if node not in concept_scores:
                concept_scores[node] = [0] * n_decades
            concept_scores[node][idx] = G.nodes[node].get('score', 0)

    # Find top concepts based on total score
    total_scores = {concept: sum(scores) for concept, scores in concept_scores.items()}
    top_concepts = sorted(total_scores.items(), key=lambda x: x[1], reverse=True)[:8]

    # Create line plot showing score changes
    plt.figure(figsize=(14, 8))

    for concept, _ in top_concepts:
        plt.plot(decades, concept_scores[concept], marker='o', linewidth=2, label=concept)

    plt.title('Evolution of Concept Scores Across Decades', fontsize=16)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.legend(title='Top Concepts', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Save the plot
    plt.savefig(output_dir / 'concept_score_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Track changes in relationship strength over time.
    # Same index-aligned layout as above: an edge absent from a decade keeps
    # weight 0 there, and every series has exactly one value per decade,
    # which is what plt.plot requires.
    edge_weights = {}
    for idx, (_, G) in enumerate(networks):
        for u, v, data in G.edges(data=True):
            # Sorted tuple so (a, b) and (b, a) share one series
            edge = tuple(sorted([u, v]))
            if edge not in edge_weights:
                edge_weights[edge] = [0] * n_decades
            edge_weights[edge][idx] = data.get('weight', 0)

    # Find top edges based on total weight
    total_weights = {edge: sum(weights) for edge, weights in edge_weights.items()}
    top_edges = sorted(total_weights.items(), key=lambda x: x[1], reverse=True)[:6]

    # Create line plot showing relationship evolution
    plt.figure(figsize=(14, 8))

    for (u, v), _ in top_edges:
        label = f"{u} & {v}"
        plt.plot(decades, edge_weights[(u, v)], marker='o', linewidth=2, label=label)

    plt.title('Evolution of Concept Relationships Across Decades', fontsize=16)
    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Co-appearance Count', fontsize=12)
    plt.legend(title='Top Relationships', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Save the plot
    plt.savefig(output_dir / 'relationship_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Create a multi-panel visualization showing network density evolution
    plt.figure(figsize=(12, 8))

    # Calculate network metrics over time
    densities = []
    avg_degrees = []
    edge_counts = []
    active_concepts = []

    for _, G in networks:
        densities.append(nx.density(G))
        avg_degrees.append(np.mean([deg for _, deg in G.degree()]))
        edge_counts.append(G.number_of_edges())
        active_concepts.append(sum(1 for n in G.nodes() if G.nodes[n].get('count', 0) > 0))

    # Plot metrics: (subplot position, series, line color, panel title)
    panels = [
        (1, densities, 'blue', 'Network Density'),
        (2, avg_degrees, 'green', 'Average Degree'),
        (3, edge_counts, 'red', 'Number of Connections'),
        (4, active_concepts, 'purple', 'Active Concepts'),
    ]
    for position, series, color, title in panels:
        plt.subplot(2, 2, position)
        plt.plot(decades, series, marker='o', color=color, linewidth=2)
        plt.title(title, fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)

    plt.suptitle('Evolution of Network Properties Across Decades', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # Save the plot
    plt.savefig(output_dir / 'network_metrics_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

# Example usage: run the decade-level concept trend analysis on the loaded
# dataset and keep both result frames as notebook globals.
# NOTE(review): output goes to the literal "results" folder rather than the
# notebook-level `subject_main_dir` variable - confirm this is intended.
trend_df, co_appearance_df = analyze_concept_trends_by_decade(
    df=df,
    concept_dict_column="concept_dict",  # Column with concept dictionaries
    decade_column="decade",             # Column with decade information
    top_n=20,                           # Analyze top 20 concepts
    subject_main_dir="results"          # Output directory
)


# Domains with general

In [None]:
# Build the network for the 'Domains' column and print its statistics.
# top_n / min_occurrences are forwarded to create_network_visualization
# (defined elsewhere in this notebook); presumably they cap the number of
# items and drop rare ones - verify against that function.
domains_graph = create_network_visualization(
    df, 
    column_name='Domains', 
    subject_main_dir=subject_main_dir,
    top_n=50,  
    min_occurrences=5 
)
print_network_stats(domains_graph)

In [None]:
# Build one network per subject column and print its statistics.
# The resulting *_graph globals are reused by later analysis cells.

# Domains visualization
# NOTE: this rebinds `domains_graph`, replacing any graph built by an
# earlier cell with different top_n / min_occurrences settings.
domains_graph = create_network_visualization(
    df, 
    column_name='Domains', 
    subject_main_dir=subject_main_dir,
    min_occurrences=3  # Only include domains that appear at least 3 times
)
print_network_stats(domains_graph)

# Fields visualization
fields_graph = create_network_visualization(
    df, 
    column_name='Fields', 
    subject_main_dir=subject_main_dir,
    top_n=30  # Limit to top 30 most frequent fields
)
print_network_stats(fields_graph)

# Topics visualization
topics_graph = create_network_visualization(
    df, 
    column_name='Topics', 
    subject_main_dir=subject_main_dir,
    top_n=40,
    min_occurrences=3
)
print_network_stats(topics_graph)

# Sub-fields visualization
subfields_graph = create_network_visualization(
    df, 
    column_name='Sub-fields', 
    subject_main_dir=subject_main_dir,
    top_n=50
)
print_network_stats(subfields_graph)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
import matplotlib.dates as mdates
from matplotlib.ticker import MaxNLocator
import networkx as nx

def analyze_concept_trends(df, concept_dict_column, date_column, 
                          time_granularity='M', top_n=20, 
                          subject_main_dir="output"):
    """
    Analyze temporal trends of concept scores and co-appearances over time

    The input frame is copied, never modified. Rows whose date cannot be
    parsed, or whose concept value is neither a dict nor a string that
    evaluates to a dict, are ignored.

    Parameters:
    df (DataFrame): DataFrame containing concept dictionaries and dates
    concept_dict_column (str): Column containing dictionaries of concepts and scores
        (either real dicts or their string representation)
    date_column (str): Column containing date information
    time_granularity (str): 'Y' = yearly, 'Q' = quarterly; any other value
        falls back to monthly ('M')
    top_n (int): Number of top concepts (by total score) to analyze
    subject_main_dir (str/Path): Directory to save visualizations

    Returns:
    tuple: (concept_trends_df, co_appearance_trends_df), both indexed by the
        period label. The first has one column per top concept (summed score);
        the second has one column per co-appearing pair, named "a & b"
        (number of rows in which both concepts occur).
    """
    # Ensure date column is datetime type and drop rows with invalid dates
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
    df = df.dropna(subset=[date_column])

    # Parse every row exactly once; None marks rows that must be skipped.
    parsed_dicts = [_parse_concept_dict(value) for value in df[concept_dict_column]]

    # Step 1: Identify top concepts across the entire dataset
    aggregated_concepts = defaultdict(float)
    for concept_dict in parsed_dicts:
        if concept_dict is None:
            continue
        for concept, score in concept_dict.items():
            aggregated_concepts[concept] += score

    top_concepts = [concept for concept, _ in 
                   sorted(aggregated_concepts.items(), 
                         key=lambda x: x[1], reverse=True)[:top_n]]
    # Set copy for O(1) membership tests; the list keeps the ranking order.
    top_concept_set = set(top_concepts)

    # Step 2: Create time-based grouping for analysis
    freq = time_granularity if time_granularity in ('Y', 'Q') else 'M'
    df['period'] = df[date_column].dt.to_period(freq)

    # String labels are what the network-evolution step filters on
    df['period_str'] = df['period'].astype(str)

    all_periods = sorted(df['period'].unique())
    period_data = [str(period) for period in all_periods]

    # Bucket the parsed dictionaries by period, preserving row order
    dicts_by_period = defaultdict(list)
    for period, concept_dict in zip(df['period'], parsed_dicts):
        if concept_dict is not None:
            dicts_by_period[period].append(concept_dict)

    # Steps 3 + 4: per-period score totals and pair co-appearance counts
    concept_trends = {concept: [] for concept in top_concepts}
    co_appearance_trends = {}
    n_periods = len(all_periods)

    for period_idx, period in enumerate(all_periods):
        period_scores = defaultdict(float)
        period_co_appearances = defaultdict(int)

        for concept_dict in dicts_by_period.get(period, ()):
            row_concepts = []
            for concept, score in concept_dict.items():
                if concept in top_concept_set:
                    period_scores[concept] += score
                    row_concepts.append(concept)

            # Count every unordered pair of top concepts in this row once
            for i, concept1 in enumerate(row_concepts):
                for concept2 in row_concepts[i+1:]:
                    # Sorted tuple to ensure consistent key ordering
                    pair = tuple(sorted([concept1, concept2]))
                    period_co_appearances[pair] += 1

        # Store scores for each concept in this period (0.0 when absent)
        for concept in top_concepts:
            concept_trends[concept].append(period_scores[concept])

        # Each pair series is pre-sized to one slot per period and written at
        # this period's index, so counts always land in the right column
        # position and every series has exactly n_periods entries.
        for pair, count in period_co_appearances.items():
            if pair not in co_appearance_trends:
                co_appearance_trends[pair] = [0] * n_periods
            co_appearance_trends[pair][period_idx] = count

    # Create dataframes from the collected data
    trend_df = pd.DataFrame(concept_trends, index=period_data)
    co_appearance_df = pd.DataFrame(
        {f"{pair[0]} & {pair[1]}": counts 
         for pair, counts in co_appearance_trends.items()},
        index=period_data
    )

    # Step 5: Visualize the results

    # 5.1: Visualize concept score trends
    visualize_concept_trends(trend_df, subject_main_dir, top_n)

    # 5.2: Visualize co-appearance trends for top pairs
    visualize_co_appearance_trends(co_appearance_df, subject_main_dir, top_n)

    # 5.3: Create per-period networks and their comparison (if periods > 1)
    if len(period_data) > 1:
        visualize_network_evolution(df, concept_dict_column, 'period_str', 
                                  period_data, top_concepts, subject_main_dir)

    return trend_df, co_appearance_df


def _parse_concept_dict(value):
    """Return value as a concept -> score dict, or None when it is unusable"""
    if isinstance(value, str):
        try:
            # NOTE(review): eval() executes arbitrary code. It is kept so that
            # existing string-encoded data keeps parsing exactly as before; if
            # the column can contain untrusted text, switch to
            # ast.literal_eval after checking the stored format.
            value = eval(value)
        except Exception:
            # Unparseable text: skip the row (KeyboardInterrupt/SystemExit
            # are deliberately not swallowed)
            return None
    return value if isinstance(value, dict) else None

def visualize_concept_trends(trend_df, subject_main_dir, top_n=20):
    """
    Visualize trends in concept scores over time

    Writes three PNG files into <subject_main_dir>/concepts: a line plot and a
    stacked area chart of the (at most 10) concepts with the highest total
    score, and a heatmap of the top_n concepts.

    Parameters:
    trend_df (DataFrame): One row per time period, one column per concept,
        values are aggregated scores
    subject_main_dir (str/Path): Base output directory
    top_n (int): Number of concepts shown in the heatmap

    Returns:
    None
    """
    # Create concepts subfolder
    concept_dir = Path(subject_main_dir) / "concepts"
    concept_dir.mkdir(parents=True, exist_ok=True)

    # Same guard as visualize_co_appearance_trends: with no periods or no
    # concepts there is nothing meaningful to draw.
    if trend_df.empty:
        print("No concept trend data to visualize")
        return

    # Limit to displaying a reasonable number of lines for readability
    display_limit = min(10, len(trend_df.columns))

    # Sort concepts by their total score (sum across all periods)
    concept_totals = trend_df.sum().sort_values(ascending=False)
    top_concepts = concept_totals.index[:display_limit]

    # 1. Line plot of concept scores over time for top concepts
    plt.figure(figsize=(14, 8))

    for concept in top_concepts:
        plt.plot(trend_df.index, trend_df[concept], marker='o', linewidth=2, label=concept)

    plt.title(f'Top {display_limit} Concept Scores Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Aggregated Score', fontsize=12)
    plt.legend(title='Concepts', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.savefig(concept_dir / f'concept_trends_line_top{display_limit}.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Heatmap of the top_n concepts, ordered by total score
    plt.figure(figsize=(16, 10))

    heatmap_data = trend_df[concept_totals.index[:top_n]]

    # Transposed so concepts are rows and periods are columns
    sns.heatmap(heatmap_data.T, cmap='YlOrRd', annot=False, 
               linewidths=0.5, cbar_kws={'label': 'Score'})

    plt.title(f'Heatmap of Top {top_n} Concept Scores Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Concept', fontsize=12)
    plt.tight_layout()

    plt.savefig(concept_dir / f'concept_trends_heatmap_top{top_n}.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Stacked area chart to show relative importance
    plt.figure(figsize=(14, 8))

    # Use only the top concepts for clarity
    area_data = trend_df[top_concepts]

    plt.stackplot(area_data.index, [area_data[concept] for concept in top_concepts], 
                labels=top_concepts, alpha=0.8)

    plt.title(f'Relative Importance of Top {display_limit} Concepts Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Aggregated Score', fontsize=12)
    plt.legend(title='Concepts', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.savefig(concept_dir / f'concept_trends_area_top{display_limit}.png', dpi=300, bbox_inches='tight')
    plt.close()

def visualize_co_appearance_trends(co_appearance_df, subject_main_dir, top_n=20):
    """
    Plot how often concept pairs co-appear in each time period

    Saves a line plot (at most 8 strongest pairs) and a heatmap (at most
    top_n strongest pairs) into <subject_main_dir>/concepts. When the frame
    is empty only the folder is created and a message is printed.

    Parameters:
    co_appearance_df (DataFrame): One row per period, one column per pair
    subject_main_dir (str/Path): Base output directory
    top_n (int): Upper bound on the number of pairs in the heatmap

    Returns:
    None
    """
    out_dir = Path(subject_main_dir) / "concepts"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Bail out early when there is nothing to plot
    if co_appearance_df.empty:
        print("No co-appearance data to visualize")
        return

    # Rank pairs by their total count over all periods
    ranked_pairs = co_appearance_df.sum().sort_values(ascending=False).index
    n_lines = min(8, len(co_appearance_df.columns))
    n_heatmap = min(top_n, len(ranked_pairs))

    # 1. Line plot: one line per strongest pair
    plt.figure(figsize=(14, 8))
    periods = co_appearance_df.index
    for pair_label in ranked_pairs[:n_lines]:
        plt.plot(periods, co_appearance_df[pair_label], marker='o', 
                linewidth=2, label=pair_label)

    plt.title(f'Top {n_lines} Concept Co-appearance Trends', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Co-appearance Count', fontsize=12)
    plt.legend(title='Concept Pairs', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    line_path = out_dir / f'coappearance_trends_line_top{n_lines}.png'
    plt.savefig(line_path, dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Heatmap: pairs as rows, periods as columns
    plt.figure(figsize=(16, 12))
    strongest = co_appearance_df[ranked_pairs[:n_heatmap]]
    sns.heatmap(strongest.T, cmap='YlGnBu', annot=False, 
               linewidths=0.5, cbar_kws={'label': 'Co-appearance Count'})

    plt.title(f'Heatmap of Top {n_heatmap} Concept Co-appearances Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Concept Pair', fontsize=12)
    plt.tight_layout()

    heatmap_path = out_dir / f'coappearance_trends_heatmap_top{n_heatmap}.png'
    plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
    plt.close()

def visualize_network_evolution(df, concept_dict_column, period_column, 
                               period_list, top_concepts, subject_main_dir):
    """
    Create network visualizations showing concept relationships over time

    For every period with data, a graph is built whose nodes are the top
    concepts (attributes: 'count' = number of rows mentioning the concept,
    'score' = summed score) and whose edges connect concepts found in the same
    row (attribute 'weight' = number of such rows). Each graph is drawn with
    visualize_period_network; when more than one period produced a graph,
    visualize_network_comparison is called on all of them.

    Parameters:
    df (DataFrame): Data with concept dictionaries and a period column
    concept_dict_column (str): Column with concept -> score dicts (or their
        string representation)
    period_column (str): Column whose values are compared with period_list
    period_list (list): Period labels to process, in order
    top_concepts (list): Concepts to include as nodes
    subject_main_dir (str/Path): Base output directory; files go to
        <subject_main_dir>/concepts/network_evolution

    Returns:
    None
    """
    # Create concepts/networks subfolder
    network_dir = Path(subject_main_dir) / "concepts" / "network_evolution"
    network_dir.mkdir(parents=True, exist_ok=True)

    # Set copy for O(1) membership tests inside the row loop
    top_concept_set = set(top_concepts)

    # (period, graph) pairs for the summary visualization
    period_networks = []

    for period in period_list:
        # Filter data for this period; skip periods without rows
        period_df = df[df[period_column] == period]
        if period_df.empty:
            continue

        # Every top concept is a node, even if it never occurs in this period
        G = nx.Graph()
        for concept in top_concepts:
            G.add_node(concept, count=0, score=0)

        co_appearances = defaultdict(int)

        for concept_dict in period_df[concept_dict_column]:
            # Handle string representation if needed
            if isinstance(concept_dict, str):
                try:
                    # NOTE(review): eval() executes arbitrary code; kept for
                    # compatibility with existing data. Prefer
                    # ast.literal_eval if the column may hold untrusted text.
                    concept_dict = eval(concept_dict)
                except Exception:
                    # Unparseable text: skip the row without swallowing
                    # KeyboardInterrupt/SystemExit
                    continue

            # Skip if not a dictionary
            if not isinstance(concept_dict, dict):
                continue

            # Update node data and remember which top concepts this row has
            row_concepts = []
            for concept, score in concept_dict.items():
                if concept in top_concept_set:
                    row_concepts.append(concept)
                    node_data = G.nodes[concept]
                    node_data['count'] = node_data.get('count', 0) + 1
                    node_data['score'] = node_data.get('score', 0) + score

            # Count each unordered pair of this row's concepts once
            for pos, concept1 in enumerate(row_concepts):
                for concept2 in row_concepts[pos+1:]:
                    pair = tuple(sorted([concept1, concept2]))
                    co_appearances[pair] += 1

        # Add edges based on co-appearances
        for (concept1, concept2), weight in co_appearances.items():
            G.add_edge(concept1, concept2, weight=weight)

        period_networks.append((period, G))

        # Create visualization for this period
        visualize_period_network(G, period, network_dir)

    # Create summary visualization comparing networks across time periods
    if len(period_networks) > 1:
        visualize_network_comparison(period_networks, network_dir)

def visualize_period_network(G, period, output_dir):
    """
    Draw the concept network of one time period and save it as a PNG

    Node size grows with the node's 'score' attribute (minimum 300), node
    color gets more blue as 'count' approaches 10, and edge width/opacity
    scale with the edge 'weight' relative to the other edges of the graph.

    Parameters:
    G (networkx.Graph): Graph with 'score'/'count' node attributes and a
        'weight' attribute on every edge
    period: Label used in the title and in the file name
    output_dir (Path): Directory for network_<period>.png; must support `/`

    Returns:
    None
    """
    plt.figure(figsize=(14, 12))

    # Per-node visual properties, in the graph's node order
    node_attrs = [G.nodes[name] for name in G.nodes()]
    node_sizes = [max(300, attrs.get('score', 0) * 30) for attrs in node_attrs]
    node_colors = [
        # Blue channel runs from 0.5 to 1.0; the frequency ratio is capped at 1.0
        (0.1, 0.1, 0.5 + 0.5 * min(1.0, attrs.get('count', 0) / 10))
        for attrs in node_attrs
    ]

    # Fixed seed keeps the layout reproducible between runs
    pos = nx.spring_layout(G, k=0.3, seed=42)

    nx.draw_networkx_nodes(G, pos, 
                          node_size=node_sizes, 
                          node_color=node_colors, 
                          alpha=0.8)

    # Edges are drawn one at a time so each gets its own width and alpha
    weighted_edges = [(u, v, data['weight']) for u, v, data in G.edges(data=True)]

    if weighted_edges:
        all_weights = [w for _, _, w in weighted_edges]
        heaviest = max(all_weights)
        lightest = min(all_weights)
        # Avoid dividing by zero when every edge has the same weight
        span = heaviest - lightest if heaviest != lightest else 1

        for u, v, w in weighted_edges:
            scaled = (w - lightest) / span if span > 0 else 0.5
            nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], 
                                 width=1 + scaled * 4,       # 1 .. 5
                                 alpha=0.3 + scaled * 0.6,   # 0.3 .. 0.9
                                 edge_color='gray')

    label_box = dict(facecolor='white', alpha=0.7, edgecolor='none', 
                     boxstyle='round,pad=0.3')
    nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold',
                          bbox=label_box)

    plt.title(f'Concept Network for Period: {period}', fontsize=16)
    plt.axis('off')

    plt.savefig(output_dir / f'network_{period}.png', dpi=300, bbox_inches='tight')
    plt.close()

def visualize_network_comparison(period_networks, output_dir):
    """
    Create visualizations comparing networks across time

    Three figures are saved into output_dir:
    - concept_score_evolution.png: score per period for the 8 concepts with
      the highest total score
    - relationship_evolution.png: edge weight per period for the 6 concept
      pairs with the highest total weight
    - network_metrics_evolution.png: 2x2 panel with density, average degree,
      edge count and number of concepts whose 'count' attribute is > 0

    Parameters:
    period_networks (iterable): (period, graph) pairs in plotting order. The
        node attributes 'score'/'count' and the edge attribute 'weight' are
        read, each defaulting to 0 when missing.
    output_dir (Path): Directory for the PNG files; must support the `/`
        operator (e.g. pathlib.Path)

    Returns:
    None
    """
    # Materialize once: the sequence is traversed several times below.
    networks = list(period_networks)
    periods = [period for period, _ in networks]
    n_periods = len(periods)

    # 1. Track changes in node importance (score) over time.
    # Every series is pre-sized to one slot per period and written by index,
    # so a concept that is missing from some period still lines up with
    # `periods` (its slot simply stays 0).
    concept_scores = {}
    for idx, (_, G) in enumerate(networks):
        for node in G.nodes():
            if node not in concept_scores:
                concept_scores[node] = [0] * n_periods
            concept_scores[node][idx] = G.nodes[node].get('score', 0)

    # Find top concepts based on total score
    total_scores = {concept: sum(scores) for concept, scores in concept_scores.items()}
    top_concepts = sorted(total_scores.items(), key=lambda x: x[1], reverse=True)[:8]

    # Create line plot showing score changes
    plt.figure(figsize=(14, 8))

    for concept, _ in top_concepts:
        plt.plot(periods, concept_scores[concept], marker='o', linewidth=2, label=concept)

    plt.title('Evolution of Concept Scores Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.legend(title='Top Concepts', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Save the plot
    plt.savefig(output_dir / 'concept_score_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Track changes in relationship strength over time.
    # Same index-aligned layout as above: an edge absent from a period keeps
    # weight 0 there, and every series has exactly one value per period,
    # which is what plt.plot requires.
    edge_weights = {}
    for idx, (_, G) in enumerate(networks):
        for u, v, data in G.edges(data=True):
            # Sorted tuple so (a, b) and (b, a) share one series
            edge = tuple(sorted([u, v]))
            if edge not in edge_weights:
                edge_weights[edge] = [0] * n_periods
            edge_weights[edge][idx] = data.get('weight', 0)

    # Find top edges based on total weight
    total_weights = {edge: sum(weights) for edge, weights in edge_weights.items()}
    top_edges = sorted(total_weights.items(), key=lambda x: x[1], reverse=True)[:6]

    # Create line plot showing relationship evolution
    plt.figure(figsize=(14, 8))

    for (u, v), _ in top_edges:
        label = f"{u} & {v}"
        plt.plot(periods, edge_weights[(u, v)], marker='o', linewidth=2, label=label)

    plt.title('Evolution of Concept Relationships Over Time', fontsize=16)
    plt.xlabel('Time Period', fontsize=12)
    plt.ylabel('Co-appearance Count', fontsize=12)
    plt.legend(title='Top Relationships', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Save the plot
    plt.savefig(output_dir / 'relationship_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Create a multi-panel visualization showing network density evolution
    plt.figure(figsize=(12, 8))

    # Calculate network metrics over time
    densities = []
    avg_degrees = []
    edge_counts = []
    active_concepts = []

    for _, G in networks:
        densities.append(nx.density(G))
        avg_degrees.append(np.mean([deg for _, deg in G.degree()]))
        edge_counts.append(G.number_of_edges())
        active_concepts.append(sum(1 for n in G.nodes() if G.nodes[n].get('count', 0) > 0))

    # Plot metrics: (subplot position, series, line color, panel title)
    panels = [
        (1, densities, 'blue', 'Network Density'),
        (2, avg_degrees, 'green', 'Average Degree'),
        (3, edge_counts, 'red', 'Number of Connections'),
        (4, active_concepts, 'purple', 'Active Concepts'),
    ]
    for position, series, color, title in panels:
        plt.subplot(2, 2, position)
        plt.plot(periods, series, marker='o', color=color, linewidth=2)
        plt.title(title, fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)

    plt.suptitle('Evolution of Network Properties Over Time', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # Save the plot
    plt.savefig(output_dir / 'network_metrics_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

# Example usage of analyze_concept_trends.
# The sample call is wrapped in a string literal, so it is not executed when
# the cell runs; copy it out and adjust the column names to use it.
'''
# Assuming your dataframe has:
# - A column with concept dictionaries
# - A date column

# Analyze temporal trends
trend_df, co_appearance_df = analyze_concept_trends(
    df=your_dataframe,
    concept_dict_column="ConceptDict",  # Column with concept dictionaries
    date_column="Date",                 # Column with date information
    time_granularity='M',               # Monthly analysis ('Y' for yearly, 'Q' for quarterly)
    top_n=20,                           # Analyze top 20 concepts
    subject_main_dir="results"          # Output directory
)
'''

In [None]:
# Domains community analysis: detect communities and cliques in the domains
# graph, plot the community statistics, then look for bridge nodes.
# Requires `domains_graph` and `fields_graph` from the network cells above.
print("\n" + "="*50)
print("DOMAINS COMMUNITY ANALYSIS")
print("="*50)
domain_communities, domain_cliques, domain_comm_stats = analyze_network_communities(
    domains_graph, 
    subject_main_dir, 
    "domains", 
    min_clique_size=3
)

# Visualize domain community statistics
visualize_community_stats(domain_comm_stats, subject_main_dir, "domains")

# Find domain bridge nodes
domain_bridges = find_community_bridges(domains_graph, domain_communities, subject_main_dir, "domains")

# Fields community analysis
print("\n" + "="*50)
print("FIELDS COMMUNITY ANALYSIS")
print("="*50)
field_communities, field_cliques, field_comm_stats = analyze_network_communities(
    fields_graph, 
    subject_main_dir, 
    "fields", 
    min_clique_size=3
)

# Visualize field community statistics
visualize_community_stats(field_comm_stats, subject_main_dir, "fields")

# Find field bridge nodes
field_bridges = find_community_bridges(fields_graph, field_communities, subject_main_dir, "fields")

# Print a summary of findings across all subject levels
print("\n" + "="*50)
print("SUMMARY OF COMMUNITY ANALYSIS")
print("="*50)

# NOTE(review): concept_communities / concept_cliques are not assigned in this
# cell; presumably they come from an earlier concept-level community analysis
# cell - confirm it has been run, otherwise the next line raises NameError.
# The *_communities objects are used as mappings whose values are community
# ids, so the number of distinct values is the number of communities.
print(f"Concepts: {len(set(concept_communities.values()))} communities, {len(concept_cliques)} significant cliques")
print(f"Domains: {len(set(domain_communities.values()))} communities, {len(domain_cliques)} significant cliques")
print(f"Fields: {len(set(field_communities.values()))} communities, {len(field_cliques)} significant cliques")

In [None]:
# Temporal (per-decade) network analysis for Domains, Fields, Sub-fields and
# Topics. Each call uses the 'decade' column of `df` and keeps its result in a
# notebook-level variable that the comparative summary further down reads.
print("\n" + "="*50)
print("TEMPORAL ANALYSIS OF DOMAINS")
print("="*50)

domain_temporal_results = analyze_temporal_networks(
    df,
    column_name='Domains',
    subject_main_dir=subject_main_dir,
    decade_col='decade',
    min_occurrence=2,  # Domains might be fewer, so lower threshold
    top_n=None,        # Include all domains that meet min_occurrence
    min_clique_size=3
)

# Print some insights from the temporal analysis
# (the result is indexed by 'persistent_items' and 'persistent_connections')
print(f"\nFound {len(domain_temporal_results['persistent_items'])} domains that persisted across multiple decades")
print(f"Found {len(domain_temporal_results['persistent_connections'])} persistent domain connections")

# Temporal analysis of Fields
# NOTE: min_clique_size is not passed for Fields, Sub-fields or Topics, so
# those calls use whatever default analyze_temporal_networks declares.
print("\n" + "="*50)
print("TEMPORAL ANALYSIS OF FIELDS")
print("="*50)

field_temporal_results = analyze_temporal_networks(
    df,
    column_name='Fields',
    subject_main_dir=subject_main_dir,
    decade_col='decade',
    min_occurrence=2,
    top_n=30
)

# Temporal analysis of Sub-fields
print("\n" + "="*50)
print("TEMPORAL ANALYSIS OF SUB-FIELDS")
print("="*50)

subfield_temporal_results = analyze_temporal_networks(
    df,
    column_name='Sub-fields',
    subject_main_dir=subject_main_dir,
    decade_col='decade',
    min_occurrence=2,
    top_n=40
)

# Temporal analysis of Topics
print("\n" + "="*50)
print("TEMPORAL ANALYSIS OF TOPICS")
print("="*50)

topic_temporal_results = analyze_temporal_networks(
    df,
    column_name='Topics',
    subject_main_dir=subject_main_dir,
    decade_col='decade',
    min_occurrence=2,
    top_n=40
)

# Print a comparative summary of temporal evolution across all subjects
print("\n" + "="*50)
print("COMPARATIVE SUMMARY OF TEMPORAL EVOLUTION")
print("="*50)

decades = df['decade'].unique()
decades.sort()


def _decade_network_stats(temporal_results, decade):
    """
    Look up node and community counts for one decade in a temporal result

    Parameters:
    temporal_results (dict): Result of analyze_temporal_networks; its
        'metrics' entry is read for the parallel sequences 'decades',
        'n_nodes' and 'communities'
    decade: Decade value to look up in metrics['decades']

    Returns:
    tuple: (number of nodes, number of distinct communities), or
        ('N/A', 'N/A') when the decade is not present in the metrics
    """
    metrics = temporal_results['metrics']
    known_decades = list(metrics['decades'])
    if decade not in known_decades:
        return 'N/A', 'N/A'

    idx = known_decades.index(decade)
    # Guard against 'n_nodes' being shorter than 'decades'
    if idx >= len(metrics['n_nodes']):
        return 'N/A', 'N/A'

    communities = metrics['communities'][idx]
    # An empty/missing community mapping counts as zero communities
    n_communities = len(set(communities.values())) if communities else 0
    return metrics['n_nodes'][idx], n_communities


# (row key for node count, row key for community count, temporal result)
_summary_sources = [
    ('Concepts', 'Concept Communities', concept_temporal_results),
    ('Domains', 'Domain Communities', domain_temporal_results),
    ('Fields', 'Field Communities', field_temporal_results),
]

# Create a summary table structure for comparison: one row per decade
summary_data = []

for decade in decades:
    row = {'Decade': f"{decade}s"}
    for count_key, community_key, results in _summary_sources:
        row[count_key], row[community_key] = _decade_network_stats(results, decade)
    summary_data.append(row)

# Print the summary table
print("\nTemporal Evolution Summary by Decade:")
for row in summary_data:
    print(f"\n{row['Decade']}:")
    print(f"  Concepts: {row['Concepts']} (Communities: {row['Concept Communities']})")
    print(f"  Domains: {row['Domains']} (Communities: {row['Domain Communities']})")
    print(f"  Fields: {row['Fields']} (Communities: {row['Field Communities']})")

In [None]:
# Example code for analyzing the Authors field using the three main functions:
# 1. create_network_visualization
# 2. analyze_network_communities
# 3. analyze_temporal_networks
# All three (plus print_network_stats, visualize_community_stats and
# find_community_bridges) are defined elsewhere in this notebook.

# ==========================================
# 1. NETWORK VISUALIZATION FOR AUTHORS
# ==========================================

# Create co-authorship network visualization
print("\n" + "="*50)
print("CO-AUTHORSHIP NETWORK VISUALIZATION")
print("="*50)

# Parameters tailored for author networks
authors_graph = create_network_visualization(
    df,
    column_name='Authors',
    subject_main_dir=subject_main_dir,
    # Increase min_occurrences to filter out authors with few papers
    min_occurrences=3,
    # Limit to top prolific authors for cleaner visualization
    top_n=100
)

# The returned object exposes number_of_nodes()/number_of_edges()
print(f"Created co-authorship network with {authors_graph.number_of_nodes()} authors " 
      f"and {authors_graph.number_of_edges()} collaborations")

# Print basic network stats
print_network_stats(authors_graph)

# ==========================================
# 2. COMMUNITY ANALYSIS FOR AUTHORS
# ==========================================

print("\n" + "="*50)
print("CO-AUTHORSHIP COMMUNITY ANALYSIS")
print("="*50)

# Analyze communities in the co-authorship network
authors_communities, authors_cliques, authors_comm_stats = analyze_network_communities(
    authors_graph,
    subject_main_dir,
    "authors",
    # Cliques of 3+ authors represent close collaboration groups
    min_clique_size=3,
    # Show more top communities as author networks often have many small groups
    top_communities=8,
    # Show more top cliques as these represent complete collaboration groups
    top_cliques=8
)

# Visualize author community statistics
visualize_community_stats(authors_comm_stats, subject_main_dir, "authors")

# Find and visualize bridge authors (who connect different communities)
authors_bridges = find_community_bridges(
    authors_graph, 
    authors_communities, 
    subject_main_dir, 
    "authors",
    # Show more bridge authors as they're particularly important in research networks
    top_n=15
)

# ==========================================
# 3. TEMPORAL ANALYSIS FOR AUTHORS
# ==========================================

print("\n" + "="*50)
print("TEMPORAL CO-AUTHORSHIP ANALYSIS")
print("="*50)

# Analyze how co-authorship networks evolve over time
authors_temporal_results = analyze_temporal_networks(
    df,
    column_name='Authors',
    subject_main_dir=subject_main_dir,
    decade_col='decade',
    # Lower minimum occurrence for authors as each decade may have fewer papers
    min_occurrence=2,
    # Limit to top authors per decade for cleaner visualization
    top_n=75,
    # Smaller cliques might be meaningful in author networks
    min_clique_size=3
)

# Print some insights from the temporal analysis
print(f"\nFound {len(authors_temporal_results['persistent_items'])} authors who published across multiple decades")
print(f"Found {len(authors_temporal_results['persistent_connections'])} persistent co-authorship relationships")

# Get the most persistent authors (present in most decades).
# Each value of 'persistent_items' is treated as a sized collection: authors
# are ranked by len() of that value and the top 15 are kept.
# The comprehension's `decades` is local to the comprehension and does not
# rebind any notebook-level `decades` variable.
most_persistent_authors = sorted(
    [(item, len(decades)) for item, decades in authors_temporal_results['persistent_items'].items()],
    key=lambda x: x[1], 
    reverse=True
)[:15]

print("\nMost persistent authors across decades:")
for author, num_decades in most_persistent_authors:
    print(f"  {author}: published in {num_decades} decades")

# Get the most persistent co-authorship connections.
# Keys of 'persistent_connections' are unpacked as (author1, author2) pairs and
# the values are used directly as the sort key (top 10 kept).
most_persistent_collaborations = sorted(
    [((author1, author2), count) for (author1, author2), count in authors_temporal_results['persistent_connections'].items()],
    key=lambda x: x[1],
    reverse=True
)[:10]

print("\nMost persistent co-authorship relationships:")
for (author1, author2), decade_count in most_persistent_collaborations:
    print(f"  {author1} & {author2}: collaborated across {decade_count} decades")

# ==========================================
# CUSTOM ANALYSIS SPECIFIC TO AUTHORS
# ==========================================

# Calculate and visualize author productivity over time
print("\n" + "="*50)
print("AUTHOR PRODUCTIVITY ANALYSIS")
print("="*50)
# Process data for productivity analysis
def analyze_author_productivity(df, subject_main_dir):
    """
    Count publications per author per decade and plot the most productive authors

    Two figures are produced and handed to save_plot (under the "authors"
    sub-directory): a line chart of the 10 most productive authors and a
    heatmap of the 20 most productive authors, both broken down by decade.

    Parameters:
    df (DataFrame): DataFrame with a 'decade' column and an 'Authors' column
        (the latter is read through preprocess_column)
    subject_main_dir (str/Path): Main directory passed on to save_plot

    Returns:
    dict: {
        'author_counts_by_decade': {decade: Counter of author -> publications},
        'top_authors': list of up to 20 author names, most productive first,
        'total_publication_counts': Counter of author -> total publications
    }
    """
    decades = sorted(df['decade'].unique())

    # Per-decade publication counts: every appearance of an author in a row's
    # author list counts as one publication.
    author_publication_counts = {}
    for decade in decades:
        decade_df = df[df['decade'] == decade]
        author_publication_counts[decade] = Counter(
            author
            for authors_list in preprocess_column(decade_df, 'Authors')
            for author in authors_list
        )

    # Calculate total publications per author across all decades
    total_pub_counts = Counter()
    for decade_counts in author_publication_counts.values():
        total_pub_counts.update(decade_counts)

    # Get top 20 most productive authors overall
    top_authors = [author for author, _ in total_pub_counts.most_common(20)]

    results = {
        'author_counts_by_decade': author_publication_counts,
        'top_authors': top_authors,
        'total_publication_counts': total_pub_counts
    }

    # Nothing to draw (and sns.heatmap cannot handle an empty matrix)
    if not top_authors:
        print("No authors found; skipping productivity plots")
        return results

    # Create visualization of publication trends for top authors
    plt.figure(figsize=(15, 10))

    for author in top_authors[:10]:  # Limit to top 10 for clarity in the chart
        pub_counts = [author_publication_counts[decade].get(author, 0) for decade in decades]
        plt.plot(decades, pub_counts, 'o-', linewidth=2, label=author)

    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Number of Publications', fontsize=12)
    plt.title('Publication Trends of Top 10 Most Productive Authors', fontsize=16)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    # Save the plot
    save_plot(plt, "author_productivity_trends.png", subject_main_dir, "authors")

    # Create heatmap of top author productivity by decade
    plt.figure(figsize=(15, 12))

    # One row per author, one column per decade
    heatmap_data = [
        [author_publication_counts[decade].get(author, 0) for decade in decades]
        for author in top_authors
    ]

    sns.heatmap(heatmap_data, annot=True, fmt="d",
                xticklabels=[f"{d}s" for d in decades],
                yticklabels=top_authors,
                cmap="YlGnBu")

    plt.xlabel('Decade', fontsize=12)
    plt.ylabel('Author', fontsize=12)
    plt.title('Publication Count Heatmap of Top 20 Most Productive Authors', fontsize=16)
    plt.tight_layout()

    # Save the plot
    save_plot(plt, "author_productivity_heatmap.png", subject_main_dir, "authors")

    return results

# Run the custom author productivity analysis on the full dataframe and
# report the number of distinct authors plus the five with the most
# publications (taken from the returned 'total_publication_counts' Counter).
productivity_results = analyze_author_productivity(df, subject_main_dir)
print(f"Analyzed productivity for {len(productivity_results['total_publication_counts'])} unique authors")
print(f"Top 5 most productive authors:")
for author, count in productivity_results['total_publication_counts'].most_common(5):
    print(f"  {author}: {count} publications")

# Countries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import MarkerCluster
from itertools import combinations
from collections import Counter
import json
import matplotlib.pyplot as plt
import seaborn as sns
import kaleido  # Required for plotly.write_image

# Utility functions for country network analysis
def create_country_pairs(countries_list):
    """
    Create all unordered pairs of distinct countries from a list of countries

    Duplicate codes in the input are collapsed first, so a country is never
    paired with itself and each pair is produced at most once per list.
    Every pair is ordered alphabetically, which makes ('CA', 'US') and
    ('US', 'CA') the same key when pairs are counted.

    Parameters:
    countries_list (list): List of country codes (may contain duplicates)

    Returns:
    list: Sorted list of country pairs (tuples); empty when fewer than two
        distinct countries are given
    """
    if not countries_list:
        return []

    # Deduplicate before pairing: repeated codes would otherwise yield
    # self-pairs such as ('US', 'US') and double-count real pairs.
    unique_countries = sorted(set(countries_list))
    return list(combinations(unique_countries, 2))

def count_country_pairs(df, country_col='Countries_flat'):
    """
    Tally how often each pair of countries appears together in a row

    Parameters:
    df (DataFrame): DataFrame with country lists
    country_col (str): Name of column containing country lists

    Returns:
    Counter: Mapping of country pair (tuple) to number of co-appearances
    """
    tally = Counter()

    for entry in df[country_col]:
        # Only real lists with at least two members can contribute a pair
        if not isinstance(entry, list) or len(entry) < 2:
            continue
        tally.update(create_country_pairs(entry))

    return tally

def get_country_centroids(world_gdf, iso_cols=('iso_a2', 'ISO_A2', 'ISO_A2_EH')):
    """
    Get a representative point for each country

    The first name in iso_cols that exists as a column is used as the country
    identifier; if none exists, 'ADMIN' and then 'NAME' are tried as fallbacks.

    Parameters:
    world_gdf (GeoDataFrame): World map GeoDataFrame with a 'geometry' column
    iso_cols (sequence): Possible column names for ISO country codes, in
        order of preference

    Returns:
    dict: Dictionary mapping country codes to (latitude, longitude), i.e. the
        order folium expects. Empty when no identifier column is found.
    """
    country_positions = {}

    # Determine which ISO column exists in the dataframe
    available_cols = [col for col in iso_cols if col in world_gdf.columns]

    if available_cols:
        iso_col = available_cols[0]
        print(f"Using '{iso_col}' for country codes.")
    else:
        print(f"Warning: No ISO country code columns found in the dataframe. Available columns: {world_gdf.columns.tolist()}")
        # Try to use a different identifier as fallback
        if 'ADMIN' in world_gdf.columns:
            print("Using 'ADMIN' column as fallback. Note: This might not match your country codes.")
            iso_col = 'ADMIN'
        elif 'NAME' in world_gdf.columns:
            print("Using 'NAME' column as fallback. Note: This might not match your country codes.")
            iso_col = 'NAME'
        else:
            print("No suitable country identifier columns found.")
            return country_positions

    # Print first few values to help with debugging
    print(f"Sample values from '{iso_col}' column: {world_gdf[iso_col].head().tolist()}")

    # Placeholder values that do not identify a country
    invalid_codes = ('-99', '-1', None, '')

    for code, geometry in zip(world_gdf[iso_col], world_gdf['geometry']):
        if code in invalid_codes:
            continue
        # representative_point() is used rather than the centroid attribute
        point = geometry.representative_point()
        country_positions[code] = (point.y, point.x)  # lat, lon for folium

    return country_positions

def get_top_countries(df, country_col='Countries_flat', top_n=20):
    """
    Rank countries by how many times they appear across all rows

    Parameters:
    df (DataFrame): DataFrame with country lists
    country_col (str): Name of column containing country lists
    top_n (int): Number of top countries to return (None returns every country)

    Returns:
    dict: Country code -> appearance count, most frequent first
    """
    # Rows whose value is not a list are ignored
    appearances = Counter(
        code
        for entry in df[country_col]
        if isinstance(entry, list)
        for code in entry
    )
    ranked = appearances.most_common(top_n)
    return dict(ranked)

def analyze_country_coappearances_by_decade(df, country_col='Countries_flat', decade_col='decade'):
    """
    Count country co-appearances separately for every decade

    Parameters:
    df (DataFrame): DataFrame with country lists and decade information
    country_col (str): Name of column containing country lists
    decade_col (str): Name of column containing decade information

    Returns:
    dict: Decade -> Counter of country pairs for the rows of that decade
    """
    pairs_by_decade = {}

    for period in df[decade_col].unique():
        rows = df[df[decade_col] == period]
        # A decade value that selects no rows is left out of the result
        if rows.empty:
            continue
        pairs_by_decade[period] = count_country_pairs(rows, country_col)

    return pairs_by_decade

def plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2):
    """
    Create and save visualizations of country co-appearances

    Builds an interactive folium map (markers for the top countries, lines for
    country pairs), saves it as HTML under <subject_main_dir>/countries, then
    delegates to plot_decade_analysis and create_plotly_choropleth.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
        ('Countries_flat') and a 'decade' column
    subject_main_dir (str/Path): Main directory for saving visualizations
    min_weight (int): Minimum number of co-appearances for a pair to be drawn
    """
    # Load world map data
    # Handle the deprecated dataset by downloading directly
    try:
        # Try the old method first for backward compatibility
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    except (AttributeError, ModuleNotFoundError):
        # If that fails, download the data directly
        world_url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
        world = gpd.read_file(world_url)

    # Print available columns for debugging
    print(f"Available columns in world dataframe: {world.columns.tolist()}")

    # Get country positions, keyed by country code, as (lat, lon)
    country_centroids = get_country_centroids(world)

    # Try to build a mapping of country codes to full names
    country_names = {}
    name_cols = ['NAME', 'ADMIN', 'name', 'admin', 'NAME_EN', 'name_en']
    code_cols = ['ISO_A2', 'ISO_A3', 'iso_a2', 'iso_a3']

    # Find valid name and code columns (first match wins)
    valid_name_col = next((col for col in name_cols if col in world.columns), None)
    valid_code_col = next((col for col in code_cols if col in world.columns), None)

    if valid_name_col and valid_code_col:
        for code, name in zip(world[valid_code_col], world[valid_name_col]):
            if code not in ['-99', '-1', None, '']:  # Skip placeholder codes
                country_names[code] = name

    # Count country pairs
    pair_counts = count_country_pairs(df_flattened)

    # Get top countries
    top_countries = get_top_countries(df_flattened)

    # Create interactive folium map
    map_center = [20, 0]  # Center of the map
    m = folium.Map(location=map_center, zoom_start=2, tiles='CartoDB positron')

    # Add country markers (only for countries whose position is known)
    for country, count in top_countries.items():
        if country not in country_centroids:
            continue
        lat, lon = country_centroids[country]

        # Get full country name if available
        country_label = country_names.get(country, country)

        folium.CircleMarker(
            location=[lat, lon],
            radius=min(20, max(5, np.log1p(count))),  # Scale marker size by log of count
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=f"{country_label}: {count} appearances"
        ).add_to(m)

        # Add country label
        folium.Marker(
            location=[lat, lon],
            icon=folium.DivIcon(
                icon_size=(150, 36),
                icon_anchor=(75, 18),
                html=f'<div style="font-size: 10pt; color: black; font-weight: bold; text-align: center;">{country_label}</div>'
            )
        ).add_to(m)

    # Add connections between countries
    for (country1, country2), weight in pair_counts.items():
        if weight < min_weight:
            continue
        if country1 not in country_centroids or country2 not in country_centroids:
            continue
        lat1, lon1 = country_centroids[country1]
        lat2, lon2 = country_centroids[country2]

        # Scale line weight
        line_weight = min(10, max(1, np.log1p(weight) / 2))

        # Create line with lighter color (light blue)
        folium.PolyLine(
            locations=[[lat1, lon1], [lat2, lon2]],
            weight=line_weight,
            color='#ADD8E6',  # Light blue
            opacity=min(0.8, max(0.2, weight / 100)),
            popup=f"{country1}-{country2}: {weight} co-appearances"
        ).add_to(m)

    # Save interactive map
    map_path = Path(subject_main_dir) / 'countries' / 'country_coappearance_network.html'
    map_path.parent.mkdir(parents=True, exist_ok=True)
    m.save(str(map_path))

    # Analyze co-appearances by decade
    decade_pairs = analyze_country_coappearances_by_decade(df_flattened)

    # Create visualizations for decade analysis; the dataframe is passed
    # explicitly so the per-decade bar chart uses the same data as the map.
    plot_decade_analysis(decade_pairs, top_countries, subject_main_dir,
                         df_flattened=df_flattened)

    # Create plotly express choropleth map
    create_plotly_choropleth(df_flattened, world, subject_main_dir)

def plot_decade_analysis(decade_pairs, top_countries, subject_main_dir, df_flattened=None):
    """
    Create and save visualizations for decade analysis

    Produces a heatmap of the country pairs that recur across decades and a
    row of bar charts with the top 5 countries of each decade. Both figures
    are handed to save_plot under the "countries" sub-directory.

    Parameters:
    decade_pairs (dict): Dictionary of decades with country pair counts
        (decade -> Counter)
    top_countries (dict): Dictionary of top countries (kept for interface
        compatibility; not used by the current plots)
    subject_main_dir (str/Path): Main directory for saving visualizations
    df_flattened (DataFrame, optional): DataFrame with 'Countries_flat' and
        'decade' columns used for the per-decade bar chart. When omitted, a
        notebook-level variable named ``df_flattened`` is used if one exists
        (legacy behaviour); otherwise the bar chart is skipped.
    """
    decades = sorted(decade_pairs.keys())

    # Track, for each pair, the decades in which it is among the top 50 pairs
    persistent_pairs = {}
    for decade, pairs in decade_pairs.items():
        for pair, count in pairs.most_common(50):  # Consider top 50 pairs
            persistent_pairs.setdefault(pair, []).append((decade, count))

    # Filter for pairs that appear in multiple decades
    multi_decade_pairs = {pair: seen for pair, seen in persistent_pairs.items()
                          if len(seen) > 1}

    # Take top 20 most persistent pairs (by number of decades)
    sorted_persistent = sorted(multi_decade_pairs.items(),
                               key=lambda x: len(x[1]), reverse=True)[:20]

    # Prepare data for heatmap: one row per pair, one column per decade
    heatmap_data = []
    pair_labels = []
    for pair, decade_counts in sorted_persistent:
        pair_labels.append(f"{pair[0]}-{pair[1]}")
        decade_dict = dict(decade_counts)
        heatmap_data.append([decade_dict.get(decade, 0) for decade in decades])

    if heatmap_data:
        # Create and save heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(heatmap_data, cmap="YlOrRd", annot=True, fmt="d",
                    xticklabels=decades, yticklabels=pair_labels)
        plt.title("Persistent Country Co-appearances by Decade")
        plt.xlabel("Decade")
        plt.ylabel("Country Pair")
        plt.tight_layout()

        # Save the plot
        save_plot(plt, "persistent_coappearances_by_decade.png", subject_main_dir, "countries")
    else:
        # sns.heatmap cannot draw an empty matrix
        print("No country pairs persist across multiple decades; skipping heatmap.")

    if df_flattened is None:
        # Legacy behaviour: earlier versions read a notebook-level
        # ``df_flattened`` instead of taking it as an argument.
        df_flattened = globals().get('df_flattened')
    if df_flattened is None:
        print("No dataframe available for per-decade country counts; skipping bar chart.")
        return

    # Create bar chart of top countries by decade
    plt.figure(figsize=(14, 8))

    # Count country appearances by decade
    decade_country_counts = {
        decade: get_top_countries(df_flattened[df_flattened['decade'] == decade], top_n=10)
        for decade in decades
    }

    # Plot top 5 countries for each decade, one subplot per decade
    for position, decade in enumerate(decades, start=1):
        counts = decade_country_counts[decade]
        countries = list(counts.keys())[:5]  # Top 5
        values = [counts[c] for c in countries]

        plt.subplot(1, len(decades), position)
        plt.barh(countries, values, color='skyblue')
        plt.title(f"Decade: {decade}")
        plt.xlabel("Count")
        if position == 1:
            plt.ylabel("Country")

    plt.tight_layout()

    # Save the plot
    save_plot(plt, "top_countries_by_decade.png", subject_main_dir, "countries")

def create_plotly_choropleth(df_flattened, world, subject_main_dir):
    """
    Create a Plotly Express choropleth map for country appearances

    Country codes are counted with get_top_countries and, where the world
    GeoDataFrame provides both 2-letter and 3-letter ISO columns, translated
    to ISO-3 because the map is drawn with locationmode='ISO-3'. Codes with
    no translation are passed through unchanged. The figure is saved as HTML
    and PNG under <subject_main_dir>/countries.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
    world (GeoDataFrame): World map GeoDataFrame (used only for the
        ISO-2 -> ISO-3 lookup)
    subject_main_dir (str/Path): Main directory for saving visualizations
    """
    # Count country appearances
    country_counts = get_top_countries(df_flattened, top_n=None)  # Get all countries

    # Explicit columns keep the frame usable even when there are no countries
    choropleth_df = pd.DataFrame(
        list(country_counts.items()), columns=['country_code', 'count']
    )

    # Build an ISO-2 -> ISO-3 lookup from whichever column pairs exist.
    # Earlier pairs take precedence; later ones only fill gaps.
    invalid_codes = ('-99', '-1', None, '')
    world_columns = getattr(world, 'columns', ())
    iso2_to_iso3 = {}
    for iso2_col, iso3_col in (('ISO_A2', 'ISO_A3'),
                               ('ISO_A2_EH', 'ISO_A3_EH'),
                               ('iso_a2', 'iso_a3')):
        if iso2_col in world_columns and iso3_col in world_columns:
            for iso2, iso3 in zip(world[iso2_col], world[iso3_col]):
                if iso2 not in invalid_codes and iso3 not in invalid_codes:
                    iso2_to_iso3.setdefault(iso2, iso3)

    # Plotly only supports ISO-3, country names, USA-states, or geojson-id
    locationmode = 'ISO-3'
    choropleth_df['iso3_code'] = [
        iso2_to_iso3.get(code, code) for code in choropleth_df['country_code']
    ]

    # Create choropleth map
    fig = px.choropleth(
        choropleth_df,
        locations='iso3_code',
        color='count',
        hover_name='country_code',
        color_continuous_scale=px.colors.sequential.Plasma,
        title='Global Country Appearances Count',
        locationmode=locationmode,
        projection='natural earth'
    )

    # Improve layout
    fig.update_layout(
        margin={"r":0,"t":40,"l":0,"b":0},
        coloraxis_colorbar=dict(
            title="Appearances"
        )
    )

    # Save as HTML
    plotly_path = Path(subject_main_dir) / 'countries' / 'country_appearances_choropleth.html'
    plotly_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(plotly_path))

    # Save as image (PNG)
    img_path = plotly_path.with_suffix('.png')
    fig.write_image(str(img_path), width=1200, height=800)

    print(f"Saved choropleth visualization to {plotly_path} and {img_path}")

# Example usage: runs only when this code executes as the main module
# (which is the case inside a notebook kernel).
if __name__ == "__main__":
    # Assuming df_flattened is already loaded with columns:
    # - Countries_flat: lists of country codes
    # - decade: decade information
    # NOTE(review): df_flattened is not created in this cell; presumably an
    # earlier cell defines it -- the call below fails with NameError otherwise.
    
    # Example data structure (comment out when using real data)
    # df_flattened = pd.DataFrame({
    #     'Countries_flat': [
    #         ['US', 'CA', 'MX'], 
    #         ['FR', 'DE', 'IT'],
    #         ['US', 'UK', 'FR'],
    #         ['CN', 'JP', 'KR']
    #     ],
    #     'decade': ['1990s', '1990s', '2000s', '2010s']
    # })
    
    # Main directory: the existing notebook-level `subject_main_dir` is used
    # as-is; nothing is (re)defined here.
    
    # Create and save visualizations
    plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import MarkerCluster
from itertools import combinations
from collections import Counter
import json
import matplotlib.pyplot as plt
import seaborn as sns
import kaleido  # Required for plotly.write_image
import io
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from PIL import Image

# Utility functions for country network analysis
def create_country_pairs(countries_list):
    """
    Create all unordered pairs of distinct countries from a list of countries

    Duplicate codes in the input are collapsed first, so a country is never
    paired with itself and each pair is produced at most once per list.
    Every pair is ordered alphabetically, which makes ('CA', 'US') and
    ('US', 'CA') the same key when pairs are counted.

    Parameters:
    countries_list (list): List of country codes (may contain duplicates)

    Returns:
    list: Sorted list of country pairs (tuples); empty when fewer than two
        distinct countries are given
    """
    if not countries_list:
        return []

    # Deduplicate before pairing: repeated codes would otherwise yield
    # self-pairs such as ('US', 'US') and double-count real pairs.
    unique_countries = sorted(set(countries_list))
    return list(combinations(unique_countries, 2))

def count_country_pairs(df, country_col='Countries_flat'):
    """
    Tally how often each pair of countries appears together in a row

    Parameters:
    df (DataFrame): DataFrame with country lists
    country_col (str): Name of column containing country lists

    Returns:
    Counter: Mapping of country pair (tuple) to number of co-appearances
    """
    tally = Counter()

    for entry in df[country_col]:
        # Only real lists with at least two members can contribute a pair
        if not isinstance(entry, list) or len(entry) < 2:
            continue
        tally.update(create_country_pairs(entry))

    return tally

def get_country_centroids(world_gdf, iso_cols=('iso_a2', 'ISO_A2', 'ISO_A2_EH')):
    """
    Get a representative point for each country

    The first name in iso_cols that exists as a column is used as the country
    identifier; if none exists, 'ADMIN' and then 'NAME' are tried as fallbacks.

    Parameters:
    world_gdf (GeoDataFrame): World map GeoDataFrame with a 'geometry' column
    iso_cols (sequence): Possible column names for ISO country codes, in
        order of preference

    Returns:
    dict: Dictionary mapping country codes to (latitude, longitude), i.e. the
        order folium expects. Empty when no identifier column is found.
    """
    country_positions = {}

    # Determine which ISO column exists in the dataframe
    available_cols = [col for col in iso_cols if col in world_gdf.columns]

    if available_cols:
        iso_col = available_cols[0]
        print(f"Using '{iso_col}' for country codes.")
    else:
        print(f"Warning: No ISO country code columns found in the dataframe. Available columns: {world_gdf.columns.tolist()}")
        # Try to use a different identifier as fallback
        if 'ADMIN' in world_gdf.columns:
            print("Using 'ADMIN' column as fallback. Note: This might not match your country codes.")
            iso_col = 'ADMIN'
        elif 'NAME' in world_gdf.columns:
            print("Using 'NAME' column as fallback. Note: This might not match your country codes.")
            iso_col = 'NAME'
        else:
            print("No suitable country identifier columns found.")
            return country_positions

    # Print first few values to help with debugging
    print(f"Sample values from '{iso_col}' column: {world_gdf[iso_col].head().tolist()}")

    # Placeholder values that do not identify a country
    invalid_codes = ('-99', '-1', None, '')

    for code, geometry in zip(world_gdf[iso_col], world_gdf['geometry']):
        if code in invalid_codes:
            continue
        # representative_point() is used rather than the centroid attribute
        point = geometry.representative_point()
        country_positions[code] = (point.y, point.x)  # lat, lon for folium

    return country_positions

def get_top_countries(df, country_col='Countries_flat', top_n=20):
    """
    Rank countries by how many times they appear across all rows

    Parameters:
    df (DataFrame): DataFrame with country lists
    country_col (str): Name of column containing country lists
    top_n (int): Number of top countries to return (None returns every country)

    Returns:
    dict: Country code -> appearance count, most frequent first
    """
    # Rows whose value is not a list are ignored
    appearances = Counter(
        code
        for entry in df[country_col]
        if isinstance(entry, list)
        for code in entry
    )
    ranked = appearances.most_common(top_n)
    return dict(ranked)

def analyze_country_coappearances_by_decade(df, country_col='Countries_flat', decade_col='decade'):
    """
    Count country co-appearances separately for every decade

    Parameters:
    df (DataFrame): DataFrame with country lists and decade information
    country_col (str): Name of column containing country lists
    decade_col (str): Name of column containing decade information

    Returns:
    dict: Decade -> Counter of country pairs for the rows of that decade
    """
    pairs_by_decade = {}

    for period in df[decade_col].unique():
        rows = df[df[decade_col] == period]
        # A decade value that selects no rows is left out of the result
        if rows.empty:
            continue
        pairs_by_decade[period] = count_country_pairs(rows, country_col)

    return pairs_by_decade

def save_folium_as_png(map_obj, output_path, width=1200, height=800):
    """
    Save a folium map as a PNG file

    The map is written to a temporary HTML file next to the target, opened in
    headless Chrome through selenium, and captured with a screenshot.
    Rendering is best-effort: failures are reported on stdout, not raised.
    The browser and the temporary HTML file are always cleaned up.

    Parameters:
    map_obj (folium.Map): Folium map object
    output_path (str/Path): Path to save the PNG file
    width (int): Width of the output image
    height (int): Height of the output image
    """
    output_path = Path(output_path)

    # Build the temp name from the stem rather than with str.replace('.png', ...):
    # a target without a '.png' suffix would otherwise share its path with the
    # temp file (and be deleted afterwards), and '.png' inside a directory
    # name would be rewritten as well.
    temp_html = output_path.with_name(f"{output_path.stem}_temp.html")
    map_obj.save(str(temp_html))

    driver = None
    try:
        # Set up Chrome options for headless rendering
        chrome_options = Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            f"--window-size={width},{height}",
        ):
            chrome_options.add_argument(flag)

        driver = webdriver.Chrome(options=chrome_options)

        # as_uri() yields a correctly escaped file:// URL on every platform
        driver.get(temp_html.absolute().as_uri())

        # Give the map tiles time to load before capturing
        time.sleep(3)

        driver.save_screenshot(str(output_path))
        print(f"Saved map as PNG: {output_path}")

    except Exception as e:
        # Deliberately broad: PNG export is optional and must not abort the run
        print(f"Error saving map as PNG: {e}")
        print("Make sure you have installed Chrome and chromedriver")
        print("Falling back to just HTML output")

    finally:
        # Release the browser even when rendering failed part-way
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing browser: {e}")
        # Remove the temporary HTML file on success and on failure alike
        if temp_html.exists():
            temp_html.unlink()
        
def plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2):
    """
    Build the country co-appearance outputs and write them to disk

    Draws an interactive folium map (markers for the most frequent countries,
    lines for country pairs), attempts a PNG capture of that map, then runs
    the per-decade plots and the plotly choropleth.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
    subject_main_dir (str/Path): Root folder; the map files go into its 'countries' sub-folder
    min_weight (int): Smallest pair count for which a connecting line is drawn
    """
    # The bundled dataset accessor is deprecated; when it is unavailable the
    # Natural Earth archive is downloaded instead
    try:
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    except (AttributeError, ModuleNotFoundError):
        world = gpd.read_file(
            "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
        )

    # Debug aid: column names differ between the two data sources
    print(f"Available columns in world dataframe: {world.columns.tolist()}")

    country_centroids = get_country_centroids(world)

    # Use the first name/code columns that the world table actually provides
    name_candidates = ['NAME', 'ADMIN', 'name', 'admin', 'NAME_EN', 'name_en']
    code_candidates = ['ISO_A2', 'ISO_A3', 'iso_a2', 'iso_a3']
    name_field = next((c for c in name_candidates if c in world.columns), None)
    code_field = next((c for c in code_candidates if c in world.columns), None)

    # Country code -> full name, used for marker labels
    country_names = {}
    if name_field and code_field:
        for _, record in world.iterrows():
            code = record[code_field]
            if code in ['-99', '-1', None, '']:
                continue
            country_names[code] = record[name_field]

    pair_counts = count_country_pairs(df_flattened)
    top_countries = get_top_countries(df_flattened)

    # Interactive map centred on [20, 0]
    m = folium.Map(location=[20, 0], zoom_start=2, tiles='CartoDB positron')

    # One circle plus one text label per top country with a known position
    for code, appearances in top_countries.items():
        if code not in country_centroids:
            continue
        lat, lon = country_centroids[code]
        label = country_names.get(code, code)

        circle = folium.CircleMarker(
            location=[lat, lon],
            radius=min(20, max(5, np.log1p(appearances))),  # log-scaled, clamped to 5..20
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=f"{label}: {appearances} appearances"
        )
        circle.add_to(m)

        text_icon = folium.DivIcon(
            icon_size=(150, 36),
            icon_anchor=(75, 18),
            html=f'<div style="font-size: 10pt; color: black; font-weight: bold; text-align: center;">{label}</div>'
        )
        folium.Marker(location=[lat, lon], icon=text_icon).add_to(m)

    # One line per sufficiently frequent pair whose endpoints both have positions
    for (code_a, code_b), weight in pair_counts.items():
        drawable = (
            weight >= min_weight
            and code_a in country_centroids
            and code_b in country_centroids
        )
        if not drawable:
            continue
        lat_a, lon_a = country_centroids[code_a]
        lat_b, lon_b = country_centroids[code_b]

        stroke = min(10, max(1, np.log1p(weight) / 2))
        alpha = min(0.8, max(0.2, weight / 100))

        folium.PolyLine(
            locations=[[lat_a, lon_a], [lat_b, lon_b]],
            weight=stroke,
            color='#ADD8E6',  # Light blue
            opacity=alpha,
            popup=f"{code_a}-{code_b}: {weight} co-appearances"
        ).add_to(m)

    # Write the interactive map
    out_dir = Path(subject_main_dir) / 'countries'
    out_dir.mkdir(parents=True, exist_ok=True)
    map_path = out_dir / 'country_coappearance_network.html'
    m.save(str(map_path))

    # PNG capture is optional; a failure only produces a warning
    png_path = map_path.with_suffix('.png')
    try:
        save_folium_as_png(m, png_path, width=1600, height=1000)
    except Exception as e:
        print(f"Warning: Could not save map as PNG. Error: {e}")
        print("Make sure you have installed the required packages: selenium, pillow, and have Chrome/chromedriver available.")

    # Per-decade breakdown and its plots
    decade_pairs = analyze_country_coappearances_by_decade(df_flattened)
    plot_decade_analysis(decade_pairs, top_countries, subject_main_dir)

    # Plotly choropleth of appearance counts
    create_plotly_choropleth(df_flattened, world, subject_main_dir)

def plot_decade_analysis(decade_pairs, top_countries, subject_main_dir, df_flattened=None):
    """
    Create and save visualizations for decade analysis

    Two figures are handed to `save_plot` for the "countries" sub-folder:
    a heatmap of country pairs that rank among the 50 most frequent pairs in
    more than one decade, and one bar chart per decade showing its five most
    frequent countries.

    Parameters:
    decade_pairs (dict): Dictionary of decades with country pair counts (a Counter per decade)
    top_countries (dict): Dictionary of top countries (accepted for interface
                          compatibility; not used by the plots)
    subject_main_dir (str/Path): Main directory for saving visualizations
    df_flattened (DataFrame, optional): DataFrame with 'Countries_flat' and 'decade'
                          columns, needed for the bar charts. When omitted, a
                          module-level `df_flattened` is used if one exists;
                          otherwise the bar charts are skipped.
    """
    decades = sorted(decade_pairs.keys())
    if not decades:
        print("No decade data available; skipping decade analysis plots.")
        return

    # For every pair, collect (decade, count) for each decade in which the
    # pair is among that decade's 50 most frequent pairs
    persistent_pairs = {}
    for decade, pairs in decade_pairs.items():
        for pair, count in pairs.most_common(50):
            persistent_pairs.setdefault(pair, []).append((decade, count))

    # Keep only pairs seen in more than one decade
    multi_decade_pairs = {
        pair: entries
        for pair, entries in persistent_pairs.items()
        if len(entries) > 1
    }

    # The 20 pairs present in the most decades
    sorted_persistent = sorted(
        multi_decade_pairs.items(), key=lambda item: len(item[1]), reverse=True
    )[:20]

    heatmap_data = []
    pair_labels = []
    for pair, decade_counts in sorted_persistent:
        counts_by_decade = dict(decade_counts)
        pair_labels.append(f"{pair[0]}-{pair[1]}")
        # Decades in which the pair was not in the top 50 show as 0
        heatmap_data.append([counts_by_decade.get(decade, 0) for decade in decades])

    if heatmap_data:
        plt.figure(figsize=(12, 10))
        sns.heatmap(heatmap_data, cmap="YlOrRd", annot=True, fmt="d",
                    xticklabels=decades, yticklabels=pair_labels)
        plt.title("Persistent Country Co-appearances by Decade")
        plt.xlabel("Decade")
        plt.ylabel("Country Pair")
        plt.tight_layout()
        save_plot(plt, "persistent_coappearances_by_decade.png", subject_main_dir, "countries")
    else:
        # An empty matrix cannot be drawn as a heatmap
        print("No country pair persists across multiple decades; skipping heatmap.")

    # The bar charts need the row-level data. Earlier versions read a global
    # `df_flattened`; keep that as a fallback so existing callers still work.
    if df_flattened is None:
        df_flattened = globals().get('df_flattened')
    if df_flattened is None:
        print("df_flattened not provided; skipping top-countries-by-decade plot.")
        return

    plt.figure(figsize=(14, 8))
    for i, decade in enumerate(decades):
        decade_df = df_flattened[df_flattened['decade'] == decade]
        counts = get_top_countries(decade_df, top_n=5)
        countries = list(counts.keys())
        values = [counts[c] for c in countries]

        plt.subplot(1, len(decades), i + 1)
        plt.barh(countries, values, color='skyblue')
        plt.title(f"Decade: {decade}")
        plt.xlabel("Count")
        if i == 0:
            plt.ylabel("Country")

    # Lay out once, after all subplots exist
    plt.tight_layout()
    save_plot(plt, "top_countries_by_decade.png", subject_main_dir, "countries")

def create_plotly_choropleth(df_flattened, world, subject_main_dir):
    """
    Create a Plotly Express choropleth map for country appearances

    Plotly's 'ISO-3' location mode only recognises three-letter codes, so
    two-letter codes in the data are translated using the ISO columns of
    `world` when those columns are present. Codes without a translation are
    passed through unchanged.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
    world (GeoDataFrame): World map GeoDataFrame (source of the ISO-2 -> ISO-3 lookup)
    subject_main_dir (str/Path): Main directory for saving visualizations
    """
    # Count appearances of every country (top_n=None keeps them all)
    country_counts = get_top_countries(df_flattened, top_n=None)
    if not country_counts:
        # px.choropleth cannot be built from a frame without the expected columns
        print("No country data available; skipping choropleth.")
        return

    # Build the ISO-2 -> ISO-3 lookup from whichever column pairs exist
    world_columns = getattr(world, 'columns', [])
    iso2_to_iso3 = {}
    for a2_col, a3_col in (('ISO_A2', 'ISO_A3'), ('ISO_A2_EH', 'ISO_A3_EH'), ('iso_a2', 'iso_a3')):
        if a2_col not in world_columns or a3_col not in world_columns:
            continue
        for a2, a3 in zip(world[a2_col], world[a3_col]):
            # isalpha() rejects placeholder codes such as '-99'
            if (isinstance(a2, str) and isinstance(a3, str)
                    and len(a2) == 2 and len(a3) == 3
                    and a2.isalpha() and a3.isalpha()):
                iso2_to_iso3.setdefault(a2, a3)

    # Re-key the counts by ISO-3, merging entries that map to the same code
    iso3_counts = Counter()
    for code, count in country_counts.items():
        iso3_counts[iso2_to_iso3.get(code, code)] += count

    choropleth_df = pd.DataFrame({
        'country_code': list(iso3_counts.keys()),
        'count': list(iso3_counts.values()),
    })

    # Plotly only supports ISO-3, country names, USA-states, or geojson-id
    locationmode = 'ISO-3'

    fig = px.choropleth(
        choropleth_df,
        locations='country_code',
        color='count',
        hover_name='country_code',
        color_continuous_scale=px.colors.sequential.Plasma,
        title='Global Country Appearances Count',
        locationmode=locationmode,
        projection='natural earth'
    )

    fig.update_layout(
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        coloraxis_colorbar=dict(
            title="Appearances"
        )
    )

    # Save as HTML
    plotly_path = Path(subject_main_dir) / 'countries' / 'country_appearances_choropleth.html'
    plotly_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(plotly_path))

    # Save as image (PNG)
    img_path = plotly_path.with_suffix('.png')
    fig.write_image(str(img_path), width=1200, height=800)

    print(f"Saved choropleth visualization to {plotly_path} and {img_path}")

# Example usage
if __name__ == "__main__":
    # Expects `df_flattened` to already exist in the session with columns:
    # - Countries_flat: lists of country codes
    # - decade: decade information
    #
    # To try the pipeline without real data, uncomment this toy frame:
    # df_flattened = pd.DataFrame({
    #     'Countries_flat': [
    #         ['US', 'CA', 'MX'],
    #         ['FR', 'DE', 'IT'],
    #         ['US', 'GB', 'FR'],
    #         ['CN', 'JP', 'KR']
    #     ],
    #     'decade': ['1990s', '1990s', '2000s', '2010s']
    # })

    # Create and save all country visualizations under `subject_main_dir`
    # (defined in the setup cell at the top of the file)
    plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import MarkerCluster
from itertools import combinations
from collections import Counter
import json
import matplotlib.pyplot as plt
import seaborn as sns
import kaleido  # Required for plotly.write_image
import io
import time
from PIL import Image

# Utility functions for country network analysis
def create_country_pairs(countries_list):
    """
    Create all possible pairs of distinct countries from a list of countries

    Repeated codes are collapsed first, so a country is never paired with
    itself and each pair is produced at most once per input list.

    Parameters:
    countries_list (list): List of country codes (tuples and sets are accepted too)

    Returns:
    list: Alphabetically ordered list of country pairs (tuples); empty when
          fewer than two distinct countries are given or the input is not a
          collection (e.g. a missing value)
    """
    # Missing values arrive as None/NaN rather than as an empty list
    if not isinstance(countries_list, (list, tuple, set, frozenset)):
        return []

    # Sorting gives every pair a canonical order: ('CA', 'US'), never ('US', 'CA')
    distinct = sorted(set(countries_list))
    if len(distinct) < 2:
        return []

    return list(combinations(distinct, 2))

def count_country_pairs(df, country_col='Countries_flat'):
    """
    Tally how often each country pair occurs together in a row

    Parameters:
    df (DataFrame): DataFrame with country lists
    country_col (str): Column holding the country lists

    Returns:
    Counter: Country pair (tuple) -> number of rows containing that pair
    """
    tally = Counter()

    for entry in df[country_col]:
        # Only real lists with at least two entries can form a pair
        if not isinstance(entry, list) or len(entry) < 2:
            continue
        tally.update(create_country_pairs(entry))

    return tally

def get_country_centroids(world_gdf, iso_cols=('iso_a2', 'ISO_A2', 'ISO_A2_EH')):
    """
    Get a representative point for each country

    The first of `iso_cols` present in the frame supplies the keys. When none
    is present, 'ADMIN' and then 'NAME' are tried as a fallback. Rows with a
    placeholder or missing code, or without geometry, are skipped.

    Parameters:
    world_gdf (GeoDataFrame): World map GeoDataFrame
    iso_cols (sequence): Possible column names for ISO country codes, in order of preference

    Returns:
    dict: Dictionary mapping country codes to (latitude, longitude), the
          order folium expects
    """
    # Determine which ISO column exists in the dataframe
    available_cols = [col for col in iso_cols if col in world_gdf.columns]

    if available_cols:
        iso_col = available_cols[0]
        print(f"Using '{iso_col}' for country codes.")
    else:
        print(f"Warning: No ISO country code columns found in the dataframe. Available columns: {world_gdf.columns.tolist()}")
        # Try to use a different identifier as fallback
        iso_col = next((col for col in ('ADMIN', 'NAME') if col in world_gdf.columns), None)
        if iso_col is None:
            print("No suitable country identifier columns found.")
            return {}
        print(f"Using '{iso_col}' column as fallback. Note: This might not match your country codes.")

    # Print first few values to help with debugging
    print(f"Sample values from '{iso_col}' column: {world_gdf[iso_col].head().tolist()}")

    invalid_codes = {'-99', '-1', ''}
    country_positions = {}
    for code, geometry in zip(world_gdf[iso_col], world_gdf.geometry):
        # Non-string codes (None, NaN) and placeholders cannot be matched to the data
        if not isinstance(code, str) or code in invalid_codes:
            continue
        # Rows without usable geometry have no position to offer
        if geometry is None or getattr(geometry, 'is_empty', False):
            continue
        point = geometry.representative_point()
        country_positions[code] = (point.y, point.x)  # lat, lon for folium

    return country_positions

def get_top_countries(df, country_col='Countries_flat', top_n=20):
    """
    Return the most frequently occurring countries with their counts

    Parameters:
    df (DataFrame): DataFrame whose `country_col` cells hold lists of country codes
    country_col (str): Column to read the country lists from
    top_n (int): Number of countries to keep (None keeps all of them)

    Returns:
    dict: Country code -> occurrence count, ordered from most to least frequent
    """
    tally = Counter()

    # Anything that is not a list (missing values, stray scalars) is ignored
    list_cells = (cell for cell in df[country_col] if isinstance(cell, list))
    for cell in list_cells:
        for code in cell:
            tally[code] += 1

    return dict(tally.most_common(top_n))

def analyze_country_coappearances_by_decade(df, country_col='Countries_flat', decade_col='decade'):
    """
    Break the country pair counts down by decade

    Parameters:
    df (DataFrame): DataFrame with country lists and decade information
    country_col (str): Column holding the country lists
    decade_col (str): Column holding the decade of each row

    Returns:
    dict: Decade -> pair counts as returned by `count_country_pairs`
    """
    # Lazily pair every distinct decade value with the rows that carry it
    slices = (
        (decade_value, df[df[decade_col] == decade_value])
        for decade_value in df[decade_col].unique()
    )

    # Values whose filter matches no row are dropped from the result
    return {
        decade_value: count_country_pairs(rows, country_col)
        for decade_value, rows in slices
        if not rows.empty
    }

def create_static_map(df_flattened, world, country_centroids, pair_counts, top_countries, country_names, min_weight=2):
    """
    Draw the country co-appearance network with matplotlib

    Country borders form the background, pair connections are drawn as lines
    and the top countries as labelled dots.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists (not read here)
    world (GeoDataFrame): World map GeoDataFrame
    country_centroids (dict): Country code -> (lat, lon)
    pair_counts (Counter): Country pair counts
    top_countries (dict): Top countries by count
    country_names (dict): Country code to name mapping
    min_weight (int): Minimum pair count for a connection to be drawn

    Returns:
    matplotlib.figure.Figure: Figure with the map
    """
    fig, ax = plt.subplots(figsize=(16, 10), dpi=150)

    # Background: country outlines
    world.boundary.plot(ax=ax, linewidth=0.5, color='gray')

    # Edges: one line per sufficiently frequent pair with two known endpoints
    for (code_a, code_b), weight in pair_counts.items():
        drawable = (
            weight >= min_weight
            and code_a in country_centroids
            and code_b in country_centroids
        )
        if not drawable:
            continue

        # Centroids are stored as (lat, lon); matplotlib wants x=lon, y=lat
        lat_a, lon_a = country_centroids[code_a]
        lat_b, lon_b = country_centroids[code_b]

        stroke = min(3, max(0.5, np.log1p(weight) / 3))
        alpha = min(0.8, max(0.2, weight / 100))

        ax.plot([lon_a, lon_b], [lat_a, lat_b], color='lightblue',
                linewidth=stroke, alpha=alpha, zorder=1)

    # Nodes: a dot and a text label for every top country with a position
    for code, appearances in top_countries.items():
        if code not in country_centroids:
            continue
        lat, lon = country_centroids[code]

        dot_size = min(150, max(30, np.log1p(appearances) * 20))
        label = country_names.get(code, code)

        ax.scatter(lon, lat, s=dot_size, color='blue', alpha=0.6, zorder=2)
        ax.text(lon, lat, label, fontsize=8, ha='center', va='center',
                color='black', fontweight='bold', zorder=3)

    ax.set_title('Country Co-appearance Network', fontsize=14)
    ax.set_axis_off()
    plt.tight_layout()

    return fig

def plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2):
    """
    Build the country co-appearance outputs and write them to disk

    Draws an interactive folium map (markers for the most frequent countries,
    lines for country pairs), saves a static matplotlib rendering of the same
    network, then runs the per-decade plots and the plotly choropleth.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
    subject_main_dir (str/Path): Root folder; the map files go into its 'countries' sub-folder
    min_weight (int): Smallest pair count for which a connecting line is drawn
    """
    # The bundled dataset accessor is deprecated; when it is unavailable the
    # Natural Earth archive is downloaded instead
    try:
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    except (AttributeError, ModuleNotFoundError):
        world = gpd.read_file(
            "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
        )

    # Debug aid: column names differ between the two data sources
    print(f"Available columns in world dataframe: {world.columns.tolist()}")

    country_centroids = get_country_centroids(world)

    # Use the first name/code columns that the world table actually provides
    name_candidates = ['NAME', 'ADMIN', 'name', 'admin', 'NAME_EN', 'name_en']
    code_candidates = ['ISO_A2', 'ISO_A3', 'iso_a2', 'iso_a3']
    name_field = next((c for c in name_candidates if c in world.columns), None)
    code_field = next((c for c in code_candidates if c in world.columns), None)

    # Country code -> full name, used for marker labels
    country_names = {}
    if name_field and code_field:
        for _, record in world.iterrows():
            code = record[code_field]
            if code in ['-99', '-1', None, '']:
                continue
            country_names[code] = record[name_field]

    pair_counts = count_country_pairs(df_flattened)
    top_countries = get_top_countries(df_flattened)

    # Interactive map centred on [20, 0]
    m = folium.Map(location=[20, 0], zoom_start=2, tiles='CartoDB positron')

    # One circle plus one text label per top country with a known position
    for code, appearances in top_countries.items():
        if code not in country_centroids:
            continue
        lat, lon = country_centroids[code]
        label = country_names.get(code, code)

        circle = folium.CircleMarker(
            location=[lat, lon],
            radius=min(20, max(5, np.log1p(appearances))),  # log-scaled, clamped to 5..20
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=f"{label}: {appearances} appearances"
        )
        circle.add_to(m)

        text_icon = folium.DivIcon(
            icon_size=(150, 36),
            icon_anchor=(75, 18),
            html=f'<div style="font-size: 10pt; color: black; font-weight: bold; text-align: center;">{label}</div>'
        )
        folium.Marker(location=[lat, lon], icon=text_icon).add_to(m)

    # One line per sufficiently frequent pair whose endpoints both have positions
    for (code_a, code_b), weight in pair_counts.items():
        drawable = (
            weight >= min_weight
            and code_a in country_centroids
            and code_b in country_centroids
        )
        if not drawable:
            continue
        lat_a, lon_a = country_centroids[code_a]
        lat_b, lon_b = country_centroids[code_b]

        stroke = min(10, max(1, np.log1p(weight) / 2))
        alpha = min(0.8, max(0.2, weight / 100))

        folium.PolyLine(
            locations=[[lat_a, lon_a], [lat_b, lon_b]],
            weight=stroke,
            color='#ADD8E6',  # Light blue
            opacity=alpha,
            popup=f"{code_a}-{code_b}: {weight} co-appearances"
        ).add_to(m)

    # Write the interactive map
    out_dir = Path(subject_main_dir) / 'countries'
    out_dir.mkdir(parents=True, exist_ok=True)
    map_path = out_dir / 'country_coappearance_network.html'
    m.save(str(map_path))

    # Static rendering of the same network, saved next to the HTML map
    print("Creating static map image...")
    static_fig = create_static_map(
        df_flattened, world, country_centroids,
        pair_counts, top_countries, country_names, min_weight
    )
    static_map_path = Path(subject_main_dir) / 'countries' / 'country_coappearance_network_static.png'
    static_fig.savefig(static_map_path, dpi=300, bbox_inches='tight')
    plt.close(static_fig)
    print(f"Saved static map as: {static_map_path}")

    # Per-decade breakdown and its plots
    decade_pairs = analyze_country_coappearances_by_decade(df_flattened)
    plot_decade_analysis(decade_pairs, top_countries, subject_main_dir)

    # Plotly choropleth of appearance counts
    create_plotly_choropleth(df_flattened, world, subject_main_dir)

def plot_decade_analysis(decade_pairs, top_countries, subject_main_dir, df_flattened=None):
    """
    Create and save visualizations for decade analysis

    Two figures are handed to `save_plot` for the "countries" sub-folder:
    a heatmap of country pairs that rank among the 50 most frequent pairs in
    more than one decade, and one bar chart per decade showing its five most
    frequent countries.

    Parameters:
    decade_pairs (dict): Dictionary of decades with country pair counts (a Counter per decade)
    top_countries (dict): Dictionary of top countries (accepted for interface
                          compatibility; not used by the plots)
    subject_main_dir (str/Path): Main directory for saving visualizations
    df_flattened (DataFrame, optional): DataFrame with 'Countries_flat' and 'decade'
                          columns, needed for the bar charts. When omitted, a
                          module-level `df_flattened` is used if one exists;
                          otherwise the bar charts are skipped.
    """
    decades = sorted(decade_pairs.keys())
    if not decades:
        print("No decade data available; skipping decade analysis plots.")
        return

    # For every pair, collect (decade, count) for each decade in which the
    # pair is among that decade's 50 most frequent pairs
    persistent_pairs = {}
    for decade, pairs in decade_pairs.items():
        for pair, count in pairs.most_common(50):
            persistent_pairs.setdefault(pair, []).append((decade, count))

    # Keep only pairs seen in more than one decade
    multi_decade_pairs = {
        pair: entries
        for pair, entries in persistent_pairs.items()
        if len(entries) > 1
    }

    # The 20 pairs present in the most decades
    sorted_persistent = sorted(
        multi_decade_pairs.items(), key=lambda item: len(item[1]), reverse=True
    )[:20]

    heatmap_data = []
    pair_labels = []
    for pair, decade_counts in sorted_persistent:
        counts_by_decade = dict(decade_counts)
        pair_labels.append(f"{pair[0]}-{pair[1]}")
        # Decades in which the pair was not in the top 50 show as 0
        heatmap_data.append([counts_by_decade.get(decade, 0) for decade in decades])

    if heatmap_data:
        plt.figure(figsize=(12, 10))
        sns.heatmap(heatmap_data, cmap="YlOrRd", annot=True, fmt="d",
                    xticklabels=decades, yticklabels=pair_labels)
        plt.title("Persistent Country Co-appearances by Decade")
        plt.xlabel("Decade")
        plt.ylabel("Country Pair")
        plt.tight_layout()
        save_plot(plt, "persistent_coappearances_by_decade.png", subject_main_dir, "countries")
    else:
        # An empty matrix cannot be drawn as a heatmap
        print("No country pair persists across multiple decades; skipping heatmap.")

    # The bar charts need the row-level data. Earlier versions read a global
    # `df_flattened`; keep that as a fallback so existing callers still work.
    if df_flattened is None:
        df_flattened = globals().get('df_flattened')
    if df_flattened is None:
        print("df_flattened not provided; skipping top-countries-by-decade plot.")
        return

    plt.figure(figsize=(14, 8))
    for i, decade in enumerate(decades):
        decade_df = df_flattened[df_flattened['decade'] == decade]
        counts = get_top_countries(decade_df, top_n=5)
        countries = list(counts.keys())
        values = [counts[c] for c in countries]

        plt.subplot(1, len(decades), i + 1)
        plt.barh(countries, values, color='skyblue')
        plt.title(f"Decade: {decade}")
        plt.xlabel("Count")
        if i == 0:
            plt.ylabel("Country")

    # Lay out once, after all subplots exist
    plt.tight_layout()
    save_plot(plt, "top_countries_by_decade.png", subject_main_dir, "countries")

def create_plotly_choropleth(df_flattened, world, subject_main_dir):
    """
    Create a Plotly Express choropleth map for country appearances

    Plotly's 'ISO-3' location mode only recognises three-letter codes, so
    two-letter codes in the data are translated using the ISO columns of
    `world` when those columns are present. Codes without a translation are
    passed through unchanged.

    Parameters:
    df_flattened (DataFrame): DataFrame with flattened country lists
    world (GeoDataFrame): World map GeoDataFrame (source of the ISO-2 -> ISO-3 lookup)
    subject_main_dir (str/Path): Main directory for saving visualizations
    """
    # Count appearances of every country (top_n=None keeps them all)
    country_counts = get_top_countries(df_flattened, top_n=None)
    if not country_counts:
        # px.choropleth cannot be built from a frame without the expected columns
        print("No country data available; skipping choropleth.")
        return

    # Build the ISO-2 -> ISO-3 lookup from whichever column pairs exist
    world_columns = getattr(world, 'columns', [])
    iso2_to_iso3 = {}
    for a2_col, a3_col in (('ISO_A2', 'ISO_A3'), ('ISO_A2_EH', 'ISO_A3_EH'), ('iso_a2', 'iso_a3')):
        if a2_col not in world_columns or a3_col not in world_columns:
            continue
        for a2, a3 in zip(world[a2_col], world[a3_col]):
            # isalpha() rejects placeholder codes such as '-99'
            if (isinstance(a2, str) and isinstance(a3, str)
                    and len(a2) == 2 and len(a3) == 3
                    and a2.isalpha() and a3.isalpha()):
                iso2_to_iso3.setdefault(a2, a3)

    # Re-key the counts by ISO-3, merging entries that map to the same code
    iso3_counts = Counter()
    for code, count in country_counts.items():
        iso3_counts[iso2_to_iso3.get(code, code)] += count

    choropleth_df = pd.DataFrame({
        'country_code': list(iso3_counts.keys()),
        'count': list(iso3_counts.values()),
    })

    # Plotly only supports ISO-3, country names, USA-states, or geojson-id
    locationmode = 'ISO-3'

    fig = px.choropleth(
        choropleth_df,
        locations='country_code',
        color='count',
        hover_name='country_code',
        color_continuous_scale=px.colors.sequential.Plasma,
        title='Global Country Appearances Count',
        locationmode=locationmode,
        projection='natural earth'
    )

    fig.update_layout(
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        coloraxis_colorbar=dict(
            title="Appearances"
        )
    )

    # Save as HTML
    plotly_path = Path(subject_main_dir) / 'countries' / 'country_appearances_choropleth.html'
    plotly_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(plotly_path))

    # Save as image (PNG)
    img_path = plotly_path.with_suffix('.png')
    fig.write_image(str(img_path), width=1200, height=800)

    print(f"Saved choropleth visualization to {plotly_path} and {img_path}")

# Example usage
if __name__ == "__main__":
    # Expects `df_flattened` to already exist in the session with columns:
    # - Countries_flat: lists of country codes
    # - decade: decade information
    #
    # To try the pipeline without real data, uncomment this toy frame:
    # df_flattened = pd.DataFrame({
    #     'Countries_flat': [
    #         ['US', 'CA', 'MX'],
    #         ['FR', 'DE', 'IT'],
    #         ['US', 'GB', 'FR'],
    #         ['CN', 'JP', 'KR']
    #     ],
    #     'decade': ['1990s', '1990s', '2000s', '2010s']
    # })

    # Create and save all country visualizations under `subject_main_dir`
    # (defined in the setup cell at the top of the file)
    plot_country_coappearance_network(df_flattened, subject_main_dir, min_weight=2)