In [None]:
# Feb 28: saved all functions in utils.py
#addon: SUCCEEDS to remove most frequent words (threshold: 72!)
# (copied cell 1!) SUCCESS at printing out the adjacency matrix per book as a means to show an illustrative example for the thesis, done on November 2, 2024
import os
import glob
import re
import string
import csv
import contractions
import spacy
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import networkx as nx
import community as community_louvain
from gensim.models import KeyedVectors
import gensim.downloader as api
import utils

# Load spaCy model and NLP tools
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Directory and pattern to read text files
directory = r"C:\Users\emine\Readings_Listenings"
pattern = "*.txt"


# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = read_text_files(file_paths)

    
# Load stopwords and headwords
proper_nouns = load_proper_nouns(r"C:\Users\emine\try_env\Lists\mod_proper_nouns_misspelled.csv")

headword_directory = r”C:\Users\emine\try_env\Lists\BNC-COCA_Headwords”

# Load all three headwords lists
headwords_one = load_word_list(r"C:\Users\emine\try_env\Lists\BNC-COCA_Headwords\BNC-COCA_headwords_1000.txt")
headwords_two = load_word_list(r"C:\Users\emine\try_env\Lists\BNC-COCA_Headwords\BNC-COCA_headwords_2000.txt")
headwords_three = load_word_list(r"C:\Users\emine\try_env\Lists\BNC-COCA_Headwords\BNC-COCA_headwords_3000.txt")


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words

        
# Load stop words and proper nouns
proper_nouns = load_proper_nouns(r"C:\Users\emine\data_descriptions\mod_proper_nouns.csv")


# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 72
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)


# Directory for saving and reading files
save_directory = r"C:\Users\emine\try_env\snapshots_RL\10_snapshots_RL"

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Populate the snapshots with text files for each book and unit
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        # Create the text filename for each book and unit (e.g., 1_1_More.txt)
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        # Add the file path to the snapshots dictionary if the file exists
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Assuming global_graph is the overall network that includes all nodes from all books
global_graph = nx.Graph()
global_partition = {}  # Stores community memberships across all books


# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
        
# Process each book's snapshot
for book, file_paths in snapshots.items():
    #print(f"\nProcessing Book {book}:")  # Output the book number
    #print(f"Files considered for this snapshot: {file_paths}")  # Output the list of files
    # Initialize the graph for this book
    if book not in graphs:
        graphs[book] = nx.Graph()

    # Collect all unique words across all units for this book
    all_unique_words = set()
    # Create a dictionary to accumulate co-occurrence counts (for reporting?)
    cooccurrence_counts = defaultdict(lambda: defaultdict(int))

    for file_path in file_paths:
        # Process each text file (unit)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Process the text to get the words
        words = filter_text(text, proper_nouns, stop_words)
        if not words:
            print(f"Warning: No valid words extracted from file {file_path}")
            continue
            
        # Build the co-occurrence matrix with a window size of 9
        window_size = 9
        adj_matrix, unique_words = build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)  # Collect all unique words
        
        # Accumulate co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

    # Convert all unique words to a sorted list
    all_unique_words = sorted(list(all_unique_words))
    num_words = len(all_unique_words)
    print(f"\nBook {book} - Number of unique words (nodes): {num_words}")

    # Create a DataFrame for the book-level adjacency matrix
    df_book_adj_matrix = pd.DataFrame(
        np.zeros((num_words, num_words), dtype=int),
        index=all_unique_words,
        columns=all_unique_words
    )
        
    # Fill the DataFrame with co-occurrence counts
    for word_i, neighbors in cooccurrence_counts.items():
        for word_j, count in neighbors.items():
            df_book_adj_matrix.at[word_i, word_j] = count

    # Calculate the sum of each row (which corresponds to the total co-occurrence count for each word)
    row_sums = df_book_adj_matrix.sum(axis=1)
    
    # Order the DataFrame by the row sums in descending order
    df_book_adj_matrix = df_book_adj_matrix.loc[row_sums.sort_values(ascending=False).index]
    
    # Also reorder columns to match the row ordering
    df_book_adj_matrix = df_book_adj_matrix[df_book_adj_matrix.index]
    
    # Display a sample of 10 x 10 rows and columns
    sample_df = df_book_adj_matrix.iloc[:10, :10]
    print("Sample of 10 x 10 from the adjacency matrix (ordered by word occurrences):")
    print(sample_df)
    
    # Proceed with creating the graph from the DataFrame
    df_ordered = order_dataframe(df_book_adj_matrix)
    edges = adjacency_matrix_df_to_edge_list(df_ordered)
    for edge in edges:
        graphs[book].add_edge(edge[0], edge[1], weight=edge[2])
        global_graph.add_edge(edge[0], edge[1], weight=edge[2])

    
    # perform community detection
    partition = community_louvain.best_partition(graphs[book]) 
    # Store the partition in the global partition
    global_partition.update(partition)
    
    # Assign community and topics to each node before saving
    for node in graphs[book].nodes():
        # Assign topic keywords to each node
        graphs[book].nodes[node]['topics'] = ', '.join(topic_keywords_dict.get(node, {'other'}))

        # Assign community membership to nodes from the global partition
        if node in global_partition:
            graphs[book].nodes[node]['community'] = global_partition[node]
        else:
            # If the node has not been assigned globally, we assign a default community
            graphs[book].nodes[node]['community'] = 'undefined'

        # Assign node shape based on the word lists (list1_words, list2_words, list3_words)
        node_shape = assign_node_shape(node, list1_words, list2_words, list3_words)
        graphs[book].nodes[node]['shape'] = node_shape  # Add shape as an attribute
        
        # Add the node and edges to the global graph (to build a global representation)
        global_graph.add_node(node, **graphs[book].nodes[node])

    #Add edges from bookspecific graph to global graph
    global_graph.add_edges_from(graphs[book].edges(data=True))

    # Check if the graph has nodes and edges
    if graphs[book].number_of_nodes() == 0 or graphs[book].number_of_edges() == 0:
        print(f"Warning: The graph for Book {book} is empty! No nodes or edges.")
    else:
        print(f"Graph for Book {book} has {graphs[book].number_of_nodes()} nodes and {graphs[book].number_of_edges()} edges.")

    # Save the graph for the book once all nodes are processed with their communities
    gexf_filename = os.path.join(save_directory, f"{book}_2024_12_global_good_mod_w9_RL_LESSFREQ.gexf")
    nx.write_gexf(graphs[book], gexf_filename)
    print(f"Graph for Book {book} saved as {gexf_filename}")

    # Calculate network metrics for each snapshot (after community assignment)
    degree_centrality = nx.degree_centrality(graphs[book])
    betweenness_centrality = nx.betweenness_centrality(graphs[book])
    modularity_louvain = community_louvain.modularity(
        {n: graphs[book].nodes[n]['community'] for n in graphs[book].nodes()},
        graphs[book]
    )
   
    # Print out the metrics for comparison
    print(f"Book {book} - Louvain Modularity: {modularity_louvain:.3f}")
    #print(f"Book {book} - Manual Modularity: {modularity_manual:.3f}")
    #print(f"Book {book} - Degree Centrality (top 5 nodes): {sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]}")
    #print(f"Book {book} - Betweenness Centrality (top 5 nodes): {sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:5]}")

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained Word2Vec model (as an example)
# Replace with the correct path to your word embeddings model
word_vectors = KeyedVectors.load_word2vec_format(r"C:\Users\emine\Downloads\GoogleNewsvectorsnegative300.bin", binary=True)

# Function to calculate topic coherence using your adjacency matrix
def calculate_topic_coherence(community_words, adj_matrix, word_to_index):
    coherence_score = 0
    word_pairs = 0
    
    # Calculate the coherence by summing co-occurrences for all word pairs in the community
    for i, word1 in enumerate(community_words):
        for j, word2 in enumerate(community_words):
            if i != j:
                idx1 = word_to_index.get(word1)
                idx2 = word_to_index.get(word2)
                if idx1 is not None and idx2 is not None:
                    coherence_score += adj_matrix[idx1, idx2]
                    word_pairs += 1
    
    # Return average pairwise co-occurrence (topic coherence)
    return coherence_score / word_pairs if word_pairs > 0 else 0

# Function to calculate semantic similarity using word vectors
def calculate_semantic_similarity(community_words, word_vectors):
    similarity_score = 0
    word_pairs = 0
    missing_words = 0 # Track how many words are missing from word_vectors
    
    for i, word1 in enumerate(community_words):
        for j, word2 in enumerate(community_words):
            if i != j:
                try:
                    vec1 = word_vectors[word1]
                    vec2 = word_vectors[word2]
                    # Calculate cosine similarity between word vectors
                    similarity = cosine_similarity([vec1], [vec2])[0][0]
                    similarity_score += similarity
                    word_pairs += 1
                except KeyError:
                    missing_words += 1
                    # Skip words not found in the word vector model
                    continue
    print(f"Skipped {missing_words} word pairs due to missing vectors.")
    
    # Return average pairwise similarity (semantic similarity)
    return similarity_score / word_pairs if word_pairs > 0 else 0

# Create a global partition for modularity analysis for the global graph
communities_global = defaultdict(list)
for node in global_graph.nodes():
    if 'community' in global_graph.nodes[node]:
        community_id = global_graph.nodes[node]['community']
        communities_global[community_id].append(node)
    else:
        # Assign a default community if none is specified
        communities_global['undefined'].append(node)

# Prepare data for CSV: word, assigned community, frequency (counts of node)
data_for_csv = []

for community_id, nodes in communities_global.items():
    community_subgraph = global_graph.subgraph(nodes)
    # Calculate degree of each node (acting as word frequency)
    node_degrees = dict(community_subgraph.degree())

    # Add each word (node) to the list with its community and calculated frequency (degree)
    for word, degree in node_degrees.items():
        data_for_csv.append([word, community_id, degree])
        
# Now, loop over communities and calculate both coherence and similarity
for community_id, community_words in communities_global.items():
    # Rebuild the co-occurrence matrix and word_to_index for this community
    adj_matrix, unique_words = build_cooccurrence_matrix(community_words, window_size=9)
    word_to_index = {word: idx for idx, word in enumerate(unique_words)}
    
    coherence = calculate_topic_coherence(community_words, adj_matrix, word_to_index)
    semantic_similarity = calculate_semantic_similarity(community_words, word_vectors)
    
    print(f"Community {community_id} - Coherence: {coherence}, Semantic Similarity: {semantic_similarity}")

# Write to CSV file
csv_file = '2024_12_community_word_categorization_good_mod_w9_LESSFREQ.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Word', 'Community', 'Frequency'])

    # Write the data
    writer.writerows(data_for_csv)

print(f"Data successfully written to {csv_file}")

# Save the global graph for reference
save_directory = r"C:\Users\emine\try_env\snapshots_RL\10_snapshots_RL"
global_snapshot_filename = os.path.join(save_directory, "2024_12_global_good_w9_mod_RL_LESSFREQ.gexf")
nx.write_gexf(global_graph, global_snapshot_filename)
print(f"Global graph saved as {global_snapshot_filename}")



In [None]:
# Feb 28: saved all functions in utils.py and attempts to consolidate file paths in BASE_DIR
#addon: SUCCEEDS to remove most frequent words (threshold: 72!)
# (copied cell 1!) SUCCESS at printing out the adjacency matrix per book as a means to show an illustrative example for the thesis, done on November 2, 2024
import os
import utils  # Import utility functions
import spacy
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np

random.seed(42)
np.random.seed(42)

# Load spaCy model and NLP tools
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words



# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 72
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Populate the snapshots with text files for each book and unit
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        # Create the text filename for each book and unit (e.g., 1_1_More.txt)
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        # Add the file path to the snapshots dictionary if the file exists
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Assuming global_graph is the overall network that includes all nodes from all books
global_graph = nx.Graph()
global_partition = {}  # Stores community memberships across all books


# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
        
# Process each book's snapshot
for book, file_paths in snapshots.items():
    #print(f"\nProcessing Book {book}:")  # Output the book number
    #print(f"Files considered for this snapshot: {file_paths}")  # Output the list of files
    # Initialize the graph for this book
    if book not in graphs:
        graphs[book] = nx.Graph()

    # Collect all unique words across all units for this book
    all_unique_words = set()
    # Create a dictionary to accumulate co-occurrence counts (for reporting?)
    cooccurrence_counts = defaultdict(lambda: defaultdict(int))

    for file_path in file_paths:
        # Process each text file (unit)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Process the text to get the words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            print(f"Warning: No valid words extracted from file {file_path}")
            continue
            
        # Build the co-occurrence matrix with a window size of 9
        window_size = 9
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)  # Collect all unique words
        
        # Accumulate co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

    # Convert all unique words to a sorted list
    all_unique_words = sorted(list(all_unique_words))
    num_words = len(all_unique_words)
    print(f"\nBook {book} - Number of unique words (nodes): {num_words}")

    # Create a DataFrame for the book-level adjacency matrix
    df_book_adj_matrix = pd.DataFrame(
        np.zeros((num_words, num_words), dtype=int),
        index=all_unique_words,
        columns=all_unique_words
    )
        
    # Fill the DataFrame with co-occurrence counts
    for word_i, neighbors in cooccurrence_counts.items():
        for word_j, count in neighbors.items():
            df_book_adj_matrix.at[word_i, word_j] = count

    # Calculate the sum of each row (which corresponds to the total co-occurrence count for each word)
    row_sums = df_book_adj_matrix.sum(axis=1)
    
    # Order the DataFrame by the row sums in descending order
    df_book_adj_matrix = df_book_adj_matrix.loc[row_sums.sort_values(ascending=False).index]
    
    # Also reorder columns to match the row ordering
    df_book_adj_matrix = df_book_adj_matrix[df_book_adj_matrix.index]
    
    # Display a sample of 10 x 10 rows and columns
    sample_df = df_book_adj_matrix.iloc[:10, :10]
    print("Sample of 10 x 10 from the adjacency matrix (ordered by word occurrences):")
    print(sample_df)
    
    # Proceed with creating the graph from the DataFrame
    df_ordered = utils.order_dataframe(df_book_adj_matrix)
    edges = utils.adjacency_matrix_df_to_edge_list(df_ordered)
    for edge in edges:
        graphs[book].add_edge(edge[0], edge[1], weight=edge[2])
        global_graph.add_edge(edge[0], edge[1], weight=edge[2])

    
    # perform community detection
    partition = community_louvain.best_partition(graphs[book]) 
    # Store the partition in the global partition
    global_partition.update(partition)
    
    # Assign community and topics to each node before saving
    for node in graphs[book].nodes():
        # Assign topic keywords to each node
        graphs[book].nodes[node]['topics'] = ', '.join(topic_keywords_dict.get(node, {'other'}))

        # Assign community membership to nodes from the global partition
        if node in global_partition:
            graphs[book].nodes[node]['community'] = global_partition[node]
        else:
            # If the node has not been assigned globally, we assign a default community
            graphs[book].nodes[node]['community'] = 'undefined'

        # Assign node shape based on the word lists (list1_words, list2_words, list3_words)
        node_shape = utils.assign_node_shape(node, list1_words, list2_words, list3_words)
        graphs[book].nodes[node]['shape'] = node_shape  # Add shape as an attribute
        
        # Add the node and edges to the global graph (to build a global representation)
        global_graph.add_node(node, **graphs[book].nodes[node])

    #Add edges from bookspecific graph to global graph
    global_graph.add_edges_from(graphs[book].edges(data=True))

    # Check if the graph has nodes and edges
    if graphs[book].number_of_nodes() == 0 or graphs[book].number_of_edges() == 0:
        print(f"Warning: The graph for Book {book} is empty! No nodes or edges.")
    else:
        print(f"Graph for Book {book} has {graphs[book].number_of_nodes()} nodes and {graphs[book].number_of_edges()} edges.")

    # Save the graph for the book once all nodes are processed with their communities
    gexf_filename = os.path.join(save_directory, f"{book}_202502_global_good_mod_w9_72threshold.gexf")
    nx.write_gexf(graphs[book], gexf_filename)
    print(f"Graph for Book {book} saved as {gexf_filename}")

    # Calculate network metrics for each snapshot (after community assignment)
    degree_centrality = nx.degree_centrality(graphs[book])
    betweenness_centrality = nx.betweenness_centrality(graphs[book])
    modularity_louvain = community_louvain.modularity(
        {n: graphs[book].nodes[n]['community'] for n in graphs[book].nodes()},
        graphs[book]
    )
   
    # Print out the metrics for comparison
    print(f"Book {book} - Louvain Modularity: {modularity_louvain:.3f}")
    #print(f"Book {book} - Manual Modularity: {modularity_manual:.3f}")
    #print(f"Book {book} - Degree Centrality (top 5 nodes): {sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]}")
    #print(f"Book {book} - Betweenness Centrality (top 5 nodes): {sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:5]}")

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Function to calculate topic coherence using your adjacency matrix
def calculate_topic_coherence(community_words, adj_matrix, word_to_index):
    coherence_score = 0
    word_pairs = 0
    
    # Calculate the coherence by summing co-occurrences for all word pairs in the community
    for i, word1 in enumerate(community_words):
        for j, word2 in enumerate(community_words):
            if i != j:
                idx1 = word_to_index.get(word1)
                idx2 = word_to_index.get(word2)
                if idx1 is not None and idx2 is not None:
                    coherence_score += adj_matrix[idx1, idx2]
                    word_pairs += 1
    
    # Return average pairwise co-occurrence (topic coherence)
    return coherence_score / word_pairs if word_pairs > 0 else 0


# Create a global partition for modularity analysis for the global graph
communities_global = defaultdict(list)
for node in global_graph.nodes():
    if 'community' in global_graph.nodes[node]:
        community_id = global_graph.nodes[node]['community']
        communities_global[community_id].append(node)
    else:
        # Assign a default community if none is specified
        communities_global['undefined'].append(node)

# Prepare data for CSV: word, assigned community, frequency (counts of node)
data_for_csv = []

for community_id, nodes in communities_global.items():
    community_subgraph = global_graph.subgraph(nodes)
    # Calculate degree of each node (acting as word frequency)
    node_degrees = dict(community_subgraph.degree())

    # Add each word (node) to the list with its community and calculated frequency (degree)
    for word, degree in node_degrees.items():
        data_for_csv.append([word, community_id, degree])
        
# Now, loop over communities and calculate both coherence and similarity
for community_id, community_words in communities_global.items():
    # Rebuild the co-occurrence matrix and word_to_index for this community
    adj_matrix, unique_words = utils.build_cooccurrence_matrix(community_words, window_size=9)
    word_to_index = {word: idx for idx, word in enumerate(unique_words)}
    
    coherence = calculate_topic_coherence(community_words, adj_matrix, word_to_index)
    semantic_similarity = calculate_semantic_similarity(community_words, word_vectors)
    
    print(f"Community {community_id} - Coherence: {coherence}, Semantic Similarity: {semantic_similarity}")

utils.save_cooccurrence_matrix(save_directory, adj_matrix, unique_words, "cooccurrence_matrix.csv")

# Write to CSV file
csv_file = '202502_community_word_categorization.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Word', 'Community', 'Frequency'])

    # Write the data
    writer.writerows(data_for_csv)

print(f"Data successfully written to {csv_file}")

# Save the global graph for reference
global_snapshot_filename = os.path.join(save_directory, "202502_global_w9_72_threshold.gexf")
nx.write_gexf(global_graph, global_snapshot_filename)
print(f"Global graph saved as {global_snapshot_filename}")



In [1]:
# attempts to create one global network

# Feb 28: saved all functions in utils.py and attempts to consolidate file paths in BASE_DIR
#addon: SUCCEEDS to remove most frequent words (threshold: 72!)
# (copied cell 1!) SUCCESS at printing out the adjacency matrix per book as a means to show an illustrative example for the thesis, done on November 2, 2024
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words


# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 72
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Define book snapshots before processing
snapshots = {}

modularity_scores = {}  # Initialize empty dictionary to store modularity scores


# Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)

# Create a single global co-occurrence matrix
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))

# Process each book's text and build a combined co-occurrence matrix
for book, file_paths in snapshots.items():
    all_unique_words = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 9
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)
        
        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:  # Prevent self-loops
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

# Build global graph
global_graph = nx.Graph()

# Add nodes
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word)

# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}

# Compute modularity score for the global topic model
global_modularity = community_louvain.modularity(global_partition, global_graph)

print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")
       
# Process books individually
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=9)

        # Assign global topic labels
        for word in unique_words:
            book_graph.add_node(word, topic=word_to_topic.get(word, "unknown"))

        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        # Compute modularity score for each book-specific graph
        book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
        book_modularity = community_louvain.modularity(book_partition, book_graph)
        
        print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
        
        # Store modularity scores for later analysis
        modularity_scores[book] = book_modularity

    graphs[book] = book_graph
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"book_{book}_random_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

    # **Add book-specific graph to global graph**
    global_graph.add_edges_from(book_graph.edges(data=True))

    # Add book-specific grrah to global graph    
    global_graph.add_edges_from(book_graph.edges(data=True))

print(f"Final global graph: {global_graph.number_of_nodes()} nodes, {global_graph.number_of_edges()} edges.")

# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202502_global_random_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Save global topic assignments
output_file = os.path.join(BASE_DIR, "random_global_topic_assignments.csv")
utils.save_output_file(BASE_DIR, "random_global_topic_assignments.csv", "\n".join([f"{w},{t}" for w, t in word_to_topic.items()]))

print(f"Global topic assignments saved to {output_file}.")

# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])
    
    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 72 times:
['number', 'thank', 'get', 'right', 'name', 'eat', 'time', 'day', 'lot', 'food', 'people', 'family', 'big', 'home', 'man', 'old', 'many', 'feel', 'let', 'know', 'problem', 'live', 'boy', 'girl', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'next', 'new', 'start', 'phone', 'book', 'mum', 'dad', 'dream', 'course', 'help', 'want', 'thing', 'see', 'say', 'come', 'hour', 'walk', 'stop', 'watch', 'try', 'tell', 'take', 'friend', 'room', 'work', 'read', 'water', 'leave', 'ask', 'find', 'house', 'look', 'call', 'place', 'happy', 'way', 'party', 'year', 'need', 'child', 'talk', 'make', 'money', 'put', 'last', 'world', 'park', 'happen', 'show', 'woman', 'use', 'story', 'interviewer']
🌍 Global Graph Modularity Score: 0.244
📖 Book 1 - Modularity Score: 0.272
📖 Book 1 - Modularity Score: 0.573
📖 Book 1 - Modularity Score: 0.637
📖 Book 1 - Modularity Score: 0.645
📖 Book 1 - Modularity Score: 0.658
📖 Book 1 - Modularity Score: 0.645
📖 Book 1 

In [2]:
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source to dataframe (first appearance in the book) March 21
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words


# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 72
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Define book snapshots before processing
snapshots = {}

modularity_scores = {}  # Initialize empty dictionary to store modularity scores
# Track first appearance of each token
token_first_appearance = {}

# Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)

# Create a single global co-occurrence matrix
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))

# Process each book's text and build a combined co-occurrence matrix
for book, file_paths in snapshots.items():
    all_unique_words = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 9
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)

        # Track first appearance of each word
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = book
        
        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:  # Prevent self-loops
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

# Build global graph
global_graph = nx.Graph()
# Add nodes
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))
# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}

# Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Compute modularity score for the global topic model
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")
       
# Process books individually
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=9)
        all_words_in_book.update(unique_words)

        # Assign global topic labels
        for word in unique_words:
            book_graph.add_node(word, topic=word_to_topic.get(word, "unknown"), source=token_first_appearance.get(word, book))

        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"2025_03_book_{book}_random_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

    
# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202503_global_random_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "2025_03_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 72 times:
['number', 'thank', 'get', 'right', 'name', 'eat', 'time', 'day', 'lot', 'food', 'people', 'family', 'big', 'home', 'man', 'old', 'many', 'feel', 'let', 'know', 'problem', 'live', 'boy', 'girl', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'next', 'new', 'start', 'phone', 'book', 'mum', 'dad', 'dream', 'course', 'help', 'want', 'thing', 'see', 'say', 'come', 'hour', 'walk', 'stop', 'watch', 'try', 'tell', 'take', 'friend', 'room', 'work', 'read', 'water', 'leave', 'ask', 'find', 'house', 'look', 'call', 'place', 'happy', 'way', 'party', 'year', 'need', 'child', 'talk', 'make', 'money', 'put', 'last', 'world', 'park', 'happen', 'show', 'woman', 'use', 'story', 'interviewer']
🌍 Global Graph Modularity Score: 0.244
📖 Book 1 - Modularity Score: 0.538
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_03_book_1_random_network.gexf
📖 Book 2 - Modularity Score: 0.443
📂 Saved book graph: C:\Users\emine\try_env\2025_02\sn

In [1]:
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source to dataframe (first appearance in the book) March 21
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words


# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 72
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Define book snapshots before processing
snapshots = {}

modularity_scores = {}  # Initialize empty dictionary to store modularity scores
# Track first appearance of each token
token_first_appearance = {}

# Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)

# Create a single global co-occurrence matrix
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))

# Process each book's text and build a combined co-occurrence matrix
for book, file_paths in snapshots.items():
    all_unique_words = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 2
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)

        # Track first appearance of each word
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = book
        
        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:  # Prevent self-loops
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

# Build global graph
global_graph = nx.Graph()
# Add nodes
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))
# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}

# Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Compute modularity score for the global topic model
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")
       
# Process books individually
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=2)
        all_words_in_book.update(unique_words)

        # Assign global topic labels
        for word in unique_words:
            book_graph.add_node(word, topic=word_to_topic.get(word, "unknown"), source=token_first_appearance.get(word, book))

        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"2025_04_w2_book_{book}_random_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

    
# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202504_w2_global_random_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "2025_04_w2_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 72 times:
['number', 'thank', 'get', 'right', 'name', 'eat', 'time', 'day', 'lot', 'food', 'people', 'family', 'big', 'home', 'man', 'old', 'many', 'feel', 'let', 'know', 'problem', 'live', 'boy', 'girl', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'next', 'new', 'start', 'phone', 'book', 'mum', 'dad', 'dream', 'course', 'help', 'want', 'thing', 'see', 'say', 'come', 'hour', 'walk', 'stop', 'watch', 'try', 'tell', 'take', 'friend', 'room', 'work', 'read', 'water', 'leave', 'ask', 'find', 'house', 'look', 'call', 'place', 'happy', 'way', 'party', 'year', 'need', 'child', 'talk', 'make', 'money', 'put', 'last', 'world', 'park', 'happen', 'show', 'woman', 'use', 'story', 'interviewer']
🌍 Global Graph Modularity Score: 0.327
📖 Book 1 - Modularity Score: 0.538
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w2_book_1_random_network.gexf
📖 Book 2 - Modularity Score: 0.440
📂 Saved book graph: C:\Users\emine\try_env\2025_02

In [1]:
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")

# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 120
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Define book snapshots before processing
snapshots = {}

modularity_scores = {}  # Initialize empty dictionary to store modularity scores
# Track first appearance of each token
token_first_appearance = {}

# Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)

# Create a single global co-occurrence matrix
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))

# Process each book's text and build a combined co-occurrence matrix
for book, file_paths in snapshots.items():
    all_unique_words = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

# Build global graph
global_graph = nx.Graph()
# Add nodes
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))

# Add dynamic 'start' time for Gephi timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        global_graph.nodes[word]['start'] = unit_id_to_float(unit_id)

# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}

# Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Compute modularity score for the global topic model
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")
       
# Process books individually
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=3)
        all_words_in_book.update(unique_words)

        
        # Assign global topic labels and timeline
        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            start = unit_id_to_float(source) if "_" in str(source) else None
            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=source,
                start=start,
                end=start + 0.1 if start is not None else None
            )


        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"2025_04_w3_book_{book}_random_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

    
# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202504_w3_global_random_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "2025_04_w3_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 120 times:
['get', 'eat', 'time', 'day', 'lot', 'people', 'big', 'man', 'many', 'let', 'know', 'live', 'like', 'give', 'play', 'school', 'good', 'great', 'think', 'start', 'book', 'mum', 'dad', 'help', 'want', 'thing', 'see', 'say', 'come', 'walk', 'tell', 'take', 'friend', 'work', 'ask', 'find', 'look', 'call', 'year', 'make']
🌍 Global Graph Modularity Score: 0.244
📖 Book 1 - Modularity Score: 0.523
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_1_random_network.gexf
📖 Book 2 - Modularity Score: 0.407
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_2_random_network.gexf
📖 Book 3 - Modularity Score: 0.350
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_3_random_network.gexf
📖 Book 4 - Modularity Score: 0.377
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_4_random_network.gexf
📂 Saved global graph: C:\Users\emine\try_env\2025_02\snapshots\202504

In [2]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

# Define base directory
BASE_DIR = r"C:\Users\emine\try_env\2025_02"

# Directory and pattern to read text files
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"

# Collect file paths
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)

#Load proper nouns and word lists from centralized directory
lists_directory = os.path.join(BASE_DIR, "lists")

# Load manually created proper_nouns file and and BNC-COCA headwords' list
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))

# Load all three headwords lists
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))


# Lemmatize the headword lists to ensure comparison is consistent with the text processing pipeline
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")

# Visualization function (same as provided but generalized for more options)
def plot_word_frequencies(word_frequencies, top_n=20, log_scale=False):
    """
    Plots a bar chart of the top N most frequent words.
    If log_scale is True, applies a logarithmic scale to the y-axis.
    """
    most_common = word_frequencies.most_common(top_n)
    words = [word for word, freq in most_common]
    frequencies = [freq for word, freq in most_common]

    plt.figure(figsize=(12, 8))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words', fontsize=11)
    plt.ylabel('Frequencies' if not log_scale else 'Log(Frequencies)', fontsize=14)
    plt.title(f"Top {top_n} Most Frequent Words", fontsize=16)
    plt.xticks(rotation=90, ha='right', fontsize=11)
    if log_scale:
        plt.yscale('log')
    plt.tight_layout()
    plt.show()

# Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 60
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

# Define save directory inside BASE_DIR
save_directory = os.path.join(BASE_DIR, "snapshots")

# Collect files per book
snapshots = {}
graphs = {}

#Define the file ranges for each book
book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

# Define book snapshots before processing
snapshots = {}

modularity_scores = {}  # Initialize empty dictionary to store modularity scores
# Track first appearance of each token
token_first_appearance = {}

# Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)

# Create a single global co-occurrence matrix
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))

# Process each book's text and build a combined co-occurrence matrix
for book, file_paths in snapshots.items():
    all_unique_words = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

# Build global graph
global_graph = nx.Graph()
# Add nodes
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))

# Add dynamic 'start' time for Gephi timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        global_graph.nodes[word]['start'] = unit_id_to_float(unit_id)

# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}

# Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Compute modularity score for the global topic model
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")

time_windows = []
# First, build a unit_id -> (start, end) lookup based on time_windows
unit_interval_map = {}

for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)


    # Define two time windows per book
    time_windows.append((start_unit, midpoint))
    time_windows.append((midpoint, end_unit))


# Process books individually
graphs = {}
unit_topic_rows = []
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        # Build unit-level topic distribution based on global topics
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1
        
        # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        # Prepare row with topic percentages
        row = {"start": start_float, "end": end_float}
        for topic in sorted(word_to_topic.values()):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        
        unit_topic_rows.append(row)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=3)
        all_words_in_book.update(unique_words)

        
        # Assign global topic labels and timeline
        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            start = unit_id_to_float(source) if "_" in str(source) else None
            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=source,
                start=start,
                end=start + 0.1 if start is not None else None
            )


        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"2025_05_w3_book_{book}__network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")


import json
from collections import defaultdict
import os

# ========== CONFIG ==========
export_dir = os.path.join(BASE_DIR, "analysis_exports")
os.makedirs(export_dir, exist_ok=True)

# ========== DATA CONTAINERS ==========

token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)                                # unit_id -> list of tokens

# ========== REPROCESS TEXTS FOR EXPORTS ==========

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Get unit ID (e.g., "1_4" from "1_4_More.txt")
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        # Record words at unit level
        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# ========== EXPORT WORD TO TOPIC ==========
with open(os.path.join(export_dir, "word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported word_to_topic.json")

# ========== EXPORT TOKEN COUNTS PER BOOK ==========
with open(os.path.join(export_dir, "token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported token_counts_per_book.json")

# ========== EXPORT UNIT TOKENS ==========
with open(os.path.join(export_dir, "unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported unit_tokens.json")


    
# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202505_w3_global_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")


# Export per-unit topic coverage
unit_topic_csv = os.path.join(save_directory, "2025_05_w3_unit_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)

# Save to JSON
export_path = "analysis_exports/network_core_vocab.json"
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(network_core_vocab, f, ensure_ascii=False, indent=2)

print(f"📤 Exported network-based core vocabulary to: {export_path}")


# Collect words by topic
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items(): 
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")


# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "2025_05_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 100 times:
['get', 'eat', 'time', 'day', 'lot', 'people', 'big', 'man', 'many', 'let', 'know', 'live', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'new', 'start', 'book', 'mum', 'dad', 'help', 'want', 'thing', 'see', 'say', 'come', 'walk', 'watch', 'tell', 'take', 'friend', 'work', 'leave', 'ask', 'find', 'house', 'look', 'call', 'year', 'talk', 'make', 'money', 'world', 'woman']
🌍 Global Graph Modularity Score: 0.251
📖 Book 1 - Modularity Score: 0.534
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_1__distr_random_network.gexf
📖 Book 2 - Modularity Score: 0.419
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_2__distr_random_network.gexf
📖 Book 3 - Modularity Score: 0.358
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w3_book_3__distr_random_network.gexf
📖 Book 4 - Modularity Score: 0.388
📂 Saved book graph: C:\Users\emine\try_env\2025_02\snapshots\2025_04_w

In [14]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv

#seeds random generators for reproducibility
random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words


# -----------sets base directory, locates txt files and reads them into texts
BASE_DIR = r"C:\Users\emine\try_env\2025_02"
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)
# -----------sets base directory, locates txt files and reads them into texts

#----- other directories to set
save_directory = os.path.join(BASE_DIR, "snapshots")
unit_graphs_dir = os.path.join(BASE_DIR, "unit_graphs")
os.makedirs(unit_graphs_dir, exist_ok=True)
#----- other directories to set

# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency
lists_directory = os.path.join(BASE_DIR, "lists")
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words
# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")

#----------- Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 100
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]
# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
#----------- Compute word frequencies

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

#----------- org text files by book and unit, stored in snapshots
snapshots = {}
graphs = {}

book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}
#----------- org text files by book and unit, stored in snapshots
    
#-----------initialize global co-occurrence structure; populated as unit texts are processed
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))
global_graph = nx.Graph()
#-----------initialize global co-occurrence structure; populated as unit texts are processed

#----------- Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")

# Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        for word in unique_words:
            unit_graph.add_node(word, topic=word_to_topic.get(word, "unknown"))


            
# Track first appearance of each token
token_first_appearance = {}

for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue
        
        # Build co-occurrence matrix
        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        for word in unique_words:
            unit_graph.add_node(word, topic=word_to_topic.get(word, "unknown"))

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

        # Create unit-level graph
        unit_graph = nx.Graph()

        # ---- unit-based analysis graphs -----#
        # Assign global topic labels and timeline to unit graph
        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            start = unit_id_to_float(source) if "_" in str(source) else None
            unit_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=source,
                start=start,
                end=start + 0.1 if start is not None else None
            )

        # Add edges to unit graph
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    unit_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        # Save unit-level graph
        unit_gexf_filename = os.path.join(
            unit_graphs_dir, f"2025_04_w3_unit_{unit_id}__graph.gexf"
        )

        nx.write_gexf(unit_graph, unit_gexf_filename)
        print(f"📄 Saved unit graph: {unit_gexf_filename}")

# Process each book's text and build a combined co-occurrence matrix
unit_graphs = defaultdict(dict)



#-------------- global graph finalization - constructs global graph, runs community detection, stores topic assignments, computes modularity score
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))
# Add dynamic 'start' time for Gephi timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        global_graph.nodes[word]['start'] = unit_id_to_float(unit_id)
# Add edges
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)
# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}
# Add topics to global nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic
# modularity score
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")
#-------------- global graph finalization - constructs global graph, runs community detection, stores topic assignments, computes modularity score




    # ---- unoit-based analysis graphs -----#

# --- Define custom time windows for each book ---
time_windows = []
unit_interval_map = {}

for book, units in book_ranges.items():
    unit_floats = [unit_id_to_float(f"{book}_{unit}") for unit in units]
    unit_floats.sort()

    if len(unit_floats) == 0:
        continue

    mid_index = len(unit_floats) // 2
    part1 = unit_floats[:mid_index]
    part2 = unit_floats[mid_index:]

    if part1:
        time_windows.append((part1[0], part1[-1]))
    if part2:
        time_windows.append((part2[0], part2[-1]))

# Create lookup from unit_id -> (start, end)
for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)

# Process books individually
graphs = {}
unit_topic_rows = []
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        # Build unit-level topic distribution based on global topics
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1
        
        # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        # Prepare row with topic percentages
        row = {"start": start_float, "end": end_float}
        for topic in sorted(word_to_topic.values()):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        
        unit_topic_rows.append(row)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=3)
        all_words_in_book.update(unique_words)

        
        # Assign global topic labels and timeline
        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            start = unit_id_to_float(source) if "_" in str(source) else None
            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=source,
                start=start,
                end=start + 0.1 if start is not None else None
            )


        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"2025_04_w3_book_{book}_metrics_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")


import json
from collections import defaultdict
import os

# ========== CONFIG ==========
export_dir = os.path.join(BASE_DIR, "analysis_exports")
os.makedirs(export_dir, exist_ok=True)

# ========== DATA CONTAINERS ==========

token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)                                # unit_id -> list of tokens

# ========== REPROCESS TEXTS FOR EXPORTS ==========

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Extract filtered words
        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Get unit ID (e.g., "1_4" from "1_4_More.txt")
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        # Record words at unit level
        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# ========== EXPORT WORD TO TOPIC ==========
with open(os.path.join(export_dir, "word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported word_to_topic.json")

# ========== EXPORT TOKEN COUNTS PER BOOK ==========
with open(os.path.join(export_dir, "token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported token_counts_per_book.json")

# ========== EXPORT UNIT TOKENS ==========
with open(os.path.join(export_dir, "unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported unit_tokens.json")


    
# **Save Global Graph in GEXF Format**
global_gexf_filename = os.path.join(save_directory, "202504_w3_global_metrics_network.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")


# Export per-unit topic coverage
unit_topic_csv = os.path.join(save_directory, "2025_04_w3_unit_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)

# Save to JSON
export_path = "analysis_exports/network_core_vocab.json"
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(network_core_vocab, f, ensure_ascii=False, indent=2)

print(f"📤 Exported network-based core vocabulary to: {export_path}")


# Collect words by topic
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items(): 
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")


# Save modularity scores
modularity_output_file = os.path.join(BASE_DIR, "2025_04_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 100 times:
['get', 'eat', 'time', 'day', 'lot', 'people', 'big', 'man', 'many', 'let', 'know', 'live', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'new', 'start', 'book', 'mum', 'dad', 'help', 'want', 'thing', 'see', 'say', 'come', 'walk', 'watch', 'tell', 'take', 'friend', 'work', 'leave', 'ask', 'find', 'house', 'look', 'call', 'year', 'talk', 'make', 'money', 'world', 'woman']
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_1__graph.gexf
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_2__graph.gexf
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_3__graph.gexf
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_4__graph.gexf
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_5__graph.gexf
📄 Saved unit graph: C:\Users\emine\try_env\2025_02\unit_graphs\2025_04_w3_unit_1_6__graph.ge

In [2]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv
import json

#seeds random generators for reproducibility
random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")
    

# -----------sets base directory, locates txt files and reads them into texts
BASE_DIR = r"C:\Users\emine\try_env\2025_02"
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)
export_dir = os.path.join(BASE_DIR, "analysis_exports")

# -----------sets base directory, locates txt files and reads them into texts

#----- other directories to set
save_directory = os.path.join(BASE_DIR, "snapshots")
unit_graphs_dir = os.path.join(BASE_DIR, "unit_graphs")
os.makedirs(unit_graphs_dir, exist_ok=True)
#----- other directories to set

# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency
lists_directory = os.path.join(BASE_DIR, "lists")
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words
# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency


#----------- Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 100
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]
# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
#----------- Compute word frequencies

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

#----------- # Initialize data structures
snapshots = {}  
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))
unit_graphs = defaultdict(dict)
token_first_appearance = {}
modularity_scores = {}


# --- Define custom time windows for each book ---
time_windows = []
unit_interval_map = {}  # Initialize unit_interval_map
unit_topic_rows = []  # For storing topic distribution per unit

book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}


# Define time windows for each book
for book, units in book_ranges.items():
    unit_floats = [unit_id_to_float(f"{book}_{unit}") for unit in units]
    unit_floats.sort()
    if unit_floats:
        mid_index = len(unit_floats) // 2
        part1 = unit_floats[:mid_index]
        part2 = unit_floats[mid_index:]
        if part1:
            time_windows.append((part1[0], part1[-1]))
        if part2:
            time_windows.append((part2[0], part2[-1]))


# Create lookup from unit_id -> (start, end)
for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)
                
# Now `unit_interval_map` is defined and populated


#----------- Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")
#----------- org text files by book and unit, stored in snapshots


#----------- Unit-level graph construction (process each file/unit)
for book, file_paths in snapshots.items():
    all_unique_words = set()
    for file_path in file_paths:
        # Initialize unit graph for each text file
        unit_graph = nx.Graph()
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Build co-occurrence matrix for unit
        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)
        all_unique_words.update(unique_words)
        
         # Get the unit ID from filename (e.g., "1_4_More.txt" -> "1_4")
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']
    
        unit_id = "_".join(parts[:2])                    # "1_4" (book and unit number)
        # Check if unit_id was successfully derived
        print(f"Processing unit: {unit_id}")

       # Get the time window for this unit
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]

        # Add nodes to the unit-level graph
        for word in unique_words:
            # Track word's appearance and assign topic and start/end times
            source = token_first_appearance.get(word, "unknown")
            start = unit_id_to_float(source) if "_" in str(source) else None
            unit_graph.add_node(
                word,
                source=source,
                unit=unit_id, # Add unit ID as attribute
                unit_start=start_float,  # Assign time interval start
                unit_end=end_float       # Assign time interval end
            )

        ## Add edges based on co-occurrence matrix
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    unit_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        unit_graphs[unit_id] = unit_graph  # Store the unit graph
    
#-----------initialize global co-occurrence structure; populated as unit texts are processed
global_graph = nx.Graph()

# Add nodes and edges for global graph
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))
# add edges 
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Add dynamic 'start' time for Gephi timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)

    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        start_val = float(f"{unit_id_to_float(unit_id):.2f}")
        global_graph.nodes[word]['start'] = f"{start_val:.2f}"
        global_graph.nodes[word]['end'] = f"{start_val + 0.1:.2f}"
    else:
        global_graph.nodes[word]['start'] = None
        global_graph.nodes[word]['end'] = None
        
# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}
#Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic


# modularity score
global_modularity = community_louvain.modularity(global_partition, global_graph)
print(f"🌍 Global Graph Modularity Score: {global_modularity:.3f}")

# Now assign topics to the nodes in the unit-level graphs after community detection
for unit_id, unit_graph in unit_graphs.items():
    for word in unit_graph.nodes:
        unit_graph.nodes[word]['topic'] = word_to_topic.get(word, "unknown")
        # Save the unit graphs after topics are assigned
for unit_id, unit_graph in unit_graphs.items():
    unit_gexf_filename = os.path.join(unit_graphs_dir, f"{unit_id}_graph.gexf")
    nx.write_gexf(unit_graph, unit_gexf_filename)
    print(f"📄 Saved unit graph: {unit_gexf_filename}")
    
# Save global graph after topics are assigned
global_gexf_filename = os.path.join(save_directory, "global_graph_with_topics.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

#----------------Book-level graph construction (process each book)
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        # Build unit-level topic distribution based on global topics
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1

        # Add dynamic 'start' time for Gephi timeline
    for word in global_graph.nodes:
        # Get the unit ID from first appearance
        unit_id = token_first_appearance.get(word)
        
        # Get the time interval from `unit_interval_map`
        if unit_id and isinstance(unit_id, str) and "_" in unit_id:
            start_val = float(f"{unit_id_to_float(unit_id):.2f}")
            global_graph.nodes[word]['start'] = f"{start_val:.2f}"
            global_graph.nodes[word]['end'] = f"{start_val + 0.1:.2f}"
        else:
            # If no valid unit_id found, you can set the default time window or leave them as None
            global_graph.nodes[word]['start'] = None
            global_graph.nodes[word]['end'] = None
            
        # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        # Prepare row with topic percentages
        row = {"start": start_float, "end": end_float}
        for topic in sorted(word_to_topic.values()):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        
        unit_topic_rows.append(row)
        if not words:
            continue

        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size=3)
        all_words_in_book.update(unique_words)

        
        # Assign global topic labels and timeline
        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            if "_" in str(source):
                start_val = float(f"{unit_id_to_float(source):.2f}")
                start_str = f"{start_val:.2f}"
                end_str = f"{start_val + 0.1:.2f}"
            else:
                start_str = None
                end_str = None
        
        book_graph.add_node(
            word,
            topic=word_to_topic.get(word, "unknown"),
            source=source,
            start=start_str,
            end=end_str
        )

        # Add edges
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Compute modularity score for each book-specific graph
    book_partition = community_louvain.best_partition(book_graph)  # Detect book-level communities
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    
    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")
    
    # Store modularity scores for later analysis
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    
    #**Save Book Graph in GEXF Format**
    book_gexf_filename = os.path.join(save_directory, f"20250425_book_{book}__distr_random_network.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

# Step 6: Export data
# Export word-to-topic mapping
with open(os.path.join(export_dir, "20250425word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported word_to_topic.json")


# 2. Export token counts per book (word frequency for each book)
token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)  # unit_id -> list of tokens

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Record words at unit level
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# Export token counts per book
with open(os.path.join(export_dir, "20250425token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported 20250425token_counts_per_book.json")

# 3. Export unit tokens (words per unit)
with open(os.path.join(export_dir, "20250425unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported 20250425unit_tokens.json")

# Step 6: Export the Global Graph (after all the book-level graphs)
global_gexf_filename = os.path.join(save_directory, "20250425_global_graph.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Export per-unit topic coverage (CSV format)
unit_topic_csv = os.path.join(save_directory, "20250425_w3_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

#print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)

# Save to JSON
export_path = os.path.join(export_dir, "network_core_vocab.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(network_core_vocab, f, ensure_ascii=False, indent=2)
print(f"📤 Exported network-based core vocabulary to: {export_path}")

# Step 7: Collect words by topic and print topic examples
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items():
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")

# Step 8: Save modularity scores (Book-level modularity)
modularity_output_file = os.path.join(BASE_DIR, "2025_0425_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 100 times:
['get', 'eat', 'time', 'day', 'lot', 'people', 'big', 'man', 'many', 'let', 'know', 'live', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'think', 'new', 'start', 'book', 'mum', 'dad', 'help', 'want', 'thing', 'see', 'say', 'come', 'walk', 'watch', 'tell', 'take', 'friend', 'work', 'leave', 'ask', 'find', 'house', 'look', 'call', 'year', 'talk', 'make', 'money', 'world', 'woman']
Processing unit: 1_1
Processing unit: 1_2
Processing unit: 1_3
Processing unit: 1_4
Processing unit: 1_5
Processing unit: 1_6
Processing unit: 1_7
Processing unit: 1_8
Processing unit: 1_9
Processing unit: 1_10
Processing unit: 1_11
Processing unit: 1_12
Processing unit: 1_13
Processing unit: 1_14
Processing unit: 1_15
Processing unit: 1_16
Processing unit: 1_17
Processing unit: 1_18
Processing unit: 2_1
Processing unit: 2_2
Processing unit: 2_3
Processing unit: 2_4
Processing unit: 2_5
Processing unit: 2_6
Processing unit: 2_7
Processing unit: 2_8
Processing u

In [2]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
# adds unit level networks for stats analysis April 25
# adds debugging to network construction April 29
# uses the window determined to be best by my hierarchical script May 17

import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv
import json

#seeds random generators for reproducibility
random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")
    

# -----------sets base directory, locates txt files and reads them into texts
BASE_DIR = r"C:\Users\emine\try_env\2025_02"
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)
export_dir = os.path.join(BASE_DIR, "w3_exports")

# -----------sets base directory, locates txt files and reads them into texts

#----- other directories to set
save_directory = os.path.join(BASE_DIR, "w3_exports")
output_directory = os.path.join(BASE_DIR, "w3_exports")
unit_graphs_dir = os.path.join(BASE_DIR, "w3_exports")
os.makedirs(unit_graphs_dir, exist_ok=True)
#----- other directories to set

# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency
lists_directory = os.path.join(BASE_DIR, "lists")
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words
# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency


#----------- Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 60
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]
# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
#----------- Compute word frequencies

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

#----------- # Initialize data structures
snapshots = {}  
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))
unit_graphs = defaultdict(dict)
token_first_appearance = {}
modularity_scores = {}


# --- Define custom time windows for each book ---
time_windows = []
unit_interval_map = {}  # Initialize unit_interval_map
unit_topic_rows = []  # For storing topic distribution per unit

book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

#----------- Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")
#----------- org text files by book and unit, stored in snapshots

# Define time windows for each book
for book, units in book_ranges.items():
    unit_floats = [unit_id_to_float(f"{book}_{unit}") for unit in units]
    unit_floats.sort()
    if unit_floats:
        mid_index = len(unit_floats) // 2
        part1 = unit_floats[:mid_index]
        part2 = unit_floats[mid_index:]
        if part1:
            time_windows.append((part1[0], part1[-1]))
        if part2:
            time_windows.append((part2[0], part2[-1]))


# Create lookup from unit_id -> (start, end)
for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)
                
# Now `unit_interval_map` is defined and populated

#----------- Unit-level graph construction (process each file/unit)
for book, file_paths in snapshots.items():
    for file_path in file_paths:
        unit_graph = nx.Graph()
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        # Add nodes with topic and filename info
        for word in unique_words:
            unit_graph.add_node(
                word,
                source=unit_id,
                file_name=name_no_ext  # add full file name without extension
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    unit_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        unit_graphs[unit_id] = unit_graph

        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]
                    
#-----------initialize global co-occurrence structure; populated as unit texts are processed
global_graph = nx.Graph()

# Add nodes and edges for global graph
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, file_name=name_no_ext, source=token_first_appearance.get(word, "unknown"))
# add edges 
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Add dynamic 'start'for timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        start_val = unit_id_to_float(unit_id)
        global_graph.nodes[word]['start'] = start_val

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}
#Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Define the same palette you used in matplotlib:
hex_palette = [
    "#e6194b", "#3cb44b", "#722dd0", "#4363d8",
    "#f58231", "#cc3300", "#46f0f0", "#0fa9d0",
    "#384a03", "#663300", "#008080", "#6e00b3",
    "#9a6324", "#0fa929", "#800000", "#3333ff",
]
topic_colors = {tid: hex_palette[tid] for tid in range(len(hex_palette))}

def annotate_with_colors(G):
    for n, data in G.nodes(data=True):
        topic = data.get('topic')
        # fallback if something went wrong
        hexcol = topic_colors.get(topic, "#CCCCCC")
        # 2) regular attribute:
        G.nodes[n]['color'] = hexcol
        # 3) viz:color block for GEXF viewers:
        r, g, b = (int(hexcol.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
        G.nodes[n]['viz'] = {'color': {'r': r, 'g': g, 'b': b, 'a': 1.0}}

# Apply to all your unit‐level graphs:
for unit_id, unit_graph in unit_graphs.items():
    for word in unit_graph.nodes:
        unit_graph.nodes[word]['topic'] = word_to_topic.get(word, "unkown")
        annotate_with_colors(unit_graph)
    nx.write_gexf(unit_graph, os.path.join(unit_graphs_dir, f"{unit_id}_graph_w3.gexf"))

# …and to the global graph:
annotate_with_colors(global_graph)
nx.write_gexf(global_graph, os.path.join(output_directory, "202505_w3_global_metrics_network.gexf"))



#! ----- add export for analyses May 3 -------------
# Function to export graph data (nodes, edges) into CSV or JSON
def export_graph_data(book_graph, book_name):
    # Export nodes (word, topic, source, start time) to a DataFrame
    node_data = []
    for node, data in book_graph.nodes(data=True):
        node_info = {
            'word': node,
            'topic': data.get('topic', 'unknown'),
            'source': data.get('source', 'unknown'),
            'start': data.get('start', None)
        }
        node_data.append(node_info)
    
    # Convert to DataFrame and save as CSV or JSON
    node_df = pd.DataFrame(node_data)
    node_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_nodes.csv"), index=False)
    node_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_nodes.json"), orient='records', lines=True)
    
    # Export edges (word pairs, co-occurrence weight) to a DataFrame
    edge_data = []
    for u, v, data in book_graph.edges(data=True):
        edge_info = {
            'word_1': u,
            'word_2': v,
            'weight': data.get('weight', 0)
        }
        edge_data.append(edge_info)
    
    # Convert to DataFrame and save as CSV or JSON
    edge_df = pd.DataFrame(edge_data)
    edge_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_edges.csv"), index=False)
    edge_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_edges.json"), orient='records', lines=True)
#! ----- add export for analyses May 3 -------------

# ---------------- Book-level graph construction (process each book)
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Compute topic distribution for the unit
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1

       # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        row = {"start": start_float, "end": end_float}
        for topic in sorted(set(word_to_topic.values())):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        unit_topic_rows.append(row)

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            if "_" in str(source):
                start_val = unit_id_to_float(source)
                start = start_val
            else:
                start = None

            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=str(source),
                file_name=name_no_ext,
                start=start,
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Perform community detection for the book
    book_partition = community_louvain.best_partition(book_graph)
    book_modularity = community_louvain.modularity(book_partition, book_graph)

    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")

    # Annotate with the same colors
    annotate_with_colors(book_graph)
    
    # Store results
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph

    # Save book graph
    book_gexf_filename = os.path.join(save_directory, f"w3_book_{book}.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

# Step 6: Export data
# Export word-to-topic mapping
with open(os.path.join(export_dir, "w3_word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported w3_word_to_topic.json")


# 2. Export token counts per book (word frequency for each book)
token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)  # unit_id -> list of tokens

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Record words at unit level
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# Export token counts per book
with open(os.path.join(export_dir, "w3_token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported w3_token_counts_per_book.json")

# 3. Export unit tokens (words per unit)
with open(os.path.join(export_dir, "w3_unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported w3_unit_tokens.json")

# Step 6: Export the Global Graph (after all the book-level graphs)
global_gexf_filename = os.path.join(save_directory, "w3_global_graph.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Export per-unit topic coverage (CSV format)
unit_topic_csv = os.path.join(output_directory, "w3_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)
global_nodes = list(global_graph.nodes)
#Count the degree of each node (i.e., how many edges are connected to it)
node_degrees = {node: global_graph.degree(node) for node in network_core_vocab}

# Filter nodes with degrees greater than 5
filtered_nodes = [node for node, degree in node_degrees.items() if degree > 5]

# Create a list of nodes with their degrees for the export
export_vocab = [
    {"token": node, "count": node_degrees[node]} for node in filtered_nodes
]

# Save to JSON
export_path = os.path.join(export_dir, "w3_network_core_vocab.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"📤 Exported network-based core vocabulary to: {export_path}")

#save all nodes to json
export_path = os.path.join(export_dir, "w3_global_nodes.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"Exported global nodes to {export_path}")

# Step 7: Collect words by topic and print topic examples
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items():
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")

# Step 8: Save modularity scores (Book-level modularity)
modularity_output_file = os.path.join(output_directory, "2025_05_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Words occurring more than 60 times:
['number', 'thank', 'get', 'right', 'name', 'eat', 'time', 'day', 'lot', 'food', 'people', 'family', 'big', 'home', 'man', 'old', 'many', 'feel', 'let', 'know', 'problem', 'live', 'small', 'boy', 'girl', 'like', 'give', 'play', 'school', 'love', 'good', 'great', 'bad', 'think', 'next', 'new', 'start', 'phone', 'book', 'mum', 'buy', 'dad', 'dream', 'course', 'help', 'night', 'answer', 'want', 'thing', 'little', 'see', 'say', 'hear', 'minute', 'come', 'tree', 'hour', 'walk', 'stop', 'watch', 'try', 'tell', 'sure', 'take', 'car', 'friend', 'room', 'work', 'read', 'water', 'leave', 'ask', 'find', 'house', 'look', 'door', 'call', 'place', 'happy', 'way', 'party', 'year', 'need', 'child', 'talk', 'make', 'kid', 'money', 'put', 'last', 'black', 'parent', 'world', 'climb', 'fall', 'park', 'idea', 'happen', 'believe', 'show', 'woman', 'use', 'job', 'story', 'mean', 'famous', 'wear', 'interviewer', 'allow']
📖 Book 1 - Modularity Score: 0.595
📂 Saved book graph

In [None]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
# adds unit level networks for stats analysis April 25
# adds debugging to network construction April 29
# visualizes in pyvis because colors change constantly in GEPHI (May 04)
import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv
import json
from pyvis.network import Network

#seeds random generators for reproducibility
random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")
    

# -----------sets base directory, locates txt files and reads them into texts
BASE_DIR = r"C:\Users\emine\try_env\2025_02"
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)
export_dir = os.path.join(BASE_DIR, "analysis_exports")

# -----------sets base directory, locates txt files and reads them into texts

#----- other directories to set
save_directory = os.path.join(BASE_DIR, "book_graphs")
output_directory = os.path.join(BASE_DIR, "analysis_exports")
unit_graphs_dir = os.path.join(BASE_DIR, "unit_graphs")
os.makedirs(unit_graphs_dir, exist_ok=True)
#----- other directories to set

# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency
lists_directory = os.path.join(BASE_DIR, "lists")
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words
# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency


#----------- Compute word frequencies
word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
threshold = 100
high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]
# Update the most common words to be excluded in the graphs by updating stop words!
stop_words.update(high_frequency_words)
#----------- Compute word frequencies

print(f"Words occurring more than {threshold} times:")
print(high_frequency_words)

#----------- # Initialize data structures
snapshots = {}  
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))
unit_graphs = defaultdict(dict)
token_first_appearance = {}
modularity_scores = {}


# --- Define custom time windows for each book ---
time_windows = []
unit_interval_map = {}  # Initialize unit_interval_map
unit_topic_rows = []  # For storing topic distribution per unit

book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

#----------- Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")
#----------- org text files by book and unit, stored in snapshots

# Define time windows for each book
for book, units in book_ranges.items():
    unit_floats = [unit_id_to_float(f"{book}_{unit}") for unit in units]
    unit_floats.sort()
    if unit_floats:
        mid_index = len(unit_floats) // 2
        part1 = unit_floats[:mid_index]
        part2 = unit_floats[mid_index:]
        if part1:
            time_windows.append((part1[0], part1[-1]))
        if part2:
            time_windows.append((part2[0], part2[-1]))


# Create lookup from unit_id -> (start, end)
for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)
                
# Now `unit_interval_map` is defined and populated

# ─── helper to export ANY nx.Graph to an HTML ───────────────────────────
def export_pyvis(G, html_path, height="800px", width="100%"):
    """
    Takes a networkx.Graph `G` whose nodes have a 'color' attribute (hex string),
    and writes an interactive HTML to `html_path`.
    """
    net = Network(notebook=False, height=height, width=width)
    net.from_nx(G)                  # PyVis will read G.nodes[n]['color']
    net.toggle_physics(False)       # disable physics for a static layout, optional
    net.write_html(html_path)       # writes the interactive HTML
    
#----------- Unit-level graph construction (process each file/unit)
for book, file_paths in snapshots.items():
    for file_path in file_paths:
        unit_graph = nx.Graph()
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        # Add nodes with topic and filename info
        for word in unique_words:
            unit_graph.add_node(
                word,
                source=unit_id,
                file_name=name_no_ext  # add full file name without extension
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    unit_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        unit_graphs[unit_id] = unit_graph

        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]
                    
#-----------initialize global co-occurrence structure; populated as unit texts are processed
global_graph = nx.Graph()

# Add nodes and edges for global graph
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, source=token_first_appearance.get(word, "unknown"))
# add edges 
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Add dynamic 'start'for timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        start_val = unit_id_to_float(unit_id)
        global_graph.nodes[word]['start'] = start_val

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}
#Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Define the same palette you used in matplotlib:
hex_palette = [
    "#e6194b", "#3cb44b", "#ffe119", "#4363d8",
    "#f58231", "#911eb4", "#46f0f0", "#f032e6",
    "#bcf60c", "#fabebe", "#008080", "#e6beff",
    "#9a6324", "#fffac8", "#800000", "#aaffc3",
]
topic_colors = {tid: hex_palette[tid] for tid in range(len(hex_palette))}

def annotate_with_colors(G):
    for n, data in G.nodes(data=True):
        topic = data.get('topic')
        # fallback if something went wrong
        hexcol = topic_colors.get(topic, "#CCCCCC")
        # 2) regular attribute:
        G.nodes[n]['color'] = hexcol
        # 3) viz:color block for GEXF viewers:
        r, g, b = (int(hexcol.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
        G.nodes[n]['viz'] = {'color': {'r': r, 'g': g, 'b': b, 'a': 1.0}}

# Apply to all your unit‐level graphs:
for unit_id, unit_graph in unit_graphs.items():
    for word in unit_graph.nodes:
        unit_graph.nodes[word]['topic'] = word_to_topic.get(word, "unkown")
        annotate_with_colors(unit_graph)
    nx.write_gexf(unit_graph, os.path.join(unit_graphs_dir, f"{unit_id}_graph.gexf"))

# …and to the global graph:
annotate_with_colors(global_graph)
nx.write_gexf(global_graph, os.path.join(output_directory, "202505_global_metrics_network.gexf"))


#! ----- add export for analyses May 3 -------------
# Function to export graph data (nodes, edges) into CSV or JSON
def export_graph_data(book_graph, book_name):
    # Export nodes (word, topic, source, start time) to a DataFrame
    node_data = []
    for node, data in book_graph.nodes(data=True):
        node_info = {
            'word': node,
            'topic': data.get('topic', 'unknown'),
            'source': data.get('source', 'unknown'),
            'start': data.get('start', None)
        }
        node_data.append(node_info)
    
    # Convert to DataFrame and save as CSV or JSON
    node_df = pd.DataFrame(node_data)
    node_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_nodes.csv"), index=False)
    node_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_nodes.json"), orient='records', lines=True)
    
    # Export edges (word pairs, co-occurrence weight) to a DataFrame
    edge_data = []
    for u, v, data in book_graph.edges(data=True):
        edge_info = {
            'word_1': u,
            'word_2': v,
            'weight': data.get('weight', 0)
        }
        edge_data.append(edge_info)
    
    # Convert to DataFrame and save as CSV or JSON
    edge_df = pd.DataFrame(edge_data)
    edge_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_edges.csv"), index=False)
    edge_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_edges.json"), orient='records', lines=True)

#! ----- add export for analyses May 3 -------------

# ---------------- Book-level graph construction (process each book)
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Compute topic distribution for the unit
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1

       # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        row = {"start": start_float, "end": end_float}
        for topic in sorted(set(word_to_topic.values())):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        unit_topic_rows.append(row)

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            if "_" in str(source):
                start_val = unit_id_to_float(source)
                start = start_val
            else:
                start = None

            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=str(source),
                start=start,
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # Perform community detection for the book
    book_partition = community_louvain.best_partition(book_graph)
    book_modularity = community_louvain.modularity(book_partition, book_graph)

    print(f"📖 Book {book} - Modularity Score: {book_modularity:.3f}")

    # Store results
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph

    # Save book graph
    book_gexf_filename = os.path.join(save_directory, f"book_{book}.gexf")
    nx.write_gexf(book_graph, book_gexf_filename)
    print(f"📂 Saved book graph: {book_gexf_filename}")

# Step 6: Export data
# Export word-to-topic mapping
with open(os.path.join(export_dir, "20250425word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported word_to_topic.json")


# 2. Export token counts per book (word frequency for each book)
token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)  # unit_id -> list of tokens

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Record words at unit level
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# Export token counts per book
with open(os.path.join(export_dir, "20250425token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported 20250425token_counts_per_book.json")

# 3. Export unit tokens (words per unit)
with open(os.path.join(export_dir, "20250425unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported 20250425unit_tokens.json")

# Step 6: Export the Global Graph (after all the book-level graphs)
global_gexf_filename = os.path.join(save_directory, "20250425_global_graph.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Export per-unit topic coverage (CSV format)
unit_topic_csv = os.path.join(output_directory, "20250425_w3_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)
global_nodes = list(global_graph.nodes)
#Count the degree of each node (i.e., how many edges are connected to it)
node_degrees = {node: global_graph.degree(node) for node in network_core_vocab}

# Filter nodes with degrees greater than 5
filtered_nodes = [node for node, degree in node_degrees.items() if degree > 5]

# Create a list of nodes with their degrees for the export
export_vocab = [
    {"token": node, "count": node_degrees[node]} for node in filtered_nodes
]

# Save to JSON
export_path = os.path.join(export_dir, "network_core_vocab.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"📤 Exported network-based core vocabulary to: {export_path}")

#save all nodes to json
export_path = os.path.join(export_dir, "global_nodes.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"Exported global nodes to {export_path}")

# Step 7: Collect words by topic and print topic examples
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items():
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")

# Step 8: Save modularity scores (Book-level modularity)
modularity_output_file = os.path.join(output_directory, "2025_0425_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

In [7]:
# adds topic distribution in numbers / percentages April 11
# attempts to use community louvain in the global graph as well and calculate modularity only for the book ranges!
# adds source more granular in unit form to dataframe (first appearance in the book) April 8
# adds unit level networks for stats analysis April 25
# adds debugging to network construction April 29

import os
import utils  # Import utility functions
import spacy
import glob
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
import community as community_louvain
import random
import numpy as np
from collections import defaultdict
import csv
import json

#seeds random generators for reproducibility
random.seed(42)
np.random.seed(42)

# define stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Define stop_words

def unit_id_to_float(unit_id):
    book, unit = unit_id.split("_")
    return float(f"{book}.{int(unit):02d}")
    

# -----------sets base directory, locates txt files and reads them into texts
BASE_DIR = r"C:\Users\emine\try_env\2025_02"
directory = os.path.join(BASE_DIR, "texts")
pattern = "*.txt"
file_paths = glob.glob(os.path.join(directory, pattern))
texts = utils.read_text_files(directory)
export_dir = os.path.join(BASE_DIR, "community_detection")

# -----------sets base directory, locates txt files and reads them into texts

#----- other directories to set
save_directory = os.path.join(BASE_DIR, "community_detection")
output_directory = os.path.join(BASE_DIR, "community_detection")
unit_graphs_dir = os.path.join(BASE_DIR, "community_detection")
os.makedirs(unit_graphs_dir, exist_ok=True)
#----- other directories to set

# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency
lists_directory = os.path.join(BASE_DIR, "lists")
proper_nouns = utils.load_proper_nouns(os.path.join(lists_directory, "mod_proper_nouns_misspelled.csv"))
headwords_one = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_1000.txt"))
headwords_two = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_2000.txt"))
headwords_three = utils.load_word_list(os.path.join(lists_directory, "BNC-COCA_headwords_3000.txt"))
list1_words = utils.lemmatize_word_list(headwords_one)  # Initialize word_list with list1_words
list2_words = utils.lemmatize_word_list(headwords_two)  # Initialize word_list with list2_words
list3_words = utils.lemmatize_word_list(headwords_three)  # Initialize word_list with list3_words
# ----------- loading headword  & proper-nouns lists, lemmatizing them for consistency


#----------- Compute word frequencies
#word_frequencies = utils.compute_word_frequencies(texts, proper_nouns, stop_words)
#threshold = 100
#high_frequency_words = [word for word, freq in word_frequencies.items() if freq > threshold]
# Update the most common words to be excluded in the graphs by updating stop words!
#stop_words.update(high_frequency_words)
#----------- Compute word frequencies

#print(f"Words occurring more than {threshold} times:")
#print(high_frequency_words)

#----------- # Initialize data structures
snapshots = {}  
global_cooccurrence_counts = defaultdict(lambda: defaultdict(int))
unit_graphs = defaultdict(dict)
token_first_appearance = {}
modularity_scores = {}


# --- Define custom time windows for each book ---
time_windows = []
unit_interval_map = {}  # Initialize unit_interval_map
unit_topic_rows = []  # For storing topic distribution per unit

book_ranges = {
    1: range(1, 19),  # Book 1: 1_1_More to 1_18_More
    2: range(1, 21),  # Book 2: 2_1_More to 2_20_More
    3: range(1, 14),  # Book 3: 3_1_More to 3_13_More
    4: range(1, 14)   # Book 4: 4_1_More to 4_13_More
}

#----------- Populate snapshots with file paths
for book, units in book_ranges.items():
    snapshots[book] = []
    for unit in units:
        file_name = f"{book}_{unit}_More.txt"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            snapshots[book].append(file_path)
        else:
            print(f"File not found: {file_path}")
#----------- org text files by book and unit, stored in snapshots

# Define time windows for each book
for book, units in book_ranges.items():
    unit_floats = [unit_id_to_float(f"{book}_{unit}") for unit in units]
    unit_floats.sort()
    if unit_floats:
        mid_index = len(unit_floats) // 2
        part1 = unit_floats[:mid_index]
        part2 = unit_floats[mid_index:]
        if part1:
            time_windows.append((part1[0], part1[-1]))
        if part2:
            time_windows.append((part2[0], part2[-1]))


# Create lookup from unit_id -> (start, end)
for start, end in time_windows:
    for book, units in book_ranges.items():
        for unit in units:
            uid = f"{book}_{unit}"
            uid_float = unit_id_to_float(uid)
            if start <= uid_float <= end:
                unit_interval_map[uid] = (start, end)
                
# Now `unit_interval_map` is defined and populated

#----------- Unit-level graph construction (process each file/unit)
for book, file_paths in snapshots.items():
    for file_path in file_paths:
        unit_graph = nx.Graph()
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        ## Get the unit ID from filename
        filename = os.path.basename(file_path)          # "1_4_More.txt"
        name_no_ext = os.path.splitext(filename)[0]     # "1_4_More"
        parts = name_no_ext.split("_")                  # ['1', '4', 'More']

        unit_id = "_".join(os.path.basename(file_path).split("_")[:2])

        # Track first appearance by unit
        for word in unique_words:
            if word not in token_first_appearance:
                token_first_appearance[word] = unit_id

        # Add nodes with topic and filename info
        for word in unique_words:
            unit_graph.add_node(
                word,
                source=unit_id,
                file_name=name_no_ext  # add full file name without extension
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    unit_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

        unit_graphs[unit_id] = unit_graph

        # Update global co-occurrence counts
        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if word_i != word_j:
                    global_cooccurrence_counts[word_i][word_j] += adj_matrix[i, j]
                    
#-----------initialize global co-occurrence structure; populated as unit texts are processed
global_graph = nx.Graph()

# Add nodes and edges for global graph
for word in sorted(global_cooccurrence_counts.keys()):
    global_graph.add_node(word, file_name=name_no_ext, source=token_first_appearance.get(word, "unknown"))
# add edges 
for word_i, neighbors in global_cooccurrence_counts.items():
    for word_j, weight in neighbors.items():
        if weight > 0:
            global_graph.add_edge(word_i, word_j, weight=weight)

# Add dynamic 'start'for timeline
for word in global_graph.nodes:
    unit_id = token_first_appearance.get(word)
    if unit_id and isinstance(unit_id, str) and "_" in unit_id:
        start_val = unit_id_to_float(unit_id)
        global_graph.nodes[word]['start'] = start_val

# Perform community detection
global_partition = community_louvain.best_partition(global_graph)
word_to_topic = {word: topic_id for word, topic_id in global_partition.items()}
#Add topics to global graph nodes
for word, topic in word_to_topic.items():
    if word in global_graph.nodes:
        global_graph.nodes[word]['topic'] = topic

# Define the same palette you used in matplotlib:
hex_palette = [
    "#e6194b", "#3cb44b", "#722dd0", "#4363d8",
    "#f58231", "#cc3300", "#46f0f0", "#0fa9d0",
    "#384a03", "#663300", "#008080", "#6e00b3",
    "#9a6324", "#0fa929", "#800000", "#3333ff",
]
topic_colors = {tid: hex_palette[tid] for tid in range(len(hex_palette))}

def annotate_with_colors(G):
    for n, data in G.nodes(data=True):
        topic = data.get('topic')
        # fallback if something went wrong
        hexcol = topic_colors.get(topic, "#CCCCCC")
        # 2) regular attribute:
        G.nodes[n]['color'] = hexcol
        # 3) viz:color block for GEXF viewers:
        r, g, b = (int(hexcol.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
        G.nodes[n]['viz'] = {'color': {'r': r, 'g': g, 'b': b, 'a': 1.0}}

# Apply to all your unit‐level graphs:
for unit_id, unit_graph in unit_graphs.items():
    for word in unit_graph.nodes:
        unit_graph.nodes[word]['topic'] = word_to_topic.get(word, "unkown")
        annotate_with_colors(unit_graph)
    nx.write_gexf(unit_graph, os.path.join(unit_graphs_dir, f"{unit_id}_graph_w3.gexf"))

# …and to the global graph:
annotate_with_colors(global_graph)
nx.write_gexf(global_graph, os.path.join(output_directory, "20250513_w3_globalmetrics_network.gexf"))



#! ----- add export for analyses May 3 -------------
# Function to export graph data (nodes, edges) into CSV or JSON
def export_graph_data(book_graph, book_name):
    # Export nodes (word, topic, source, start time) to a DataFrame
    node_data = []
    for node, data in book_graph.nodes(data=True):
        node_info = {
            'word': node,
            'topic': data.get('topic', 'unknown'),
            'source': data.get('source', 'unknown'),
            'start': data.get('start', None)
        }
        node_data.append(node_info)
    
    # Convert to DataFrame and save as CSV or JSON
    node_df = pd.DataFrame(node_data)
    node_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_nodes.csv"), index=False)
    node_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_nodes.json"), orient='records', lines=True)
    
    # Export edges (word pairs, co-occurrence weight) to a DataFrame
    edge_data = []
    for u, v, data in book_graph.edges(data=True):
        edge_info = {
            'word_1': u,
            'word_2': v,
            'weight': data.get('weight', 0)
        }
        edge_data.append(edge_info)
    
    # Convert to DataFrame and save as CSV or JSON
    edge_df = pd.DataFrame(edge_data)
    edge_df.to_csv(os.path.join(EXPORT_DIR, f"{book_name}_edges.csv"), index=False)
    edge_df.to_json(os.path.join(EXPORT_DIR, f"{book_name}_edges.json"), orient='records', lines=True)
#! ----- add export for analyses May 3 -------------

# ---------------- Book-level graph construction (process each book)
graphs = {}
for book, file_paths in snapshots.items():
    book_graph = nx.Graph()
    all_words_in_book = set()

    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Compute topic distribution for the unit
        topic_counts = defaultdict(int)
        total_words_with_topic = 0
        for word in words:
            topic = word_to_topic.get(word)
            if topic is not None:
                topic_counts[topic] += 1
                total_words_with_topic += 1

       # Extract unit_id and start/end times
        filename = os.path.basename(file_path)
        name_no_ext = os.path.splitext(filename)[0]
        parts = name_no_ext.split("_")
        unit_id = "_".join(parts[:2])
        start_float, end_float = unit_interval_map.get(unit_id, (None, None))

        row = {"start": start_float, "end": end_float}
        for topic in sorted(set(word_to_topic.values())):
            count = topic_counts.get(topic, 0)
            percent = (count / total_words_with_topic * 100) if total_words_with_topic > 0 else 0.0
            row[f"topic_{topic}"] = round(percent, 2)
        unit_topic_rows.append(row)

        window_size = 3
        adj_matrix, unique_words = utils.build_cooccurrence_matrix(words, window_size)

        for word in unique_words:
            source = token_first_appearance.get(word, "unknown")
            if "_" in str(source):
                start_val = unit_id_to_float(source)
                start = start_val
            else:
                start = None

            book_graph.add_node(
                word,
                topic=word_to_topic.get(word, "unknown"),
                source=str(source),
                file_name=name_no_ext,
                start=start,
            )

        for i, word_i in enumerate(unique_words):
            for j, word_j in enumerate(unique_words):
                if adj_matrix[i, j] != 0:
                    book_graph.add_edge(word_i, word_j, weight=adj_matrix[i, j])

    # **Instead of** computing a new partition:
    for n in book_graph.nodes:
        # copy the global topic assignment
        book_graph.nodes[n]['topic'] = word_to_topic.get(n, 'unknown')

    # then color it with your helper:
    annotate_with_colors(book_graph)

    # build a dict of {node: global_topic} for exactly this book’s nodes
    book_partition = { n: book_graph.nodes[n]['topic'] for n in book_graph.nodes }
    
    # compute modularity of that fixed assignment
    book_modularity = community_louvain.modularity(book_partition, book_graph)
    modularity_scores[book] = book_modularity
    graphs[book] = book_graph
    print(f"📖 Book {book} - Modularity Score (global): {book_modularity:.3f}")

    # Save book graph
    book_gexf = os.path.join(save_directory, f"w3_book_{book}.gexf")
    nx.write_gexf(book_graph, book_gexf)
    print(f"📂 Saved book graph with global topics: {book_gexf}")

# Step 6: Export data
# Export word-to-topic mapping
with open(os.path.join(export_dir, "w20250513_w3_word_to_topic.json"), "w", encoding="utf-8") as f:
    json.dump(word_to_topic, f, ensure_ascii=False, indent=2)
print("✅ Exported 20250513_w3_word_to_topic.json")


# 2. Export token counts per book (word frequency for each book)
token_counts_per_book = defaultdict(lambda: defaultdict(int))  # book -> word -> count
unit_tokens = defaultdict(list)  # unit_id -> list of tokens

for book, file_paths in snapshots.items():
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        words = utils.filter_text(text, proper_nouns, stop_words)
        if not words:
            continue

        # Record words at unit level
        filename = os.path.basename(file_path)
        unit_id = "_".join(os.path.splitext(filename)[0].split("_")[:2])

        unit_tokens[unit_id].extend(words)

        # Update frequency count for the book
        for word in words:
            token_counts_per_book[book][word] += 1

# Export token counts per book
with open(os.path.join(export_dir, "2ß25ß513_w3_token_counts_per_book.json"), "w", encoding="utf-8") as f:
    json.dump(token_counts_per_book, f, ensure_ascii=False, indent=2)
print("✅ Exported w5_token_counts_per_book.json")

# 3. Export unit tokens (words per unit)
with open(os.path.join(export_dir, "20250513_w3_unit_tokens.json"), "w", encoding="utf-8") as f:
    json.dump(unit_tokens, f, ensure_ascii=False, indent=2)
print("✅ Exported 2ß250513_w5_unit_tokens.json")

# Step 6: Export the Global Graph (after all the book-level graphs)
global_gexf_filename = os.path.join(save_directory, "2ß25ß513_w3_global_graph.gexf")
nx.write_gexf(global_graph, global_gexf_filename)
print(f"📂 Saved global graph: {global_gexf_filename}")

# Export per-unit topic coverage (CSV format)
unit_topic_csv = os.path.join(output_directory, "20250513_w3_topic_coverage.csv")
all_topics = sorted(set(word_to_topic.values()))
fieldnames = ["start", "end"] + [f"topic_{topic}" for topic in all_topics]

with open(unit_topic_csv, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in unit_topic_rows:
        writer.writerow(row)

print(f"📄 Saved unit-level topic coverage CSV to {unit_topic_csv}")

# Export global core vocabulary (nodes in the global graph)
network_core_vocab = list(global_graph.nodes)
global_nodes = list(global_graph.nodes)
#Count the degree of each node (i.e., how many edges are connected to it)
node_degrees = {node: global_graph.degree(node) for node in network_core_vocab}

# Filter nodes with degrees greater than 5
filtered_nodes = [node for node, degree in node_degrees.items() if degree > 5]

# Create a list of nodes with their degrees for the export
export_vocab = [
    {"token": node, "count": node_degrees[node]} for node in filtered_nodes
]

# Save to JSON
export_path = os.path.join(export_dir, "20250513_w3_network_core_vocab.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"📤 Exported network-based core vocabulary to: {export_path}")

#save all nodes to json
export_path = os.path.join(export_dir, "20250513_w5_global_nodes.json")
with open(export_path, "w", encoding="utf-8") as f:
    json.dump(export_vocab, f, ensure_ascii=False, indent=2)

print(f"Exported global nodes to {export_path}")

# Step 7: Collect words by topic and print topic examples
topic_examples = defaultdict(list)
for word, topic in word_to_topic.items():
    topic_examples[topic].append(word)

# Print sample words per topic
print("\n🔍 Topic Assignment Examples:")
for topic_id, words in sorted(topic_examples.items()):
    sampled_words = random.sample(words, min(10, len(words)))  # up to 10 per topic
    print(f"Topic {topic_id:2d}: {', '.join(sampled_words)}")

# Step 8: Save modularity scores (Book-level modularity)
modularity_output_file = os.path.join(output_directory, "20250513_w3_distributional_modularity_scores.csv")
with open(modularity_output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Book", "Modularity Score"])

    for book, score in modularity_scores.items():
        writer.writerow([book, score])

print(f"📂 Modularity scores saved in {modularity_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


📖 Book 1 - Modularity Score (global): 0.290
📂 Saved book graph with global topics: C:\Users\emine\try_env\2025_02\community_detection\w3_book_1.gexf
📖 Book 2 - Modularity Score (global): 0.218
📂 Saved book graph with global topics: C:\Users\emine\try_env\2025_02\community_detection\w3_book_2.gexf
📖 Book 3 - Modularity Score (global): 0.164
📂 Saved book graph with global topics: C:\Users\emine\try_env\2025_02\community_detection\w3_book_3.gexf
📖 Book 4 - Modularity Score (global): 0.210
📂 Saved book graph with global topics: C:\Users\emine\try_env\2025_02\community_detection\w3_book_4.gexf
✅ Exported 20250513_w3_word_to_topic.json
✅ Exported w5_token_counts_per_book.json
✅ Exported 2ß250513_w5_unit_tokens.json
📂 Saved global graph: C:\Users\emine\try_env\2025_02\community_detection\2ß25ß513_w3_global_graph.gexf
📄 Saved unit-level topic coverage CSV to C:\Users\emine\try_env\2025_02\community_detection\20250513_w3_topic_coverage.csv
📤 Exported network-based core vocabulary to: C:\Users\e